This is a list of publications, intended as a comprehensive bibliography of the field. If you would like your publications listed here, you can either email us your BibTeX .bib file or send us a link to your up-to-date .bib file (the plugin will automatically update the list below using the BibTeX entries from the link provided). See this help page for instructions on obtaining such a link with services like CiteULike or BibSonomy.
If you’re registered on this blog and have editor access, you can edit this page and add the link yourself.
We also recommend having a look at Memkite’s deep learning reading list:
http://memkite.com/deeplearningbibliography/
University of Montreal’s LISA lab deep learning publications:

[2010,article] Y. Bengio, O. Delalleau, and C. Simard, "Decision Trees do not Generalize to New Variations," Computational Intelligence, vol. 26, iss. 4, pp. 449–467, 2010.
@ARTICLE{Bengiodecisiontrees10,
  author   = {Bengio, Yoshua and Delalleau, Olivier and Simard, Clarence},
  title    = {Decision Trees do not Generalize to New Variations},
  journal  = {Computational Intelligence},
  volume   = {26},
  number   = {4},
  pages    = {449--467},
  month    = nov,
  year     = {2010},
  keywords = {curse of dimensionality, decision trees, parity function}
}
[2010,inproceedings] D. Erhan, A. Courville, Y. Bengio, and P. Vincent, "Why Does Unsupervised Pre-training Help Deep Learning?," in Proceedings of AISTATS 2010, 2010, pp. 201–208.
@INPROCEEDINGS{Erhanaistats2010,
  author    = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal},
  title     = {Why Does Unsupervised Pre-training Help Deep Learning?},
  booktitle = {Proceedings of AISTATS 2010},
  volume    = {9},
  pages     = {201--208},
  month     = may,
  year      = {2010},
  location  = {Chia Laguna Resort, Sardinia, Italy},
  abstract  = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of autoencoder variants with impressive results being obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks often involve an unsupervised learning component, usually in an unsupervised pretraining phase. The main question investigated here is the following: why does unsupervised pretraining work so well? Through extensive experimentation, we explore several possible explanations discussed in the literature including its action as a regularizer (Erhan et al. 2009) and as an aid to optimization (Bengio et al. 2007). Our results build on the work of Erhan et al. 2009, showing that unsupervised pretraining appears to play predominantly a regularization role in subsequent supervised training. However our results in an online setting, with a virtually unlimited data stream, point to a somewhat more nuanced interpretation of the roles of optimization and regularization in the unsupervised pretraining effect.}
}
[2010,inproceedings] X. Glorot and Y. Bengio, "Understanding the difficulty of training deep feedforward neural networks," in Proceedings of AISTATS 2010, 2010, pp. 249–256.
@INPROCEEDINGS{GlorotAISTATS2010,
  author    = {Glorot, Xavier and Bengio, Yoshua},
  title     = {Understanding the difficulty of training deep feedforward neural networks},
  booktitle = {Proceedings of AISTATS 2010},
  volume    = {9},
  pages     = {249--256},
  month     = may,
  year      = {2010},
  location  = {Chia Laguna Resort, Sardinia, Italy},
  abstract  = {Whereas before 2006 it appears that deep multilayer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the nonlinear activations functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new nonlinearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence.}
}
[2010,article] D. Erhan, Y. Bengio, A. Courville, P. Manzagol, P. Vincent, and S. Bengio, "Why Does Unsupervised Pre-training Help Deep Learning?," Journal of Machine Learning Research, vol. 11, pp. 625–660, 2010.
@ARTICLE{Erhan+al2010,
  author   = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy},
  title    = {Why Does Unsupervised Pre-training Help Deep Learning?},
  journal  = {Journal of Machine Learning Research},
  volume   = {11},
  pages    = {625--660},
  month    = feb,
  year     = {2010},
  crossref = {JMLR},
  abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of autoencoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pretraining phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. The main question investigated here is the following: why does unsupervised pretraining work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pretraining with respect to architecture depth, model capacity, and number of training examples. The experiments confirm and clarify the advantage of unsupervised pretraining. The results suggest that unsupervised pretraining guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pretraining.}
}
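[2009,inproceedings] D. Erhan, P. Manzagol, Y. Bengio, S. Bengio, and P. Vincent, "The Difficulty of Training Deep Architectures and the effect of Unsupervised Pre-Training," in Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009), 2009, pp. 153–160.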
@INPROCEEDINGS{Erhan2009,
  author   = {Erhan, Dumitru and Manzagol, Pierre-Antoine and Bengio, Yoshua and Bengio, Samy and Vincent, Pascal},
  title    = {The Difficulty of Training Deep Architectures and the effect of Unsupervised Pre-Training},
  pages    = {153--160},
  month    = apr,
  year     = {2009},
  keywords = {Deep Networks},
  crossref = {xAISTATS2009},
  abstract = {Whereas theoretical work suggests that deep architectures might be more efficient at representing highly-varying functions, training deep architectures was unsuccessful until the recent advent of algorithms based on unsupervised pretraining. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. Answering these questions is important if learning in deep architectures is to be further improved. We attempt to shed some light on these questions through extensive simulations. The experiments confirm and clarify the advantage of unsupervised pretraining. They demonstrate the robustness of the training procedure with respect to the random initialization, the positive effect of pretraining in terms of optimization and its role as a regularizer. We empirically show the influence of pretraining with respect to architecture depth, model capacity, and number of training examples.}
}
[2009,techreport] Y. Bengio, J. Louradour, R. Collobert, and J. Weston, "Curriculum Learning," Département d’informatique et recherche opérationnelle, Université de Montréal, 1330, 2009.
@TECHREPORT{Bengio+al2009TR,
  author      = {Bengio, Yoshua and Louradour, Jerome and Collobert, Ronan and Weston, Jason},
  title       = {Curriculum Learning},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  number      = {1330},
  year        = {2009},
  abstract    = {Humans and animals learn much better when the examples are not randomly presented but organized in a meaningful order which illustrates gradually more concepts, and gradually more complex ones. Here, we formalize such training strategies in the context of machine learning, and call them 'curriculum learning'. In the context of recent research studying the difficulty of training in the presence of nonconvex training criteria (for deep deterministic and stochastic neural networks), we explore curriculum learning in various setups. The experiments show that significant improvements in generalization can be achieved. We hypothesize that curriculum learning has both an effect on the speed of convergence of the training process to a minimum and, in the case of nonconvex criteria, on the quality of the local minima obtained: curriculum learning can be seen as a particular form of continuation method (a general strategy for global optimization of nonconvex functions).}
}
[2009,inproceedings] G. Attardi, F. Dell’Orletta, M. Simi, and J. Turian, "Accurate Dependency Parsing with a Stacked Multilayer Perceptron," in Proceeding of Evalita 2009, 2009.
@INPROCEEDINGS{Attardi+al2009,
  author    = {Attardi, Giuseppe and Dell'Orletta, Felice and Simi, Maria and Turian, Joseph},
  title     = {Accurate Dependency Parsing with a Stacked Multilayer Perceptron},
  booktitle = {Proceeding of Evalita 2009},
  series    = {LNCS},
  publisher = {Springer},
  year      = {2009},
  keywords  = {classifier, dependency parsing, natural language, parser, perceptron},
  abstract  = {DeSR is a statistical transition-based dependency parser which learns from annotated corpora which actions to perform for building parse trees while scanning a sentence. We describe recent improvements to the parser, in particular stacked parsing, exploiting a beam search strategy and using a Multilayer Perceptron classifier. For the Evalita 2009 Dependency Parsing task DeSR was configured to use a combination of stacked parsers. The stacked combination achieved the best accuracy scores in both the main and pilot subtasks. The contribution to the result of various choices is analyzed, in particular for taking advantage of the peculiar features of the TUT Treebank.}
}
[2009,phdthesis] H. Larochelle, "Étude de techniques d’apprentissage non-supervisé pour l’amélioration de l’entraînement supervisé de modèles connexionnistes," PhD Thesis, University of Montréal, 2009.
@PHDTHESIS{LarochellePhD2009,
  author   = {Larochelle, Hugo},
  title    = {{\'{E}}tude de techniques d'apprentissage non-supervis{\'{e}} pour l'am{\'{e}}lioration de l'entra{\^{\i}}nement supervis{\'{e}} de mod{\`{e}}les connexionnistes},
  school   = {University of Montr{\'{e}}al},
  month    = mar,
  year     = {2009},
  keywords = {apprentissage non-supervis{\'{e}}, architecture profonde, autoassociateur, autoencodeur, machine de Boltzmann restreinte, r{\'{e}}seau de neurones artificiel},
  abstract = {Le domaine de l'intelligence artificielle a pour objectif le d{\'{e}}veloppement de syst{\`{e}}mes informatiques capables de simuler des comportements normalement associ{\'{e}}s {\`{a}} l'intelligence humaine. On aimerait entre autres pouvoir construire une machine qui puisse r{\'{e}}soudre des t{\^{a}}ches li{\'{e}}es {\`{a}} la vision (e.g., la reconnaissance d'objet), au traitement de la langue (e.g., l'identification du sujet d'un texte) ou au traitement de signaux sonores (e.g., la reconnaissance de la parole). Une approche d{\'{e}}velopp{\'{e}}e afin de r{\'{e}}soudre ce genre de t{\^{a}}ches est bas{\'{e}}e sur l'apprentissage automatique de mod{\`{e}}les {\`{a}} partir de donn{\'{e}}es {\'{e}}tiquet{\'{e}}es refl{\'{e}}tant le comportement intelligent {\`{a}} {\'{e}}muler. Entre autre, il a {\'{e}}t{\'{e}} propos{\'{e}} de mod{\'{e}}liser le calcul n{\'{e}}cessaire {\`{a}} la r{\'{e}}solution d'une t{\^{a}}che {\`{a}} l'aide d'un r{\'{e}}seau de neurones artificiel, dont il est possible d'adapter le comportement {\`{a}} l'aide de la r{\'{e}}tropropagation [99, 131] d'un gradient informatif sur les erreurs commises par le r{\'{e}}seau. Populaire durant les ann{\'{e}}es 80, cette approche sp{\'{e}}cifique a depuis perdu partiellement de son attrait, suite au d{\'{e}}veloppement des m{\'{e}}thodes {\`{a}} noyau. Celles-ci sont souvent plus stables, plus faciles {\`{a}} utiliser et leur performance est souvent au moins aussi {\'{e}}lev{\'{e}}e pour une vaste gamme de probl{\`{e}}mes. Les m{\'{e}}thodes d'apprentissage automatique ont donc progress{\'{e}} dans leur fonctionnement, mais aussi dans la complexit{\'{e}} des probl{\`{e}}mes auxquels elles se sont attaqu{\'{e}}. Ainsi, plus r{\'{e}}cemment, des travaux [12, 15] ont commenc{\'{e}} {\`{a}} {\'{e}}mettre des doutes sur la capacit{\'{e}} des machines {\`{a}} noyau {\`{a}} pouvoir efficacement r{\'{e}}soudre des probl{\`{e}}mes de la complexit{\'{e}} requise par l'intelligence artificielle.
Parall{\`{e}}lement, Hinton et al. [81] faisaient une perc{\'{e}}e dans l'apprentissage automatique de r{\'{e}}seaux de neurones, en proposant une proc{\'{e}}dure permettant l'entra{\^{\i}}nement de r{\'{e}}seaux de neurones d'une plus grande complexit{\'{e}} (i.e., avec plus de couches de neurones cach{\'{e}}es) qu'il n'{\'{e}}tait possible auparavant. C'est dans ce contexte qu'ont {\'{e}}t{\'{e}} conduits les travaux de cette th{\`{e}}se. Cette th{\`{e}}se d{\'{e}}bute par une exposition des principes de base de l'apprentissage automatique (chapitre 1) et une discussion des obstacles {\`{a}} l'obtention d'un mod{\`{e}}le ayant une bonne performance de g{\'{e}}n{\'{e}}ralisation (chapitre 2). Puis, sont pr{\'{e}}sent{\'{e}}es les contributions apport{\'{e}}es dans le cadre de cinq articles, contributions qui sont toutes bas{\'{e}}es sur l'utilisation d'une certaine forme d'apprentissage non-supervis{\'{e}}. Le premier article (chapitre 4) propose une m{\'{e}}thode d'entra{\^{\i}}nement pour un type sp{\'{e}}cifique de r{\'{e}}seau {\`{a}} une seule couche cach{\'{e}}e (la machine de Boltzmann restreinte) bas{\'{e}}e sur une combinaison des apprentissages supervis{\'{e}} et non-supervis{\'{e}}. Cette m{\'{e}}thode permet d'obtenir une meilleure performance de g{\'{e}}n{\'{e}}ralisation qu'un r{\'{e}}seau de neurones standard ou qu'une machine {\`{a}} vecteurs de support {\`{a}} noyau, et met en {\'{e}}vidence de fa{\c c}on explicite les b{\'{e}}n{\'{e}}fices qu'apporte l'apprentissage non-supervis{\'{e}} {\`{a}} l'entra{\^{\i}}nement d'un r{\'{e}}seau de neurones. Ensuite, dans le second article (chapitre 6), on {\'{e}}tudie et {\'{e}}tend la proc{\'{e}}dure d'entra{\^{\i}}nement propos{\'{e}}e par Hinton et al. [81]. Plus sp{\'{e}}cifiquement, on y propose une approche diff{\'{e}}rente mais plus flexible pour initialiser un r{\'{e}}seau {\`{a}} plusieurs couches cach{\'{e}}es, bas{\'{e}}e sur un r{\'{e}}seau autoassociateur.
On y explore aussi l'impact du nombre de couches et de neurones par couche sur la performance d'un r{\'{e}}seau et on y d{\'{e}}crit diff{\'{e}}rentes variantes mieux adapt{\'{e}}es {\`{a}} l'apprentissage en ligne ou pour donn{\'{e}}es {\`{a}} valeurs continues. Dans le troisi{\`{e}}me article (chapitre 8), on explore plut{\^{o}}t la performance de r{\'{e}}seaux profonds sur plusieurs probl{\`{e}}mes de classification diff{\'{e}}rents. Les probl{\`{e}}mes choisis ont la propri{\'{e}}t{\'{e}} d'avoir {\'{e}}t{\'{e}} g{\'{e}}n{\'{e}}r{\'{e}}s {\`{a}} partir de plusieurs facteurs de variation. Cette propri{\'{e}}t{\'{e}},
qui caract{\'{e}}rise les probl{\`{e}}mes li{\'{e}}s {\`{a}} l'intelligence artificielle, pose difficult{\'{e}} aux machines {\`{a}} noyau, tel que confirm{\'{e}} par les exp{\'{e}}riences de cet article. Le quatri{\`{e}}me article (chapitre 10) pr{\'{e}}sente une am{\'{e}}lioration de l'approche bas{\'{e}}e sur les r{\'{e}}seaux autoassociateurs. Cette am{\'{e}}lioration applique une modification simple {\`{a}} la proc{\'{e}}dure d'entra{\^{\i}}nement d'un r{\'{e}}seau autoassociateur, en « bruitant » les entr{\'{e}}es du r{\'{e}}seau afin que celui-ci soit forc{\'{e}} {\`{a}} la d{\'{e}}bruiter. Le cinqui{\`{e}}me et dernier article (chapitre 12) apporte une autre am{\'{e}}lioration aux r{\'{e}}seaux autoassociateurs, en permettant des interactions d'inhibition ou d'excitation entre les neurones cach{\'{e}}s de ces r{\'{e}}seaux. On y d{\'{e}}montre que de telles interactions peuvent {\^{e}}tre apprises et sont b{\'{e}}n{\'{e}}fiques {\`{a}} la performance d'un r{\'{e}}seau profond.}
}
[2009,article] H. Larochelle, Y. Bengio, J. Louradour, and P. Lamblin, "Exploring Strategies for Training Deep Neural Networks," Journal of Machine Learning Research, vol. 10, pp. 1–40, 2009.
@ARTICLE{Larochellejmlr2009,
  author   = {Larochelle, Hugo and Bengio, Yoshua and Louradour, Jerome and Lamblin, Pascal},
  title    = {Exploring Strategies for Training Deep Neural Networks},
  journal  = {Journal of Machine Learning Research},
  volume   = {10},
  pages    = {1--40},
  month    = jan,
  year     = {2009},
  crossref = {JMLR},
  abstract = {Deep multilayer neural networks have many levels of nonlinearities allowing them to compactly represent highly nonlinear and highly-varying functions. However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization often appears to get stuck in poor solutions. Hinton et al. recently proposed a greedy layerwise unsupervised learning procedure relying on the training algorithm of restricted Boltzmann machines (RBM) to initialize the parameters of a deep belief network (DBN), a generative model with many layers of hidden causal variables. This was followed by the proposal of another greedy layerwise procedure, relying on the usage of autoassociator networks. In the context of the above optimization problem, we study these algorithms empirically to better understand their success. Our experiments confirm the hypothesis that the greedy layerwise unsupervised training strategy helps the optimization by initializing weights in a region near a good local minimum, but also implicitly acts as a sort of regularization that brings better generalization and encourages internal distributed representations that are high-level abstractions of the input. We also present a series of experiments aimed at evaluating the link between the performance of deep neural networks and practical aspects of their topology, for example, demonstrating cases where the addition of more depth helps. Finally, we empirically explore simple variants of these training algorithms, such as the use of different RBM input unit distributions, a simple way of combining gradient estimators to improve performance, as well as online versions of those algorithms.}
}
[2009,article] Y. Bengio and O. Delalleau, "Justifying and Generalizing Contrastive Divergence," Neural Computation, vol. 21, iss. 6, pp. 1601–1621, 2009.
@ARTICLE{Bengio+Delalleau2009,
  author   = {Bengio, Yoshua and Delalleau, Olivier},
  title    = {Justifying and Generalizing Contrastive Divergence},
  journal  = {Neural Computation},
  volume   = {21},
  number   = {6},
  pages    = {1601--1621},
  month    = jun,
  year     = {2009},
  abstract = {We study an expansion of the log-likelihood in undirected graphical models such as the Restricted Boltzmann Machine (RBM), where each term in the expansion is associated with a sample in a Gibbs chain alternating between two random variables (the visible vector and the hidden vector, in RBMs). We are particularly interested in estimators of the gradient of the log-likelihood obtained through this expansion. We show that its residual term converges to zero, justifying the use of a truncation, i.e. running only a short Gibbs chain, which is the main idea behind the Contrastive Divergence (CD) estimator of the log-likelihood gradient. By truncating even more, we obtain a stochastic reconstruction error, related through a mean-field approximation to the reconstruction error often used to train autoassociators and stacked autoassociators. The derivation is not specific to the particular parametric forms used in RBMs, and only requires convergence of the Gibbs chain. We present theoretical and empirical evidence linking the number of Gibbs steps $k$ and the magnitude of the RBM parameters to the bias in the CD estimator. These experiments also suggest that the sign of the CD estimator is correct most of the time, even when the bias is large, so that CD-$k$ is a good descent direction even for small $k$.}
}
[2009,inproceedings] J. Turian, J. Bergstra, and Y. Bengio, "Quadratic Features and Deep Architectures for Chunking," in North American Chapter of the Association for Computational Linguistics – Human Language Technologies (NAACL HLT), Boulder, Colorado, 2009, pp. 245–248.
@INPROCEEDINGS{Turian+al2009,
  author    = {Turian, Joseph and Bergstra, James and Bengio, Yoshua},
  title     = {Quadratic Features and Deep Architectures for Chunking},
  booktitle = {North American Chapter of the Association for Computational Linguistics -- Human Language Technologies (NAACL HLT)},
  pages     = {245--248},
  publisher = {Association for Computational Linguistics},
  address   = {Boulder, Colorado},
  month     = jun,
  year      = {2009},
  url       = {http://www.aclweb.org/anthology/N/N09/N09-2062},
  abstract  = {We experiment with several chunking models. Deeper architectures achieve better generalization. Quadratic filters, a simplification of theoretical model of V1 complex cells, reliably increase accuracy. In fact, logistic regression with quadratic filters outperforms a standard single hidden layer neural network. Adding quadratic filters to logistic regression is almost as effective as feature engineering. Despite predicting each output label independently, our model is competitive with ones that use previous decisions.}
}
[2009,inproceedings] "Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)," in Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009), 2009.
@PROCEEDINGS{xAISTATS2009,
  title     = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
  booktitle = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
  year      = {2009},
  location  = {Clearwater (Florida), USA}
}
[2009,article] Y. Bengio, "Learning deep architectures for AI," Foundations and Trends in Machine Learning, vol. 2, iss. 1, pp. 1–127, 2009.
@ARTICLE{Bengio2009,
  author   = {Bengio, Yoshua},
  title    = {Learning deep architectures for {AI}},
  journal  = {Foundations and Trends in Machine Learning},
  volume   = {2},
  number   = {1},
  pages    = {1--127},
  year     = {2009},
  note     = {Also published as a book. Now Publishers, 2009.},
  doi      = {10.1561/2200000006},
  abstract = {Theoretical results suggest that in order to learn the kind of complicated functions that can represent high-level abstractions (e.g. in vision, language, and other AI-level tasks), one may need \emph{deep architectures}. Deep architectures are composed of multiple levels of nonlinear operations, such as in neural nets with many hidden layers or in complicated propositional formulae reusing many subformulae. Searching the parameter space of deep architectures is a difficult task, but learning algorithms such as those for Deep Belief Networks have recently been proposed to tackle this problem with notable success, beating the state-of-the-art in certain areas. This paper discusses the motivations and principles regarding learning algorithms for deep architectures, in particular those exploiting as building blocks unsupervised learning of single-layer models such as Restricted Boltzmann Machines, used to construct deeper models such as Deep Belief Networks.}
}
[2009,techreport] D. Erhan, Y. Bengio, A. Courville, and P. Vincent, "Visualizing Higher-Layer Features of a Deep Network," University of Montreal, 1341, 2009.
@TECHREPORT{visualization_techreport,
  author      = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Vincent, Pascal},
  title       = {Visualizing Higher-Layer Features of a Deep Network},
  institution = {University of Montreal},
  number      = {1341},
  month       = jun,
  year        = {2009},
  abstract    = {Deep architectures have demonstrated state-of-the-art results in a variety of settings, especially with vision datasets. Beyond the model definitions and the quantitative analyses, there is a need for qualitative comparisons of the solutions learned by various deep architectures. The goal of this paper is to find good qualitative interpretations of high level features represented by such models. To this end, we contrast and compare several techniques applied on Stacked Denoising Autoencoders and Deep Belief Networks, trained on several vision datasets. We show that, perhaps counterintuitively, such interpretation is possible at the unit level, that it is simple to accomplish and that the results are consistent across various techniques. We hope that such techniques will allow researchers in deep architectures to understand more of how and why deep architectures work.}
}
[2009,inproceedings] H. Larochelle, D. Erhan, and P. Vincent, "Deep Learning using Robust Interdependent Codes," in Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009), April 16–18, 2009, pp. 312–319.
@INPROCEEDINGS{Larochelle2009,
  author    = {Larochelle, Hugo and Erhan, Dumitru and Vincent, Pascal},
  title     = {Deep Learning using Robust Interdependent Codes},
  booktitle = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS 2009)},
  pages     = {312--319},
  month     = apr,
  year      = {2009},
  location  = {Clearwater (Florida), USA},
  date      = {April 16--18, 2009}
}
[2008,article] Y. Bengio and J. Sénécal, "Adaptive Importance Sampling to Accelerate Training of a Neural Probabilistic Language Model," IEEE Transactions on Neural Networks, vol. 19, iss. 4, pp. 713–722, 2008.
@ARTICLE{Bengio+Senecal2008,
  author   = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
  title    = {Adaptive Importance Sampling to Accelerate Training of a Neural Probabilistic Language Model},
  journal  = {IEEE Transactions on Neural Networks},
  volume   = {19},
  number   = {4},
  pages    = {713--722},
  year     = {2008},
  keywords = {Energy-based models, fast training, importance sampling, language modeling, Monte Carlo methods, probabilistic neural networks},
  abstract = {Previous work on statistical language modeling has shown that it is possible to train a feedforward neural network to approximate probabilities over sequences of words, resulting in significant error reduction when compared to standard baseline models based on $n$-grams. However, training the neural network model with the maximum-likelihood criterion requires computations proportional to the number of words in the vocabulary. In this paper, we introduce adaptive importance sampling as a way to accelerate training of the model. The idea is to use an adaptive $n$-gram model to track the conditional distributions produced by the neural network. We show that a very significant speedup can be obtained on standard problems.}
}
[2008,techreport] G. Desjardins and Y. Bengio, "Empirical Evaluation of Convolutional RBMs for Vision," Département d’Informatique et de Recherche Opérationnelle, Université de Montréal, 1327, 2008.
@TECHREPORT{Desjardins2008,
  author      = {Desjardins, Guillaume and Bengio, Yoshua},
  title       = {Empirical Evaluation of Convolutional RBMs for Vision},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  number      = {1327},
  year        = {2008},
  keywords    = {Convolutional Architectures, Deep Networks, RBM, Vision},
  abstract    = {Convolutional Neural Networks (CNN) have had great success in machine learning tasks involving vision and represent one of the early successes of deep networks. Local receptive fields and weight sharing make their architecture ideally suited for vision tasks by helping to enforce a prior based on our knowledge of natural images. This same prior could also be applied to recent developments in the field of deep networks, in order to tailor these new architectures for artificial vision. In this context, we show how the Restricted Boltzmann Machine (RBM), the building block of Deep Belief Networks (DBN), can be adapted to operate in a convolutional manner. We compare their performance to standard fully-connected RBMs on a simple visual learning task and show that the convolutional RBMs (CRBMs) converge to smaller values of the negative likelihood function. Our experiments also indicate that CRBMs are more efficient than standard RBMs trained on small image patches, with the CRBMs having faster convergence.}
}
[2008,misc] J. Bergstra, Y. Bengio, and J. Louradour, Image Classification using Higher-Order Neural Models, 2008.
@MISC{James+alsnowbird2008,
  author       = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome},
  title        = {Image Classification using Higher-Order Neural Models},
  howpublished = {The Learning Workshop (Snowbird, Utah)},
  month        = apr,
  year         = {2008},
  url          = {http://snowbird.djvuzone.org/2007/abstracts/161.pdf}
}
[2008,techreport] P. Vincent, H. Larochelle, Y. Bengio, and P. Manzagol, "Extracting and Composing Robust Features with Denoising Autoencoders," Département d’Informatique et Recherche Opérationnelle, Université de Montréal, 1316, 2008.
@TECHREPORT{VincentTR1316,
  author      = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine},
  title       = {Extracting and Composing Robust Features with Denoising Autoencoders},
  institution = {D{\'{e}}partement d'Informatique et Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  number      = {1316},
  month       = feb,
  year        = {2008},
  url         = {http://www.iro.umontreal.ca/~vincentp/Publications/denoising_autoencoders_tr1316.pdf},
  abstract    = {Previous work has shown that the difficulties in learning deep generative or discriminative models can be overcome by an initial unsupervised learning step that maps inputs to useful intermediate representations. We introduce and motivate a new training principle for unsupervised learning of a representation based on the idea of making the learned representations robust to partial corruption of the input pattern. This approach can be used to train autoencoders, and these denoising autoencoders can be stacked to initialize deep architectures. The algorithm can be motivated from a manifold learning and information theoretic perspective or from a generative model perspective. Comparative experiments clearly show the surprising advantage of corrupting the input of autoencoders on a pattern classification benchmark suite.}
}
[2008,inproceedings] P. Vincent, H. Larochelle, Y. Bengio, and P. Manzagol, "Extracting and Composing Robust Features with Denoising Autoencoders," in Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML’08), 2008, pp. 1096–1103.
@INPROCEEDINGS{VincentPLarochelleH2008,
  author   = {Vincent, Pascal and Larochelle, Hugo and Bengio, Yoshua and Manzagol, Pierre-Antoine},
  title    = {Extracting and Composing Robust Features with Denoising Autoencoders},
  pages    = {1096--1103},
  year     = {2008},
  crossref = {ICML08},
  abstract = {Previous work has shown that the difficulties in learning deep generative or discriminative models can be overcome by an initial unsupervised learning step that maps inputs to useful intermediate representations. We introduce and motivate a new training principle for unsupervised learning of a representation based on the idea of making the learned representations robust to partial corruption of the input pattern. This approach can be used to train autoencoders, and these denoising autoencoders can be stacked to initialize deep architectures. The algorithm can be motivated from a manifold learning and information theoretic perspective or from a generative model perspective. Comparative experiments clearly show the surprising advantage of corrupting the input of autoencoders on a pattern classification benchmark suite.}
}
[2008,inproceedings] H. Larochelle and Y. Bengio, "Classification using Discriminative Restricted Boltzmann Machines," in Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML’08), 2008, pp. 536–543.
@INPROCEEDINGS{Larochelle+Bengio2008,
  author    = {Larochelle, Hugo and Bengio, Yoshua},
  title     = {Classification using Discriminative Restricted {Boltzmann} Machines},
  booktitle = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
  year      = {2008},
  pages     = {536--543},
  location  = {Helsinki, Finland},
  crossref  = {ICML08},
  abstract  = {Recently, many applications for Restricted Boltzmann Machines (RBMs) have been developed for a large variety of learning problems. However, RBMs are usually used as feature extractors for another learning algorithm or to provide a good initialization for deep feedforward neural network classifiers, and are not considered as a standalone solution to classification problems. In this paper, we argue that RBMs provide a self-contained framework for deriving competitive nonlinear classifiers. We present an evaluation of different learning algorithms for RBMs which aim at introducing a discriminative component to RBM training and improve their performance as classifiers. This approach is simple in that RBMs are used directly to build a classifier, rather than as a stepping stone. Finally, we demonstrate how discriminative RBMs can also be successfully employed in a semi-supervised setting.}
}
[2008,proceedings] Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML’08), ACM, 2008.
@PROCEEDINGS{ICML08,
  editor    = {Cohen, William W. and McCallum, Andrew and Roweis, Sam T.},
  title     = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
  booktitle = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
  year      = {2008},
  publisher = {ACM}
}
[2008,misc] Y. Bengio, H. Larochelle, and J. Turian, Deep Woods, 2008.
@MISC{Yoshua+alsnowbird2008,
  author       = {Bengio, Yoshua and Larochelle, Hugo and Turian, Joseph},
  title        = {Deep Woods},
  howpublished = {Poster presented at the Learning@Snowbird Workshop, Snowbird, USA, 2008},
  year         = {2008}
}
[2008,phdthesis] N. Le Roux, "Avancées théoriques sur la représentation et l’optimisation des réseaux de neurones," PhD Thesis, Université de Montréal, 2008.
@PHDTHESIS{LeRouxPhD2008,
  author   = {Le Roux, Nicolas},
  title    = {Avanc{\'{e}}es th{\'{e}}oriques sur la repr{\'{e}}sentation et l'optimisation des r{\'{e}}seaux de neurones},
  school   = {Universit{\'{e}} de Montr{\'{e}}al},
  year     = {2008},
  month    = mar,
  url      = {http://www.iro.umontreal.ca/~lisa/pointeurs/LeRouxNicolasThese.pdf},
  abstract = {Les r{\'{e}}seaux de neurones artificiels ont {\'{e}}t{\'{e}} abondamment utilis{\'{e}}s dans la communaut{\'{e}} de l'apprentissage machine depuis les ann{\'{e}}es 80. Bien qu'ils aient {\'{e}}t{\'{e}} {\'{e}}tudi{\'{e}}s pour la premi{\`{e}}re fois il y a cinquante ans par Rosenblatt [68], ils ne furent r{\'{e}}ellement populaires qu'apr{\`{e}}s l'apparition de la r{\'{e}}tropropagation du gradient, en 1986 [71]. En 1989, il a {\'{e}}t{\'{e}} prouv{\'{e}} [44] qu'une classe sp{\'{e}}cifique de r{\'{e}}seaux de neurones (les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e) {\'{e}}tait suffisamment puissante pour pouvoir approximer presque n'importe quelle fonction avec une pr{\'{e}}cision arbitraire : le th{\'{e}}or{\`{e}}me d'approximation universelle. Toutefois, bien que ce th{\'{e}}or{\`{e}}me e{\^{u}}t pour cons{\'{e}}quence un int{\'{e}}r{\^{e}}t accru pour les r{\'{e}}seaux de neurones, il semblerait qu'aucun effort n'ait {\'{e}}t{\'{e}} fait pour profiter de cette propri{\'{e}}t{\'{e}}. En outre, l'optimisation des r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e n'est pas convexe. Cela a d{\'{e}}tourn{\'{e}} une grande partie de la communaut{\'{e}} vers d'autres algorithmes, comme par exemple les machines {\`{a}} noyau (machines {\`{a}} vecteurs de support et r{\'{e}}gression {\`{a}} noyau, entre autres). La premi{\`{e}}re partie de cette th{\`{e}}se pr{\'{e}}sentera les concepts d'apprentissage machine g{\'{e}}n{\'{e}}raux n{\'{e}}cessaires {\`{a}} la compr{\'{e}}hension des algorithmes utilis{\'{e}}s. La deuxi{\`{e}}me partie se focalisera plus sp{\'{e}}cifiquement sur les m{\'{e}}thodes {\`{a}} noyau et les r{\'{e}}seaux de neurones. La troisi{\`{e}}me partie de ce travail visera ensuite {\`{a}} {\'{e}}tudier les limitations des machines {\`{a}} noyaux et {\`{a}} comprendre les raisons pour lesquelles elles sont inadapt{\'{e}}es {\`{a}} certains probl{\`{e}}mes que nous avons {\`{a}} traiter. 
La quatri{\`{e}}me partie pr{\'{e}}sente une technique permettant d'optimiser les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e de mani{\`{e}}re convexe. Bien que cette technique s'av{\`{e}}re difficilement exploitable pour des probl{\`{e}}mes de grande taille, une version approch{\'{e}}e permet d'obtenir une bonne solution dans un temps raisonnable. La cinqui{\`{e}}me partie se concentre sur les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e infinie. Cela leur permet th{\'{e}}oriquement d'exploiter la propri{\'{e}}t{\'{e}} d'approximation universelle et ainsi d'approcher facilement une plus grande classe de fonctions. Toutefois, si ces deux variations sur les r{\'{e}}seaux de neurones {\`{a}} une couche cach{\'{e}}e leur conf{\`{e}}rent des propri{\'{e}}t{\'{e}}s int{\'{e}}ressantes, ces derniers ne peuvent extraire plus que des concepts de bas niveau. Les m{\'{e}}thodes {\`{a}} noyau souffrant des m{\^{e}}mes limites, aucun de ces deux types d'algorithmes ne peut appr{\'{e}}hender des probl{\`{e}}mes faisant appel {\`{a}} l'apprentissage de concepts de haut niveau. R{\'{e}}cemment sont apparus les Deep Belief Networks [39] qui sont des r{\'{e}}seaux de neurones {\`{a}} plusieurs couches cach{\'{e}}es entra{\^{\i}}n{\'{e}}s de mani{\`{e}}re efficace. Cette profondeur leur permet d'extraire des concepts de haut niveau et donc de r{\'{e}}aliser des t{\^{a}}ches hors de port{\'{e}}e des algorithmes conventionnels. La sixi{\`{e}}me partie {\'{e}}tudie des propri{\'{e}}t{\'{e}}s de ces r{\'{e}}seaux profonds. Les probl{\`{e}}mes que l'on rencontre actuellement n{\'{e}}cessitent non seulement des algorithmes capables d'extraire des concepts de haut niveau, mais {\'{e}}galement des m{\'{e}}thodes d'optimisation capables de traiter l'immense quantit{\'{e}} de donn{\'{e}}es parfois disponibles, si possible en temps r{\'{e}}el. La septi{\`{e}}me partie est donc la pr{\'{e}}sentation d'une nouvelle technique permettant une optimisation plus rapide.}
}
[2008,article] N. Le Roux and Y. Bengio, "Representational Power of Restricted Boltzmann Machines and Deep Belief Networks," Neural Computation, vol. 20, iss. 6, pp. 1631–1649, 2008.
@ARTICLE{LeRouxBengio2008,
  author   = {Le Roux, Nicolas and Bengio, Yoshua},
  title    = {Representational Power of Restricted {Boltzmann} Machines and Deep Belief Networks},
  journal  = {Neural Computation},
  volume   = {20},
  number   = {6},
  year     = {2008},
  month    = jun,
  pages    = {1631--1649},
  abstract = {Deep Belief Networks (DBN) are generative neural network models with many layers of hidden explanatory factors, recently introduced by Hinton et al., along with a greedy layer-wise unsupervised learning algorithm. The building block of a DBN is a probabilistic model called a Restricted Boltzmann Machine (RBM), used to represent one layer of the model. Restricted Boltzmann Machines are interesting because inference is easy in them, and because they have been successfully used as building blocks for training deeper models. We first prove that adding hidden units yields strictly improved modelling power, while a second theorem shows that RBMs are universal approximators of discrete distributions. We then study the question of whether DBNs with more layers are strictly more powerful in terms of representational power. This suggests a new and less greedy criterion for training RBMs within DBNs.}
}
[2007,proceedings] Proceedings of the 24th International Conference on Machine Learning (ICML’07), ACM, 2007.
@PROCEEDINGS{ICML07,
  editor    = {Ghahramani, Zoubin},
  title     = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
  booktitle = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
  publisher = {ACM},
  year      = {2007}
}
[2007,techreport] Y. Bengio and O. Delalleau, "Justifying and Generalizing Contrastive Divergence," Département d’Informatique et Recherche Opérationnelle, Université de Montréal, 1311, 2007.
@TECHREPORT{Bengio+DelalleauTR2007,
  author      = {Bengio, Yoshua and Delalleau, Olivier},
  keywords    = {Contrastive Divergence, Restricted Boltzmann Machine},
  title       = {Justifying and Generalizing Contrastive Divergence},
  number      = {1311},
  year        = {2007},
  institution = {D{\'{e}}partement d'Informatique et Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  abstract    = {We study an expansion of the log-likelihood in undirected graphical models such as the Restricted Boltzmann Machine (RBM), where each term in the expansion is associated with a sample in a Gibbs chain alternating between two random variables (the visible vector and the hidden vector, in RBMs). We are particularly interested in estimators of the gradient of the log-likelihood obtained through this expansion. We show that its terms converge to zero, justifying the use of a truncation, i.e. running only a short Gibbs chain, which is the main idea behind the Contrastive Divergence approximation of the log-likelihood gradient. By truncating even more, we obtain a stochastic reconstruction error, related through a mean-field approximation to the reconstruction error often used to train autoassociators and stacked autoassociators. The derivation is not specific to the particular parametric forms used in RBMs, and only requires convergence of the Gibbs chain.}
}
[2007,inproceedings] H. Larochelle, D. Erhan, A. Courville, J. Bergstra, and Y. Bengio, "An Empirical Evaluation of Deep Architectures on Problems with Many Factors of Variation," in Proceedings of the 24th International Conference on Machine Learning (ICML’07), 2007, pp. 473–480.
@INPROCEEDINGS{LarochelleH2007,
  author    = {Larochelle, Hugo and Erhan, Dumitru and Courville, Aaron and Bergstra, James and Bengio, Yoshua},
  title     = {An Empirical Evaluation of Deep Architectures on Problems with Many Factors of Variation},
  booktitle = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
  year      = {2007},
  pages     = {473--480},
  location  = {Corvallis, OR},
  url       = {http://oregonstate.edu/conferences/icml2007/paperlist.html},
  doi       = {10.1145/1273496.1273556},
  crossref  = {ICML07},
  abstract  = {Recently, several learning algorithms relying on models with deep architectures have been proposed. Though they have demonstrated impressive performance, to date, they have only been evaluated on relatively simple problems such as digit recognition in a controlled environment, for which many machine learning algorithms already report reasonable results. Here, we present a series of experiments which indicate that these models show promise in solving harder learning problems that exhibit many factors of variation. These models are compared with well-established algorithms such as Support Vector Machines and single hidden-layer feedforward neural networks.}
}
[2007,incollection] Y. Bengio and Y. LeCun, "Scaling Learning Algorithms towards AI," in Large-Scale Kernel Machines, Bottou, L., Chapelle, O., DeCoste, D., and Weston, J., Eds., MIT Press, 2007.
@INCOLLECTION{Bengio+chapter2007,
  author    = {Bengio, Yoshua and {LeCun}, Yann},
  editor    = {Bottou, L{\'{e}}on and Chapelle, Olivier and DeCoste, Dennis and Weston, Jason},
  title     = {Scaling Learning Algorithms towards {AI}},
  booktitle = {Large-Scale Kernel Machines},
  year      = {2007},
  publisher = {MIT Press},
  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+lecun_chapter2007.pdf},
  abstract  = {One long-term goal of machine learning research is to produce methods that are applicable to highly complex tasks, such as perception (vision, audition), reasoning, intelligent control, and other artificially intelligent behaviors. We argue that in order to progress toward this goal, the Machine Learning community must endeavor to discover algorithms that can learn highly complex functions, with minimal need for prior knowledge, and with minimal human intervention. We present mathematical and empirical evidence suggesting that many popular approaches to nonparametric learning, particularly kernel methods, are fundamentally limited in their ability to learn complex high-dimensional functions. Our analysis focuses on two problems. First, kernel machines are shallow architectures, in which one large layer of simple template matchers is followed by a single layer of trainable coefficients. We argue that shallow architectures can be very inefficient in terms of required number of computational elements and examples. Second, we analyze a limitation of kernel machines with a local kernel, linked to the curse of dimensionality, that applies to supervised, unsupervised (manifold learning) and semi-supervised kernel machines. Using empirical results on invariant image recognition tasks, kernel methods are compared with deep architectures, in which lower-level features or concepts are progressively combined into more abstract and higher-level representations. We argue that deep architectures have the potential to generalize in nonlocal ways, i.e., beyond immediate neighbors, and that this is crucial in order to make progress on the kind of complex tasks required for artificial intelligence.},
  cat       = {B},
  topics    = {HighDimensional},
}
[2007,incollection] Y. Bengio, "On the Challenge of Learning Complex Functions," in Computational Neuroscience: Theoretical Insights into Brain Function, Cisek, P., Kalaska, J., and Drew, T., Eds., Elsevier, 2007.
@INCOLLECTION{Bengio2007,
  author    = {Bengio, Yoshua},
  editor    = {Cisek, Paul and Kalaska, John and Drew, Trevor},
  title     = {On the Challenge of Learning Complex Functions},
  booktitle = {Computational Neuroscience: Theoretical Insights into Brain Function},
  series    = {Progress in Brain Research},
  year      = {2007},
  publisher = {Elsevier},
  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/PBR_chapter.pdf},
  abstract  = {A common goal of computational neuroscience and of artificial intelligence research based on statistical learning algorithms is the discovery and understanding of computational principles that could explain what we consider adaptive intelligence, in animals as well as in machines. This chapter focuses on what is required for the learning of complex behaviors. We believe it involves the learning of highly varying functions, in a mathematical sense. We bring forward two types of arguments which convey the message that many currently popular machine learning approaches to learning flexible functions have fundamental limitations that render them inappropriate for learning highly varying functions. The first issue concerns the representation of such functions with what we call shallow model architectures. We discuss limitations of shallow architectures, such as so-called kernel machines, boosting algorithms, and one-hidden-layer artificial neural networks. The second issue is more focused and concerns kernel machines with a local kernel (the type used most often in practice), that act like a collection of template matching units. We present mathematical results on such computational architectures showing that they have a limitation similar to those already proved for older nonparametric methods, and connected to the so-called curse of dimensionality. Though it has long been believed that efficient learning in deep architectures is difficult, recently proposed computational principles for learning in deep architectures may offer a breakthrough.}
}
[2007,techreport] N. Le Roux and Y. Bengio, "Representational Power of Restricted Boltzmann Machines and Deep Belief Networks," Département d’Informatique et de Recherche Opérationnelle, Université de Montréal, Montréal (QC) Canada, 1294, 2007.
@TECHREPORT{LeRouxBengio2007TR,
  author      = {Le Roux, Nicolas and Bengio, Yoshua},
  title       = {Representational Power of Restricted {Boltzmann} Machines and Deep Belief Networks},
  number      = {1294},
  year        = {2007},
  month       = apr,
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  address     = {Montr{\'{e}}al (QC) Canada},
  abstract    = {Deep Belief Networks (DBN) are generative neural network models with many layers of hidden explanatory factors, recently introduced by Hinton et al., along with a greedy layer-wise unsupervised learning algorithm. The building block of a DBN is a probabilistic model called a Restricted Boltzmann Machine (RBM), used to represent one layer of the model. Restricted Boltzmann Machines are interesting because inference is easy in them, and because they have been successfully used as building blocks for training deeper models. We first prove that adding hidden units yields strictly improved modeling power, while a second theorem shows that RBMs are universal approximators of discrete distributions. We then study the question of whether DBNs with more layers are strictly more powerful in terms of representational power. This suggests a new and less greedy criterion for training RBMs within DBNs.}
}
[2007,techreport] Y. Bengio, "Learning deep architectures for AI," Dept. IRO, Universite de Montreal, 1312, 2007.
@TECHREPORT{BengioTR1312,
  author      = {Bengio, Yoshua},
  title       = {Learning deep architectures for {AI}},
  number      = {1312},
  year        = {2007},
  institution = {Dept. IRO, Universite de Montreal},
  note        = {Preliminary version of journal article with the same title appearing in Foundations and Trends in Machine Learning (2009)},
  url         = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1312.pdf},
  abstract    = {Theoretical results strongly suggest that in order to learn the kind of complicated functions that can represent high-level abstractions (e.g. in vision, language, and other AI-level tasks), one may need deep architectures. Deep architectures are composed of multiple levels of nonlinear operations, such as in neural nets with many hidden layers. Searching the parameter space of deep architectures is a difficult optimization task, but learning algorithms such as those for Deep Belief Networks have recently been proposed to tackle this problem with notable success, beating the state-of-the-art in certain areas. This paper discusses the motivations and principles regarding learning algorithms for deep architectures and in particular for those based on unsupervised learning such as Deep Belief Networks, using as building blocks single-layer models such as Restricted Boltzmann Machines.}
}
[2007,inproceedings] Y. Bengio, P. Lamblin, D. Popovici, and H. Larochelle, "Greedy Layer-Wise Training of Deep Networks," in Advances in Neural Information Processing Systems 19 (NIPS’06), 2007, pp. 153–160.
@INPROCEEDINGS{Bengionips2006,
  author    = {Bengio, Yoshua and Lamblin, Pascal and Popovici, Dan and Larochelle, Hugo},
  title     = {Greedy Layer-Wise Training of Deep Networks},
  booktitle = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
  year      = {2007},
  pages     = {153--160},
  url       = {http://www.iro.umontreal.ca/~lisa/pointeurs/BengioNips2006All.pdf},
  crossref  = {NIPS19},
  abstract  = {Complexity theory of circuits strongly suggests that deep architectures can be much more efficient (sometimes exponentially) than shallow architectures, in terms of computational elements required to represent some functions. Deep multilayer neural networks have many levels of nonlinearities allowing them to compactly represent highly nonlinear and highly-varying functions. However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization appears to often get stuck in poor solutions. Hinton et al. recently introduced a greedy layer-wise unsupervised learning algorithm for Deep Belief Networks (DBN), a generative model with many layers of hidden causal variables. In the context of the above optimization problem, we study this algorithm empirically and explore variants to better understand its success and extend it to cases where the inputs are continuous or where the structure of the input distribution is not revealing enough about the variable to be predicted in a supervised task. Our experiments also confirm the hypothesis that the greedy layer-wise unsupervised training strategy mostly helps the optimization, by initializing weights in a region near a good local minimum, giving rise to internal distributed representations that are high-level abstractions of the input, bringing better generalization.}
}
[2007,techreport] Y. Bengio, O. Delalleau, and C. Simard, "Decision Trees do not Generalize to New Variations," Département d’informatique et recherche opérationnelle, Université de Montréal, 1304, 2007.
@TECHREPORT{Bengio+altreecurse2007,
  author      = {Bengio, Yoshua and Delalleau, Olivier and Simard, Clarence},
  title       = {Decision Trees do not Generalize to New Variations},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  number      = {1304},
  year        = {2007},
  month       = jun,
  url         = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+altr1304.pdf}
}
[2007,proceedings] Advances in Neural Information Processing Systems 19 (NIPS’06), MIT Press, 2007.
@PROCEEDINGS{NIPS19,
  editor    = {Sch{\"{o}}lkopf, Bernhard and Platt, John and Hoffman, Thomas},
  title     = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
  booktitle = {Advances in Neural Information Processing Systems 19 (NIPS'06)},
  year      = {2007},
  publisher = {MIT Press}
}
[1,article] ," Journal of Machine Learning Research, 1.
@ARTICLE{JMLR,
  journal = {Journal of Machine Learning Research},
  year    = {1}
}