| \begin{thebibliography}{9} |
| \providecommand{\natexlab}[1]{#1} |
| \providecommand{\url}[1]{\texttt{#1}} |
| \expandafter\ifx\csname urlstyle\endcsname\relax |
| \providecommand{\doi}[1]{doi: #1}\else |
| \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi |
|
|
| \bibitem[Bahdanau et~al.(2014)Bahdanau, Cho, and Bengio]{bahdanau2014neural} |
| Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. |
| \newblock Neural machine translation by jointly learning to align and |
| translate. |
| \newblock \emph{arXiv preprint arXiv:1409.0473}, 2014. |
|
|
| \bibitem[Glorot \& Bengio(2010)Glorot and Bengio]{Glorot2010UnderstandingTD} |
| Xavier Glorot and Yoshua Bengio. |
| \newblock Understanding the difficulty of training deep feedforward neural |
| networks. |
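| \newblock In \emph{Proceedings of the Thirteenth International Conference on |
| Artificial Intelligence and Statistics}, pp.\ 249--256, 2010. |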
|
|
| \bibitem[Goodfellow et~al.(2016)Goodfellow, Bengio, and |
| Courville]{goodfellow2016deep} |
| Ian Goodfellow, Yoshua Bengio, and Aaron Courville. |
| \newblock \emph{Deep learning}, volume~1. |
| \newblock MIT Press, 2016. |
|
|
| \bibitem[He et~al.(2015)He, Zhang, Ren, and Sun]{He2015DelvingDI} |
| Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. |
| \newblock Delving deep into rectifiers: Surpassing human-level performance on |
| ImageNet classification. |
| \newblock \emph{2015 IEEE International Conference on Computer Vision (ICCV)}, |
| pp.\ 1026--1034, 2015. |
|
|
| \bibitem[Kingma \& Ba(2014)Kingma and Ba]{kingma2014adam} |
| Diederik~P. Kingma and Jimmy Ba. |
| \newblock Adam: A method for stochastic optimization. |
| \newblock \emph{arXiv preprint arXiv:1412.6980}, 2014. |
|
|
| \bibitem[Loshchilov \& Hutter(2017)Loshchilov and Hutter]{loshchilov2017adamw} |
| Ilya Loshchilov and Frank Hutter. |
| \newblock Decoupled weight decay regularization. |
| \newblock \emph{arXiv preprint arXiv:1711.05101}, 2017. |
|
|
| \bibitem[Power et~al.(2022)Power, Burda, Edwards, Babuschkin, and |
| Misra]{power2022grokking} |
| Alethea Power, Yuri Burda, Harri Edwards, Igor Babuschkin, and Vedant Misra. |
| \newblock Grokking: Generalization beyond overfitting on small algorithmic |
| datasets. |
| \newblock \emph{arXiv preprint arXiv:2201.02177}, 2022. |
|
|
| \bibitem[Saxe et~al.(2013)Saxe, McClelland, and Ganguli]{Saxe2013ExactST} |
| Andrew~M. Saxe, James~L. McClelland, and Surya Ganguli. |
| \newblock Exact solutions to the nonlinear dynamics of learning in deep linear |
| neural networks. |
| \newblock \emph{CoRR}, abs/1312.6120, 2013. |
|
|
| \bibitem[Vaswani et~al.(2017)Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez, |
| Kaiser, and Polosukhin]{vaswani2017attention} |
| Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, |
| Aidan~N. Gomez, {\L}ukasz Kaiser, and Illia Polosukhin. |
| \newblock Attention is all you need. |
| \newblock \emph{Advances in neural information processing systems}, 30, 2017. |
|
|
| \end{thebibliography} |
|
|