% Bibliography database.
% Layout: one entry per block, one field per line, fields ordered
% author, title, venue, year, identifiers. Citation keys are stable
% identifiers used by the citing documents and must not be renamed.
% Text outside of entries (such as these lines) is ignored by BibTeX and Biber.

% Journal article: dropout regularization (JMLR volume 15).
@article{srivastava2014dropout,
  author  = {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and
             Sutskever, Ilya and Salakhutdinov, Ruslan},
  title   = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
  journal = {Journal of Machine Learning Research},
  volume  = {15},
  number  = {56},
  pages   = {1929--1958},
  year    = {2014},
  url     = {https://www.jmlr.org/papers/v15/srivastava14a.html},
}

% Conference paper: the Transformer architecture (NeurIPS proceedings, volume 30).
% The accented initial in "Lukasz" is written as a brace-wrapped control
% sequence so that BibTeX treats it as a single letter when sorting.
@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and
               Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and
               Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention Is All You Need},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {30},
  pages     = {5998--6008},
  publisher = {Curran Associates, Inc.},
  year      = {2017},
  url       = {https://arxiv.org/abs/1706.03762},
}

% arXiv preprint: the identifier lives in the eprint fields rather than in a
% journal field, so that eprint-aware styles can format and link it.
@misc{kaplan2020scaling,
  author        = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and
                   Brown, Tom B. and Chess, Benjamin and Child, Rewon and
                   Gray, Scott and Radford, Alec and Wu, Jeffrey and
                   Amodei, Dario},
  title         = {Scaling Laws for Neural Language Models},
  year          = {2020},
  eprint        = {2001.08361},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2001.08361},
}

% arXiv preprint. Surnames with particles use the "von Last, First" form
% ("de Las Casas", "van den Driessche") so they are parsed and sorted correctly.
@misc{hoffmann2022training,
  author        = {Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur and
                   Buchatskaya, Elena and Cai, Trevor and Rutherford, Eliza and
                   de Las Casas, Diego and Hendricks, Lisa Anne and
                   Welbl, Johannes and Clark, Aidan and Hennigan, Tom and
                   Noland, Eric and Millican, Katie and
                   van den Driessche, George and Damoc, Bogdan and
                   Guy, Aurelia and Osindero, Simon and Simonyan, Karen and
                   Elsen, Erich and Rae, Jack W. and Vinyals, Oriol and
                   Sifre, Laurent},
  title         = {Training Compute-Optimal Large Language Models},
  year          = {2022},
  eprint        = {2203.15556},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2203.15556},
}

% Conference paper (ICLR 2017); the url points at the arXiv version.
@inproceedings{merity2017pointer,
  author    = {Merity, Stephen and Xiong, Caiming and Bradbury, James and
               Socher, Richard},
  title     = {Pointer Sentinel Mixture Models},
  booktitle = {International Conference on Learning Representations},
  year      = {2017},
  url       = {https://arxiv.org/abs/1609.07843},
}

% arXiv preprint. The camel-case name and the proper noun in the title are
% brace-protected so sentence-casing styles do not lowercase them.
@misc{eldan2023tinystories,
  author        = {Eldan, Ronen and Li, Yuanzhi},
  title         = {{TinyStories}: How Small Can Language Models Be and Still
                   Speak Coherent {English}?},
  year          = {2023},
  eprint        = {2305.07759},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2305.07759},
}

% Software repository. The raw URL is stored in the url field (the style adds
% the link formatting); the license and access date are kept in the note.
@misc{karpathy_nanochat,
  author = {Karpathy, Andrej},
  title  = {{nanochat}},
  year   = {2025},
  url    = {https://github.com/karpathy/nanochat},
  note   = {MIT-licensed repository, accessed 2026-05-29},
}