Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
% FinePhrase blog post bibliography

% Datasets
@article{c4,
  title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
  author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
  journal = {Journal of Machine Learning Research},
  volume = {21},
  number = {140},
  pages = {1--67},
  year = {2020},
  url = {https://arxiv.org/abs/1910.10683}
}
% arXiv preprint: identified through the eprint fields rather than a journal name.
@misc{thepile,
  title = {The Pile: An 800GB Dataset of Diverse Text for Language Modeling},
  author = {Leo Gao and Stella Biderman and Sid Black and Laurence Golding and Travis Hoppe and Charles Foster and Jason Phang and Horace He and Anish Thite and Noa Nabeshima and Shawn Presser and Connor Leahy},
  year = {2020},
  eprint = {2101.00027},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2101.00027}
}
@misc{datacomp,
  title = {DataComp-LM: In search of the next generation of training sets for language models},
  author = {Jeffrey Li and Alex Fang and Georgios Smyrnis and Maor Ivgi and Matt Jordan and Samir Gadre and Hritik Bansal and Etash Guha and Sedrick Keh and Kushal Arora and Saurabh Garg and Rui Xin and Niklas Muennighoff and Reinhard Heckel and Jean Mercat and Mayee Chen and Suchin Gururangan and Mitchell Wortsman and Alon Albalak and Yonatan Bitton and Marianna Nezhurina and Amro Abbas and Cheng-Yu Hsieh and Dhruba Ghosh and Josh Gardner and Maciej Kilian and Hanlin Zhang and Rulin Shao and Sarah Pratt and Sunny Sanyal and Gabriel Ilharco and Giannis Daras and Kalyani Marathe and Aaron Gokaslan and Jieyu Zhang and Khyathi Chandu and Thao Nguyen and Igor Vasiljevic and Sham Kakade and Shuran Song and Sujay Sanghavi and Fartash Faghri and Sewoong Oh and Luke Zettlemoyer and Kyle Lo and Alaaeldin El-Nouby and Hadi Pouransari and Alexander Toshev and Stephanie Wang and Dirk Groeneveld and Luca Soldaini and Pang Wei Koh and Jenia Jitsev and Thomas Kollar and Alexandros G. Dimakis and Yair Carmon and Achal Dave and Ludwig Schmidt and Vaishaal Shankar},
  year = {2025},
  eprint = {2406.11794},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
  url = {https://arxiv.org/abs/2406.11794}
}
% Compound and particle surnames use "Last, First" form so BibTeX does not split them.
@misc{fineweb,
  title = {The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale},
  author = {Guilherme Penedo and Hynek Kydlíček and Ben Allal, Loubna and Anton Lozhkov and Margaret Mitchell and Colin Raffel and von Werra, Leandro and Thomas Wolf},
  year = {2024},
  eprint = {2406.17557},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2406.17557}
}
@misc{ultrafineweb,
  title = {Ultra-FineWeb: Efficient Data Filtering and Verification for High-Quality LLM Training Data},
  author = {Yudong Wang and Zixuan Fu and Jie Cai and Peijun Tang and Hongya Lyu and Yewei Fang and Zhi Zheng and Jie Zhou and Guoyang Zeng and Chaojun Xiao and Xu Han and Zhiyuan Liu},
  year = {2025},
  eprint = {2505.05427},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2505.05427}
}
@misc{nemotroncc,
  title = {Nemotron-CC: Transforming Common Crawl into a Refined Long-Horizon Pretraining Dataset},
  author = {Dan Su and Kezhi Kong and Ying Lin and Joseph Jennings and Brandon Norick and Markus Kliegl and Mostofa Patwary and Mohammad Shoeybi and Bryan Catanzaro},
  year = {2024},
  eprint = {2412.02595},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2412.02595}
}
% Dataset release; the note points at the accompanying blog post.
@software{cosmopedia,
  title = {Cosmopedia},
  author = {Ben Allal, Loubna and Lozhkov, Anton and Penedo, Guilherme and Wolf, Thomas and von Werra, Leandro},
  year = {2024},
  url = {https://huggingface.co/datasets/HuggingFaceTB/cosmopedia},
  note = {Hugging Face Blog: \url{https://huggingface.co/blog/cosmopedia}}
}
@misc{synthpleias,
  title = {SYNTH: The New Data Frontier},
  author = {{PleIAs}},
  year = {2025},
  url = {https://pleias.fr/blog/blogsynth-the-new-data-frontier},
  note = {Blog post}
}
@misc{s1k,
  title = {s1: Simple Test-Time Scaling},
  author = {Niklas Muennighoff and Zitong Yang and Weijia Shi and Xiang Lisa Li and Li Fei-Fei and Hannaneh Hajishirzi and Luke Zettlemoyer and Percy Liang and Emmanuel Candès and Tatsunori Hashimoto},
  year = {2025},
  eprint = {2501.19393},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2501.19393}
}

% Synthetic data methods
@inproceedings{demystifyingsynth,
  title = {Demystifying Synthetic Data in LLM Pre-training: A Systematic Study of Scaling Laws, Benefits, and Pitfalls},
  author = {Feiyang Kang and Newsha Ardalani and Michael Kuchnik and Youssef Emad and Mostafa Elhoushi and Shubhabrata Sengupta and Shang-Wen Li and Ramya Raghavendra and Ruoxi Jia and Carole-Jean Wu},
  booktitle = {Conference on Empirical Methods in Natural Language Processing},
  year = {2025},
  url = {https://aclanthology.org/2025.emnlp-main.544/}
}
@misc{syntheticcpt,
  title = {Synthetic Continued Pretraining},
  author = {Zitong Yang and Neil Band and Shuangping Li and Emmanuel Candès and Tatsunori Hashimoto},
  year = {2024},
  eprint = {2409.07431},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2409.07431}
}
@inproceedings{wrap,
  title = {Rephrasing the Web: A Recipe for Compute and Data-Efficient Language Modeling},
  author = {Pratyush Maini and Skyler Seto and Richard He Bai and David Grangier and Yizhe Zhang and Navdeep Jaitly},
  booktitle = {Annual Meeting of the Association for Computational Linguistics},
  year = {2024},
  eprint = {2401.16380},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2401.16380}
}
@misc{rewire,
  title = {Recycling the Web: A Method to Enhance Pre-training Data Quality and Quantity for Language Models},
  author = {Thao Nguyen and Yang Li and Olga Golovneva and Luke Zettlemoyer and Sewoong Oh and Ludwig Schmidt and Xian Li},
  year = {2025},
  eprint = {2506.04689},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2506.04689}
}
@misc{beyondweb,
  title = {BeyondWeb: Lessons from Scaling Synthetic Data for Trillion-scale Pretraining},
  author = {Pratyush Maini and Vineeth Dorna and Parth Doshi and Aldo Carranza and Fan Pan and Jack Urbanek and Paul Burstein and Alex Fang and Alvin Deng and Amro Abbas and Brett Larsen and Cody Blakeney and Charvi Bannur and Christina Baek and Darren Teh and David Schwab and Haakon Mongstad and Haoli Yin and Josh Wills and Kaleigh Mentzer and Luke Merrick and Ricardo Monti and Rishabh Adiga and Siddharth Joshi and Spandan Das and Zhengping Wang and Bogdan Gaza and Ari Morcos and Matthew Leavitt},
  year = {2025},
  eprint = {2508.10975},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
  url = {https://arxiv.org/abs/2508.10975}
}
@article{modelcollapse,
  title = {AI models collapse when trained on recursively generated data},
  author = {Shumailov, Ilia and Shumaylov, Zakhar and Zhao, Yiren and Papernot, Nicolas and Anderson, Ross and Gal, Yarin},
  journal = {Nature},
  volume = {631},
  pages = {755--759},
  year = {2024},
  doi = {10.1038/s41586-024-07566-y}
}

% Models
% Compound surnames use "Last, First" form so BibTeX does not split them.
@misc{smollm2,
  title = {SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model},
  author = {Ben Allal, Loubna and Anton Lozhkov and Elie Bakouch and Martín Blázquez, Gabriel and Guilherme Penedo and Lewis Tunstall and Andrés Marafioti and Hynek Kydlíček and Piqueres Lajarín, Agustín and Vaibhav Srivastav and Joshua Lochner and Caleb Fahlgren and Xuan-Son Nguyen and Clémentine Fourrier and Ben Burtenshaw and Hugo Larcher and Haojun Zhao and Cyril Zakka and Mathieu Morlon and Colin Raffel and von Werra, Leandro and Thomas Wolf},
  year = {2025},
  eprint = {2502.02737},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2502.02737}
}
@misc{smollm3,
  title = {SmolLM3: Smol, Multilingual, Long-Context Reasoner},
  author = {{Hugging Face}},
  year = {2025},
  url = {https://huggingface.co/blog/smollm3},
  note = {Blog post}
}
% The team name is braced so it is treated as a single corporate author.
@misc{glm45,
  title = {GLM-4.5: Agentic, Reasoning, and Coding (ARC) Foundation Models},
  author = {{GLM-4.5 Team} and Aohan Zeng and Xin Lv and Qinkai Zheng and Zhenyu Hou and Bin Chen and Chengxing Xie and Cunxiang Wang and Da Yin and Hao Zeng and Jiajie Zhang and Kedong Wang and Lucen Zhong and Mingdao Liu and Rui Lu and Shulin Cao and Xiaohan Zhang and Xuancheng Huang and Yao Wei and Yean Cheng and Yifan An and Yilin Niu and Yuanhao Wen and Yushi Bai and Zhengxiao Du and Zihan Wang and Zilin Zhu and Bohan Zhang and Bosi Wen and Bowen Wu and Bowen Xu and Can Huang and Casey Zhao and Changpeng Cai and Chao Yu and Chen Li and Chendi Ge and Chenghua Huang and Chenhui Zhang and Chenxi Xu and Chenzheng Zhu and Chuang Li and Congfeng Yin and Daoyan Lin and Dayong Yang and Dazhi Jiang and Ding Ai and Erle Zhu and Fei Wang and Gengzheng Pan and Guo Wang and Hailong Sun and Haitao Li and Haiyang Li and Haiyi Hu and Hanyu Zhang and Hao Peng and Hao Tai and Haoke Zhang and Haoran Wang and Haoyu Yang and He Liu and He Zhao and Hongwei Liu and Hongxi Yan and Huan Liu and Huilong Chen and Ji Li and Jiajing Zhao and Jiamin Ren and Jian Jiao and Jiani Zhao and Jianyang Yan and Jiaqi Wang and Jiayi Gui and Jiayue Zhao and Jie Liu and Jijie Li and Jing Li and Jing Lu and Jingsen Wang and Jingwei Yuan and Jingxuan Li and Jingzhao Du and Jinhua Du and Jinxin Liu and Junkai Zhi and Junli Gao and Ke Wang and Lekang Yang and Liang Xu and Lin Fan and Lindong Wu and Lintao Ding and Lu Wang and Man Zhang and Minghao Li and Minghuan Xu and Mingming Zhao and Mingshu Zhai and Pengfan Du and Qian Dong and Shangde Lei and Shangqing Tu and Shangtong Yang and Shaoyou Lu and Shijie Li and Shuang Li and Shuang-Li and Shuxun Yang and Sibo Yi and Tianshu Yu and Wei Tian and Weihan Wang and Wenbo Yu and Weng Lam Tam and Wenjie Liang and Wentao Liu and Xiao Wang and Xiaohan Jia and Xiaotao Gu and Xiaoying Ling and Xin Wang and Xing Fan and Xingru Pan and Xinyuan Zhang and Xinze Zhang and Xiuqing Fu and Xunkai Zhang and Yabo Xu and Yandong Wu and Yida Lu and Yidong Wang and Yilin Zhou and Yiming Pan and Ying Zhang and Yingli Wang and Yingru Li and Yinpei Su and Yipeng Geng and Yitong Zhu and Yongkun Yang and Yuhang Li and Yuhao Wu and Yujiang Li and Yunan Liu and Yunqing Wang and Yuntao Li and Yuxuan Zhang and Zezhen Liu and Zhen Yang and Zhengda Zhou and Zhongpei Qiao and Zhuoer Feng and Zhuorui Liu and Zichen Zhang and Zihan Wang and Zijun Yao and Zikang Wang and Ziqiang Liu and Ziwei Chai and Zixuan Li and Zuodong Zhao and Wenguang Chen and Jidong Zhai and Bin Xu and Minlie Huang and Hongning Wang and Juanzi Li and Yuxiao Dong and Jie Tang},
  year = {2025},
  eprint = {2508.06471},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2508.06471}
}
@misc{nemotron3,
  title = {NVIDIA Nemotron 3: Efficient and Open Intelligence},
  author = {{NVIDIA}},
  year = {2025},
  eprint = {2512.20856},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2512.20856}
}
@misc{qwen,
  title = {Qwen Technical Report},
  author = {Jinze Bai and Shuai Bai and Yunfei Chu and Zeyu Cui and Kai Dang and Xiaodong Deng and Yang Fan and Wenbin Ge and Yu Han and Fei Huang and Binyuan Hui and Luo Ji and Mei Li and Junyang Lin and Runji Lin and Dayiheng Liu and Gao Liu and Chengqiang Lu and Keming Lu and Jianxin Ma and Rui Men and Xingzhang Ren and Xuancheng Ren and Chuanqi Tan and Sinan Tan and Jianhong Tu and Peng Wang and Shijie Wang and Wei Wang and Shengguang Wu and Benfeng Xu and Jin Xu and An Yang and Hao Yang and Jian Yang and Shusheng Yang and Yang Yao and Bowen Yu and Hongyi Yuan and Zheng Yuan and Jianwei Zhang and Xingxuan Zhang and Yichang Zhang and Zhenru Zhang and Chang Zhou and Jingren Zhou and Xiaohuan Zhou and Tianhang Zhu},
  year = {2023},
  eprint = {2309.16609},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2309.16609}
}
% Author list is abbreviated; "and others" stands for the remaining contributors.
@misc{qwen2,
  title = {Qwen2 Technical Report},
  author = {An Yang and Baosong Yang and Binyuan Hui and Bo Zheng and Bowen Yu and Chang Zhou and Chengpeng Li and Chengyuan Li and Dayiheng Liu and Fei Huang and Guanting Dong and Haoran Wei and Huan Lin and Jialong Tang and Jialin Wang and Jian Yang and Jianhong Tu and Jianwei Zhang and Jianxin Ma and Jin Xu and Jingren Zhou and Junyang Lin and others},
  year = {2024},
  eprint = {2407.10671},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2407.10671}
}
% Author list is abbreviated; "and others" stands for the remaining contributors.
@misc{qwen25,
  title = {Qwen2.5 Technical Report},
  author = {An Yang and Baosong Yang and Beichen Zhang and Binyuan Hui and Bo Zheng and Bowen Yu and Chengyuan Li and Dayiheng Liu and Fei Huang and Haoran Wei and Huan Lin and Jian Yang and Jianhong Tu and Jianwei Zhang and Jianxin Yang and Junyang Lin and Jingren Zhou and others},
  year = {2024},
  eprint = {2412.15115},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2412.15115}
}
% Author list is abbreviated; "and others" stands for the remaining contributors.
@misc{qwen3,
  title = {Qwen3 Technical Report},
  author = {An Yang and Anfeng Li and Baosong Yang and Beichen Zhang and Binyuan Hui and Bo Zheng and Bowen Yu and Chang Gao and Chengen Huang and Chenxu Lv and Chujie Zheng and Dayiheng Liu and Fan Zhou and Fei Huang and Junyang Lin and Jingren Zhou and others},
  year = {2025},
  eprint = {2505.09388},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2505.09388}
}
% NOTE(review): author list replaced to match the arXiv 2412.08905 listing; verify against the paper.
@misc{phi4,
  title = {Phi-4 Technical Report},
  author = {Marah Abdin and Jyoti Aneja and Harkirat Behl and Sébastien Bubeck and Ronen Eldan and Suriya Gunasekar and Michael Harrison and Russell J. Hewett and Mojan Javaheripi and Piero Kauffmann and James R. Lee and Yin Tat Lee and Yuanzhi Li and Weishung Liu and Caio C. T. Mendes and Anh Nguyen and Eric Price and de Rosa, Gustavo and Olli Saarikivi and Adil Salim and Shital Shah and Xin Wang and Rachel Ward and Yue Wu and Dingli Yu and Cyril Zhang and Yi Zhang},
  year = {2024},
  eprint = {2412.08905},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2412.08905}
}
@misc{arceetrinitymanifesto,
  title = {The Trinity Manifesto},
  author = {Lucas Atkins},
  year = {2025},
  note = {Blog post},
  url = {https://www.arcee.ai/blog/the-trinity-manifesto}
}
@misc{arceetrinitylarge,
  title = {Trinity Large},
  author = {Lucas Atkins},
  year = {2026},
  note = {Blog post},
  url = {https://www.arcee.ai/blog/trinity-large}
}
% Author list is abbreviated; "and others" stands for the remaining contributors.
% NOTE(review): leading names follow the arXiv 2407.21783 listing order; verify against the paper.
@misc{llama3,
  title = {The Llama 3 Herd of Models},
  author = {Aaron Grattafiori and Abhimanyu Dubey and Abhinav Jauhri and Abhinav Pandey and Abhishek Kadian and Ahmad Al-Dahle and Aiesha Letman and Akhil Mathur and Alan Schelten and Alex Vaughan and others},
  year = {2024},
  eprint = {2407.21783},
  archiveprefix = {arXiv},
  primaryclass = {cs.AI},
  url = {https://arxiv.org/abs/2407.21783}
}
@misc{mixtral,
  title = {Mixtral of Experts},
  author = {Albert Q. Jiang and Alexandre Sablayrolles and Antoine Roux and Arthur Mensch and Blanche Savary and Chris Bamford and Devendra Singh Chaplot and Diego de las Casas and Emma Bou Hanna and Florian Bressand and Gianna Lengyel and Guillaume Bour and Guillaume Lample and L{\'e}lio Renard Lavaud and Lucile Saulnier and Marie-Anne Lachaux and Pierre Stock and Sandeep Subramanian and Sophia Yang and Szymon Antoniak and Teven Le Scao and Th{\'e}ophile Gervet and Thibaut Lavril and Thomas Wang and Timoth{\'e}e Lacroix and William El Sayed},
  year = {2024},
  eprint = {2401.04088},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
  url = {https://arxiv.org/abs/2401.04088}
}
@misc{deepseekr1,
  title = {DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning},
  author = {{DeepSeek-AI}},
  year = {2025},
  eprint = {2501.12948},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2501.12948}
}
@misc{gemma3,
  title = {Gemma 3 Technical Report},
  author = {{Gemma Team}},
  year = {2025},
  eprint = {2503.19786},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2503.19786}
}
@software{falcon3,
  title = {Falcon 3 Family of Open Models},
  author = {{Technology Innovation Institute}},
  year = {2024},
  url = {https://huggingface.co/blog/falcon3},
  note = {Hugging Face Blog}
}
@misc{granite3,
  title = {Granite 3.0 Language Models},
  author = {{IBM Granite Team}},
  year = {2024},
  url = {https://github.com/ibm-granite/granite-3.0-language-models},
  note = {Technical Report}
}
@misc{kimik2,
  title = {Kimi K2: Open Agentic Intelligence},
  author = {{Moonshot AI}},
  year = {2025},
  eprint = {2507.20534},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2507.20534}
}
@misc{gptoss,
  title = {gpt-oss-120b \& gpt-oss-20b Model Card},
  author = {{OpenAI}},
  year = {2025},
  eprint = {2508.10925},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2508.10925}
}

% Architecture
@inproceedings{gqa,
  title = {GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints},
  author = {Joshua Ainslie and James Lee-Thorp and Michiel de Jong and Yury Zemlyanskiy and Federico Lebrón and Sumit Sanghai},
  booktitle = {Conference on Empirical Methods in Natural Language Processing},
  year = {2023},
  eprint = {2305.13245},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2305.13245}
}
@article{rope,
  title = {RoFormer: Enhanced Transformer with Rotary Position Embedding},
  author = {Jianlin Su and Murtadha Ahmed and Yu Lu and Shengfeng Pan and Wen Bo and Yunfeng Liu},
  journal = {Neurocomputing},
  volume = {568},
  pages = {127063},
  year = {2024},
  doi = {10.1016/j.neucom.2023.127063}
}

% Inference
@inproceedings{vllm,
  title = {Efficient Memory Management for Large Language Model Serving with PagedAttention},
  author = {Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
  booktitle = {Proceedings of the 29th Symposium on Operating Systems Principles},
  year = {2023},
  eprint = {2309.06180},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
  url = {https://arxiv.org/abs/2309.06180}
}
@inproceedings{sglang,
  title = {SGLang: Efficient Execution of Structured Language Model Programs},
  author = {Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
  booktitle = {Advances in Neural Information Processing Systems},
  year = {2024},
  eprint = {2312.07104},
  archiveprefix = {arXiv},
  primaryclass = {cs.AI},
  url = {https://arxiv.org/abs/2312.07104}
}
@misc{flashattention2,
  title = {FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning},
  author = {Tri Dao},
  year = {2023},
  eprint = {2307.08691},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
  url = {https://arxiv.org/abs/2307.08691}
}
@misc{flashinfer,
  title = {FlashInfer: Efficient and Customizable Attention Engine for LLM Inference Serving},
  author = {Zihao Ye and Lequn Chen and Ruihang Lai and Yilong Zhao and Size Zheng and Junru Shao and Bohan Hou and Hongyi Jin and Yifei Zuo and Liangsheng Yin and Tianqi Chen and Luis Ceze},
  year = {2025},
  eprint = {2501.01005},
  archiveprefix = {arXiv},
  primaryclass = {cs.DC},
  url = {https://arxiv.org/abs/2501.01005}
}
@inproceedings{speculativedecoding,
  title = {Fast Inference from Transformers via Speculative Decoding},
  author = {Yaniv Leviathan and Matan Kalman and Yossi Matias},
  booktitle = {International Conference on Machine Learning},
  year = {2023},
  eprint = {2211.17192},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
  url = {https://arxiv.org/abs/2211.17192}
}
@misc{dflash,
  title = {DFlash: Block Diffusion for Flash Speculative Decoding},
  author = {Jian Chen and Yesheng Liang and Zhijian Liu},
  year = {2026},
  eprint = {2602.06036},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2602.06036}
}
% Corporate author is double-braced so BibTeX treats it as one name, not First/Last.
@misc{mercury2,
  title = {Introducing Mercury 2},
  author = {{Inception Labs}},
  year = {2026},
  note = {Blog post},
  url = {https://www.inceptionlabs.ai/blog/introducing-mercury-2}
}

% Training
@inproceedings{adamw,
  title = {Decoupled Weight Decay Regularization},
  author = {Ilya Loshchilov and Frank Hutter},
  booktitle = {International Conference on Learning Representations},
  year = {2019},
  eprint = {1711.05101},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
  url = {https://arxiv.org/abs/1711.05101}
}
@misc{minicpm,
  title = {MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies},
  author = {Shengding Hu and Yuge Tu and Xu Han and Chaoqun He and Ganqu Cui and Xiang Long and Zhi Zheng and Yewei Fang and Yuxiang Huang and Weilin Zhao and Xinrong Zhang and Zheng Leng Thai and Kaihuo Zhang and Chongyi Wang and Yuan Yao and Chenyang Zhao and Jie Zhou and Jie Cai and Zhongwu Zhai and Ning Ding and Chao Jia and Guoyang Zeng and Dahai Li and Zhiyuan Liu and Maosong Sun},
  year = {2024},
  eprint = {2404.06395},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2404.06395}
}

% Tools
@inproceedings{dspy,
  title = {DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines},
  author = {Omar Khattab and Arnav Singhvi and Paridhi Maheshwari and Zhiyuan Zhang and Keshav Santhanam and Sri Vardhamanan and Saiful Haq and Ashutosh Sharma and Thomas T. Joshi and Hanna Moazam and Heather Miller and Matei Zaharia and Christopher Potts},
  booktitle = {International Conference on Learning Representations},
  year = {2024},
  eprint = {2310.03714},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2310.03714}
}
@software{datatrove,
  title = {DataTrove: Large Scale Data Processing},
  author = {Guilherme Penedo and Hynek Kydlíček and Thomas Wolf and Leandro von Werra},
  year = {2024},
  url = {https://github.com/huggingface/datatrove},
  note = {GitHub repository}
}

% Benchmarks
@misc{arc,
  title = {Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
  author = {Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
  year = {2018},
  eprint = {1803.05457},
  archiveprefix = {arXiv},
  primaryclass = {cs.AI},
  url = {https://arxiv.org/abs/1803.05457}
}
@misc{hellaswag,
  title = {HellaSwag: Can a Machine Really Finish Your Sentence?},
  author = {Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
  year = {2019},
  eprint = {1905.07830},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/1905.07830}
}
@misc{mmluredux,
  title = {Are We Done with MMLU?},
  author = {Aryo Pradipta Gema and Joshua Ong Jun Leang and Giwon Hong and Alessio Devoto and Alberto Carlo Maria Mancino and Rohit Saxena and Xuanli He and Yu Zhao and Xiaotang Du and Mohammad Reza Ghasemi Madani and Claire Barale and Robert McHardy and Joshua Harris and Jean Kaddour and Emile van Krieken and Pasquale Minervini},
  year = {2024},
  eprint = {2406.04127},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2406.04127}
}
@inproceedings{xcsqa,
  title = {Common Sense Beyond English: Evaluating and Improving Multilingual Language Models for Commonsense Reasoning},
  author = {Bill Yuchen Lin and Seyeon Lee and Xiaoyang Qiao and Xiang Ren},
  booktitle = {Annual Meeting of the Association for Computational Linguistics},
  year = {2021},
  url = {https://aclanthology.org/2021.acl-long.102/}
}
@misc{openbookqa,
  title = {Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering},
  author = {Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal},
  year = {2018},
  eprint = {1809.02789},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/1809.02789}
}
@misc{winogrande,
  title = {WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
  author = {Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
  year = {2019},
  eprint = {1907.10641},
  archiveprefix = {arXiv},
  primaryclass = {cs.AI},
  url = {https://arxiv.org/abs/1907.10641}
}
@misc{piqa,
  title = {PIQA: Reasoning about Physical Commonsense in Natural Language},
  author = {Yonatan Bisk and Rowan Zellers and Ronan Le Bras and Jianfeng Gao and Yejin Choi},
  year = {2019},
  eprint = {1911.11641},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/1911.11641}
}
@misc{squad2,
  title = {Know What You Don't Know: Unanswerable Questions for SQuAD},
  author = {Pranav Rajpurkar and Robin Jia and Percy Liang},
  year = {2018},
  eprint = {1806.03822},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/1806.03822}
}
@misc{drop,
  title = {DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
  author = {Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
  year = {2019},
  eprint = {1903.00161},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/1903.00161}
}
@inproceedings{wikitablequestions,
  title = {Compositional Semantic Parsing on Semi-Structured Tables},
  author = {Panupong Pasupat and Percy Liang},
  booktitle = {Annual Meeting of the Association for Computational Linguistics},
  year = {2015},
  eprint = {1508.00305},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/1508.00305}
}
@misc{triviaqa,
  title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
  author = {Mandar Joshi and Eunsol Choi and Daniel Weld and Luke Zettlemoyer},
  year = {2017},
  eprint = {1705.03551},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/1705.03551}
}
@misc{gsm8k,
  title = {Training Verifiers to Solve Math Word Problems},
  author = {Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Mark Chen and Heewoo Jun and Lukasz Kaiser and Matthias Plappert and Jerry Tworek and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
  year = {2021},
  eprint = {2110.14168},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
  url = {https://arxiv.org/abs/2110.14168}
}