---
# Article metadata (YAML frontmatter), followed below by the chapter imports.

# NOTE(review): a single line break inside a quoted scalar is folded to a
# space by YAML, so this parses as one line. Use a blank line inside the
# quotes (or a double-quoted "\n") if a hard break is intended in the title.
title: 'The Synthetic Data Playbook:
  Generating Trillions of the Finest Tokens'
# Folded scalar (`>-`): the wrapped lines are joined with spaces and the
# trailing newline is stripped.
subtitle: >-
  How to turn noisy web text into state-of-the-art pretraining data with the
  right prompts, models, and infrastructure
description: 'The Synthetic Data Playbook: Generating Trillions of the Finest Tokens'

# Each author's `affiliations` entries are numbers; presumably 1-based indices
# into the top-level `affiliations` list below — confirm against the template.
authors:
  - name: Joel Niklaus
    url: 'https://huggingface.co/joelniklaus'
    affiliations:
      - 1
  - name: Guilherme Penedo
    url: 'https://huggingface.co/guipenedo'
    affiliations:
      - 1
  - name: Hynek Kydlicek
    url: 'https://huggingface.co/hynky'
    affiliations:
      - 1
  - name: Elie Bakouch
    url: 'https://huggingface.co/eliebak'
    affiliations:
      - 1
  - name: Lewis Tunstall
    url: 'https://huggingface.co/lewtun'
    affiliations:
      - 1
  - name: Ed Beeching
    url: 'https://huggingface.co/edbeeching'
    affiliations:
      - 1
  - name: Thibaud Frere
    url: 'https://huggingface.co/tfrere'
    affiliations:
      - 1
  - name: Colin Raffel
    url: 'https://huggingface.co/craffel'
    affiliations:
      - 1
  - name: Leandro von Werra
    url: 'https://huggingface.co/lvwerra'
    affiliations:
      - 1
  - name: Thomas Wolf
    url: 'https://huggingface.co/thomwolf'
    affiliations:
      - 1

affiliations:
  - name: Hugging Face
    url: 'https://huggingface.co'

# Quoted so the date stays a plain string rather than being type-inferred.
published: 'Mar. 8, 2026'

tags:
  - research-article-template
  - scientific paper
  - data visualization

tableOfContentsAutoCollapse: true
seoThumbImage: /thumb.png
pdfProOnly: false
---

import Introduction from "./chapters/1-introduction.mdx";
import Setup from "./chapters/2-setup.mdx";
import Experiments from "./chapters/3-experiments.mdx";
import Analyses from "./chapters/4-analyses.mdx";
import Infrastructure from "./chapters/5-infrastructure.mdx";
import Finephrase from "./chapters/6-finephrase.mdx";
import Conclusions from "./chapters/7-conclusions.mdx";
import Appendix from "./chapters/8-appendix.mdx";