Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| title: 'The Synthetic Data Playbook:<br/> Generating Trillions of the Finest Tokens' | |
| subtitle: >- | |
| How to turn noisy web text into state-of-the-art pretraining data with the | |
| right prompts, models, and infrastructure | |
| description: 'How to turn noisy web text into state-of-the-art pretraining data with the right prompts, models, and infrastructure' | |
| authors: | |
| - name: Joel Niklaus | |
| url: 'https://huggingface.co/joelniklaus' | |
| affiliations: | |
| - 1 | |
| - name: Guilherme Penedo | |
| url: 'https://huggingface.co/guipenedo' | |
| affiliations: | |
| - 1 | |
| - name: Hynek Kydlicek | |
| url: 'https://huggingface.co/hynky' | |
| affiliations: | |
| - 1 | |
| - name: Elie Bakouch | |
| url: 'https://huggingface.co/eliebak' | |
| affiliations: | |
| - 1 | |
| - name: Lewis Tunstall | |
| url: 'https://huggingface.co/lewtun' | |
| affiliations: | |
| - 1 | |
| - name: Ed Beeching | |
| url: 'https://huggingface.co/edbeeching' | |
| affiliations: | |
| - 1 | |
| - name: Thibaud Frere | |
| url: 'https://huggingface.co/tfrere' | |
| affiliations: | |
| - 1 | |
| - name: Colin Raffel | |
| url: 'https://huggingface.co/craffel' | |
| affiliations: | |
| - 1 | |
| - name: Leandro von Werra | |
| url: 'https://huggingface.co/lvwerra' | |
| affiliations: | |
| - 1 | |
| - name: Thomas Wolf | |
| url: 'https://huggingface.co/thomwolf' | |
| affiliations: | |
| - 1 | |
| affiliations: | |
| - name: Hugging Face | |
| url: 'https://huggingface.co' | |
| published: 'Mar. 8, 2026' | |
| tags: | |
| - research-article-template | |
| - scientific paper | |
| - data visualization | |
| tableOfContentsAutoCollapse: true | |
| seoThumbImage: /thumb.png | |
| pdfProOnly: false | |
| import Introduction from "./chapters/1-introduction.mdx"; | |
| import Setup from "./chapters/2-setup.mdx"; | |
| import Experiments from "./chapters/3-experiments.mdx"; | |
| import Analyses from "./chapters/4-analyses.mdx"; | |
| import Infrastructure from "./chapters/5-infrastructure.mdx"; | |
| import Finephrase from "./chapters/6-finephrase.mdx"; | |
| import Conclusions from "./chapters/7-conclusions.mdx"; | |
| import Appendix from "./chapters/8-appendix.mdx"; | |
| <Introduction /> | |
| <Setup /> | |
| <Experiments /> | |
| <Analyses /> | |
| <Infrastructure /> | |
| <Finephrase /> | |
| <Conclusions /> | |
| <Appendix /> | |