---
title: 'The Synthetic Data Playbook:<br/> Generating Trillions of the Finest Tokens'
subtitle: >-
  How to turn noisy web text into state-of-the-art pretraining data with the
  right prompts, models, and infrastructure
description: 'The Synthetic Data Playbook: Generating Trillions of the Finest Tokens'
authors:
  - name: Joel Niklaus
    url: 'https://huggingface.co/joelniklaus'
    affiliations:
      - 1
  - name: Guilherme Penedo
    url: 'https://huggingface.co/guipenedo'
    affiliations:
      - 1
  - name: Hynek Kydlicek
    url: 'https://huggingface.co/hynky'
    affiliations:
      - 1
  - name: Elie Bakouch
    url: 'https://huggingface.co/eliebak'
    affiliations:
      - 1
  - name: Lewis Tunstall
    url: 'https://huggingface.co/lewtun'
    affiliations:
      - 1
  - name: Ed Beeching
    url: 'https://huggingface.co/edbeeching'
    affiliations:
      - 1
  - name: Thibaud Frere
    url: 'https://huggingface.co/tfrere'
    affiliations:
      - 1
  - name: Colin Raffel
    url: 'https://huggingface.co/craffel'
    affiliations:
      - 1
  - name: Leandro von Werra
    url: 'https://huggingface.co/lvwerra'
    affiliations:
      - 1
  - name: Thomas Wolf
    url: 'https://huggingface.co/thomwolf'
    affiliations:
      - 1
affiliations:
  - name: Hugging Face
    url: 'https://huggingface.co'
published: 'Mar. 8, 2026'
tags:
- research-article-template
- scientific paper
- data visualization
tableOfContentsAutoCollapse: true
seoThumbImage: /thumb.png
pdfProOnly: false
---
import Introduction from "./chapters/1-introduction.mdx";
import Setup from "./chapters/2-setup.mdx";
import Experiments from "./chapters/3-experiments.mdx";
import Analyses from "./chapters/4-analyses.mdx";
import Infrastructure from "./chapters/5-infrastructure.mdx";
import Finephrase from "./chapters/6-finephrase.mdx";
import Conclusions from "./chapters/7-conclusions.mdx";
import Appendix from "./chapters/8-appendix.mdx";
<Introduction />
<Setup />
<Experiments />
<Analyses />
<Infrastructure />
<Finephrase />
<Conclusions />
<Appendix />
|