File size: 2,096 Bytes
89a46a6
1ef404f
8662a3e
c3d54a9
 
1ef404f
89a46a6
8662a3e
89a46a6
8662a3e
 
 
 
89a46a6
8662a3e
 
 
 
 
 
 
 
 
 
 
 
 
 
89a46a6
8662a3e
 
 
 
89a46a6
8662a3e
 
 
 
89a46a6
8662a3e
 
 
 
89a46a6
8662a3e
 
 
 
89a46a6
8662a3e
 
 
 
89a46a6
8662a3e
 
 
 
 
 
 
 
d997d47
8662a3e
 
 
 
 
 
 
 
 
 
526247c
 
 
 
 
187aab5
 
 
8662a3e
 
89a46a6
8662a3e
89a46a6
8662a3e
89a46a6
d00ac2c
 
526247c
 
187aab5
 
8662a3e
89a46a6
8662a3e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
---
title: 'The Synthetic Data Playbook:<br/> Generating Trillions of the Finest Tokens'
subtitle: >-
  How to turn noisy web text into state-of-the-art pretraining data with the
  right prompts, models, and infrastructure
description: 'The Synthetic Data Playbook: how to turn noisy web text into state-of-the-art pretraining data with the right prompts, models, and infrastructure'
authors:
  - name: Joel Niklaus

    url: 'https://huggingface.co/joelniklaus'
    affiliations:
      - 1
  - name: Guilherme Penedo

    url: 'https://huggingface.co/guipenedo'
    affiliations:
      - 1
  - name: Hynek Kydlicek

    url: 'https://huggingface.co/hynky'
    affiliations:
      - 1
  - name: Elie Bakouch

    url: 'https://huggingface.co/eliebak'
    affiliations:
      - 1
  - name: Lewis Tunstall

    url: 'https://huggingface.co/lewtun'
    affiliations:
      - 1
  - name: Ed Beeching

    url: 'https://huggingface.co/edbeeching'
    affiliations:
      - 1
  - name: Thibaud Frere

    url: 'https://huggingface.co/tfrere'
    affiliations:
      - 1
  - name: Colin Raffel

    url: 'https://huggingface.co/craffel'
    affiliations:
      - 1
  - name: Leandro von Werra

    url: 'https://huggingface.co/lvwerra'
    affiliations:
      - 1
  - name: Thomas Wolf

    url: 'https://huggingface.co/thomwolf'
    affiliations:
      - 1

affiliations:
  - name: Hugging Face

    url: 'https://huggingface.co'
published: 'Mar. 8, 2026'
tags:
  - research-article-template
  - scientific paper
  - data visualization

tableOfContentsAutoCollapse: true
seoThumbImage: /thumb.png
pdfProOnly: false
---

import Introduction from "./chapters/1-introduction.mdx";
import Setup from "./chapters/2-setup.mdx";
import Experiments from "./chapters/3-experiments.mdx";
import Analyses from "./chapters/4-analyses.mdx";
import Infrastructure from "./chapters/5-infrastructure.mdx";
import Finephrase from "./chapters/6-finephrase.mdx";
import Conclusions from "./chapters/7-conclusions.mdx";
import Appendix from "./chapters/8-appendix.mdx";

<Introduction />

<Setup />

<Experiments />

<Analyses />

<Infrastructure />

<Finephrase />

<Conclusions />

<Appendix />