finephrase / app /src /content /article.mdx
joelniklaus's picture
joelniklaus HF Staff
change publication date
d997d47
---
title: 'The Synthetic Data Playbook:<br/> Generating Trillions of the Finest Tokens'
subtitle: >-
  How to turn noisy web text into state-of-the-art pretraining data with the
  right prompts, models, and infrastructure
description: 'The Synthetic Data Playbook: Generating Trillions of the Finest Tokens'
authors:
  - name: Joel Niklaus
    url: 'https://huggingface.co/joelniklaus'
    affiliations:
      - 1
  - name: Guilherme Penedo
    url: 'https://huggingface.co/guipenedo'
    affiliations:
      - 1
  - name: Hynek Kydlicek
    url: 'https://huggingface.co/hynky'
    affiliations:
      - 1
  - name: Elie Bakouch
    url: 'https://huggingface.co/eliebak'
    affiliations:
      - 1
  - name: Lewis Tunstall
    url: 'https://huggingface.co/lewtun'
    affiliations:
      - 1
  - name: Ed Beeching
    url: 'https://huggingface.co/edbeeching'
    affiliations:
      - 1
  - name: Thibaud Frere
    url: 'https://huggingface.co/tfrere'
    affiliations:
      - 1
  - name: Colin Raffel
    url: 'https://huggingface.co/craffel'
    affiliations:
      - 1
  - name: Leandro von Werra
    url: 'https://huggingface.co/lvwerra'
    affiliations:
      - 1
  - name: Thomas Wolf
    url: 'https://huggingface.co/thomwolf'
    affiliations:
      - 1
affiliations:
  - name: Hugging Face
    url: 'https://huggingface.co'
published: 'Mar. 8, 2026'
tags:
- research-article-template
- scientific paper
- data visualization
tableOfContentsAutoCollapse: true
seoThumbImage: /thumb.png
pdfProOnly: false
---
import Introduction from "./chapters/1-introduction.mdx";
import Setup from "./chapters/2-setup.mdx";
import Experiments from "./chapters/3-experiments.mdx";
import Analyses from "./chapters/4-analyses.mdx";
import Infrastructure from "./chapters/5-infrastructure.mdx";
import Finephrase from "./chapters/6-finephrase.mdx";
import Conclusions from "./chapters/7-conclusions.mdx";
import Appendix from "./chapters/8-appendix.mdx";

<Introduction />
<Setup />
<Experiments />
<Analyses />
<Infrastructure />
<Finephrase />
<Conclusions />
<Appendix />