joelniklaus HF Staff committed on
Commit
b48430a
·
1 Parent(s): 24b72a3

remove old notion import

Browse files
app/src/content/article.mdx CHANGED
@@ -60,7 +60,7 @@ affiliations:
60
  - name: Hugging Face
61
 
62
  url: 'https://huggingface.co'
63
- published: 'Oct. 30, 2025'
64
  tags:
65
  - research-article-template
66
  - scientific paper
 
60
  - name: Hugging Face
61
 
62
  url: 'https://huggingface.co'
63
+ published: 'Feb. 20, 2026'
64
  tags:
65
  - research-article-template
66
  - scientific paper
app/src/content/article/article.mdx DELETED
@@ -1,941 +0,0 @@
1
- ---
2
- title: 'The Synthetic Data Playbook:<br/> Generating Trillions of the Finest Tokens'
3
- subtitle: >-
4
- A practical journey through the challenges, decisions, and messy reality
5
- behind training state-of-the-art language models
6
- description: 'The Synthetic Data Playbook: Generating Trillions of the Finest Tokens'
7
- authors:
8
- - name: Joel Niklaus
9
-
10
- url: 'https://huggingface.co/joelniklaus'
11
- affiliations:
12
- - 1
13
- - name: Guilherme Penedo
14
-
15
- url: 'https://huggingface.co/guipenedo'
16
- affiliations:
17
- - 1
18
- - name: Hynek Kydlicek
19
-
20
- url: 'https://huggingface.co/hynky'
21
- affiliations:
22
- - 1
23
- - name: Elie Bakouch
24
-
25
- url: 'https://huggingface.co/eliebak'
26
- affiliations:
27
- - 1
28
- - name: Lewis Tunstall
29
-
30
- url: 'https://huggingface.co/lewtun'
31
- affiliations:
32
- - 1
33
- - name: Ed Beeching
34
-
35
- url: 'https://huggingface.co/edbeeching'
36
- affiliations:
37
- - 1
38
- - name: Thibaud Frere
39
-
40
- url: 'https://huggingface.co/tfrere'
41
- affiliations:
42
- - 1
43
- - name: Colin Raffel
44
-
45
- url: 'https://huggingface.co/craffel'
46
- affiliations:
47
- - 1
48
- - name: Leandro von Werra
49
-
50
- url: 'https://huggingface.co/lvwerra'
51
- affiliations:
52
- - 1
53
- - name: Thomas Wolf
54
-
55
- url: 'https://huggingface.co/thomwolf'
56
- affiliations:
57
- - 1
58
-
59
- affiliations:
60
- - name: Hugging Face
61
-
62
- url: 'https://huggingface.co'
63
- published: 'Oct. 30, 2025'
64
- tags:
65
- - research-article-template
66
- - scientific paper
67
- - data visualization
68
-
69
- tableOfContentsAutoCollapse: true
70
- seoThumbImage: /thumb.png
71
- pdfProOnly: false
72
- ---
73
-
74
- import Image from '../../components/Image.astro';
75
- import SyDLepVveg_2f81384e_bcac_806f_acb7_fd65c71dd9df from '../assets/image/SyDLepVveg_2f81384e-bcac-806f-acb7-fd65c71dd9df.jpg';
76
- import Screenshot_2026_01_20_at_09_42_21_2f81384e_bcac_80e6_b3fa_d06567e56b15 from '../assets/image/Screenshot_2026-01-20_at_09_42_21_2f81384e-bcac-80e6-b3fa-d06567e56b15.png';
77
- import newplot_2f81384e_bcac_804d_b760_e8611cc0302b from '../assets/image/newplot_2f81384e-bcac-804d-b760-e8611cc0302b.png';
78
- import newplot_2e21384e_bcac_801e_b0b1_da03761b1dc6 from '../assets/image/newplot_2e21384e-bcac-801e-b0b1-da03761b1dc6.png';
79
- import newplot_2c41384e_bcac_8073_9395_cf2d0e901187 from '../assets/image/newplot_2c41384e-bcac-8073-9395-cf2d0e901187.png';
80
- import newplot_2c31384e_bcac_800b_82e8_ff44228f7720 from '../assets/image/newplot_2c31384e-bcac-800b-82e8-ff44228f7720.png';
81
- import newplot_2e11384e_bcac_800a_abc6_d0690da3f955 from '../assets/image/newplot_2e11384e-bcac-800a-abc6-d0690da3f955.png';
82
- import newplot_2e21384e_bcac_80a2_9bac_c543304d926e from '../assets/image/newplot_2e21384e-bcac-80a2-9bac-c543304d926e.png';
83
- import newplot_2e11384e_bcac_80dd_972d_cf77d9c3b004 from '../assets/image/newplot_2e11384e-bcac-80dd-972d-cf77d9c3b004.png';
84
- import newplot_2e11384e_bcac_80a3_a6fa_e8634e0e2206 from '../assets/image/newplot_2e11384e-bcac-80a3-a6fa-e8634e0e2206.png';
85
- import newplot_2e41384e_bcac_80c0_aef5_e71fdbaccd8d from '../assets/image/newplot_2e41384e-bcac-80c0-aef5-e71fdbaccd8d.png';
86
- import newplot_2da1384e_bcac_80d6_a8b9_da80324f8fef from '../assets/image/newplot_2da1384e-bcac-80d6-a8b9-da80324f8fef.png';
87
- import newplot_2e71384e_bcac_8027_ae32_c133627ede4a from '../assets/image/newplot_2e71384e-bcac-8027-ae32-c133627ede4a.png';
88
- import newplot_2f71384e_bcac_80c6_a99e_f52084fc497b from '../assets/image/newplot_2f71384e-bcac-80c6-a99e-f52084fc497b.png';
89
- import newplot_2f71384e_bcac_80d8_9985_e195d39f1e70 from '../assets/image/newplot_2f71384e-bcac-80d8-9985-e195d39f1e70.png';
90
- import newplot_2d21384e_bcac_80ab_a6dd_e31a6c150e61 from '../assets/image/newplot_2d21384e-bcac-80ab-a6dd-e31a6c150e61.png';
91
- import newplot_2e11384e_bcac_80ea_88cc_c971b2816596 from '../assets/image/newplot_2e11384e-bcac-80ea-88cc-c971b2816596.png';
92
- import newplot_2e11384e_bcac_8032_9835_e1407f4d780d from '../assets/image/newplot_2e11384e-bcac-8032-9835-e1407f4d780d.png';
93
- import newplot_2df1384e_bcac_80bc_b93c_ee8e9cfd5529 from '../assets/image/newplot_2df1384e-bcac-80bc-b93c-ee8e9cfd5529.png';
94
- import newplot_2df1384e_bcac_8018_b1f6_da1dcde1f90a from '../assets/image/newplot_2df1384e-bcac-8018-b1f6-da1dcde1f90a.png';
95
- import newplot_2e01384e_bcac_8017_9829_cd0c1db928c6 from '../assets/image/newplot_2e01384e-bcac-8017-9829-cd0c1db928c6.png';
96
- import newplot_2e01384e_bcac_806f_8bf1_f7e5405a2ff9 from '../assets/image/newplot_2e01384e-bcac-806f-8bf1-f7e5405a2ff9.png';
97
- import newplot_2d61384e_bcac_8092_baca_c17346b95734 from '../assets/image/newplot_2d61384e-bcac-8092-baca-c17346b95734.png';
98
- import newplot_2e41384e_bcac_8065_b313_c38a6db4ac31 from '../assets/image/newplot_2e41384e-bcac-8065-b313-c38a6db4ac31.png';
99
- import newplot_2df1384e_bcac_8010_abe7_cf477262b8d6 from '../assets/image/newplot_2df1384e-bcac-8010-abe7-cf477262b8d6.png';
100
- import newplot_2e11384e_bcac_80bc_810d_d13554c628dc from '../assets/image/newplot_2e11384e-bcac-80bc-810d-d13554c628dc.png';
101
- import newplot_2f61384e_bcac_80d9_ab81_d57a228847cf from '../assets/image/newplot_2f61384e-bcac-80d9-ab81-d57a228847cf.png';
102
- import newplot_2ee1384e_bcac_80da_82cd_df97247e2e72 from '../assets/image/newplot_2ee1384e-bcac-80da-82cd-df97247e2e72.png';
103
-
104
-
105
- Notes:
106
-
107
- - Finepdfs-edu outperforms even dclm quite clearly. This would change the whole story completely so it would be quite time consuming to adapt. Therefore we leave it out for now.
108
-
109
- **Leandro Intro:**
110
-
111
- If you read some of the latest LLM papers [add some refs, e.g. Nemotron 3, Arcee's trinity], you may have noticed that synthetic data has become a key component of LLM training data. It is quickly becoming a standard tool for building high-quality training datasets. Looking back, we can see several paradigm shifts in LLM data, especially for pretraining, and synthetic data is the latest step in that progression:
112
-
113
- - After training the first language models on small-ish datasets like Wikipedia, people started scaling up pretraining corpora to include more and more data from the web. We went from training on just a few billion tokens to training on trillions of tokens, covering most of the text on the web.
114
- - When approaching the scaling limits of web data, people started filtering the data more aggressively, and the discussion shifted from volume to quality: first with stronger heuristics, including deduplication pipelines, and eventually with neural classifiers looking for "educational" or "instruction-like" data. Early trainings were conservative about repeating data, but with higher-quality data a few repetitions seemed fine.
115
- - Now that we have mostly exhausted web text data and concluded that quality is more important, synthetic data has become an interesting option to up-cycle the data that the classifiers would have normally excluded and thus increase the volume of data again. The latest LLMs were trained on trillions of synthetic tokens, matching the volume of unaltered data.
116
-
117
- Besides pretraining, synthetic data generation has also become a useful tool for post-training, where it is applied to fill gaps identified in models. A fun anecdote is the SmolLM2 training, where we noticed the model was decent at coding and math but went totally off the rails with small-talk queries (e.g. "How are you?", "Hi", "What's up?"). Synthetically generating a small-talk dataset ([https://huggingface.co/datasets/HuggingFaceTB/everyday-conversations-llama3.1-2k/](https://huggingface.co/datasets/HuggingFaceTB/everyday-conversations-llama3.1-2k/viewer/default/train_sft?row=0)) quickly solved this issue.
118
-
119
- We are seeing a radical shift in compute allocation for model training: while model training itself dominated the compute budget early on, more and more compute is now allocated to curating and improving the training datasets, both for pretraining and post-training.
120
-
121
- However, doing synthetic data generation properly still resembles alchemy these days: Which model should you use? Which prompts work best, and how many do you need? And how do you even scale this effectively?
122
-
123
- In this blog post we take a journey to answer all these questions systematically. We ran XXX experiments and generated YYY tokens in total to find the ideal settings for synthetic data.
124
-
125
- Here's the plan:
126
-
127
- We start with the infrastructure needed for synthetic data generation at scale. This includes some extensions we made to the datatrove library and, crucially, detailed throughput benchmarking of popular models you might want to use for synthetic data generation. This is super important to get the most data for your buck.
128
-
129
- We continue with a walkthrough of the different approaches for synthetic data in pretraining, from explaining what prior work did to the prompts we are experimenting with.
130
-
131
- Finally we present the suite of XXX experiments we ran to figure out best practices regarding what models, prompts and settings work well.
132
-
133
- ## Infrastructure
134
-
135
- When you start generating your first synthetic tokens with LLMs, you will quickly notice that this is an extremely slow and compute-heavy process. Even though we can cache KV values from previous tokens, we still need to do one forward pass for *EVERY* token, and every web document typically has a few thousand tokens. So the first step before we can run any large-scale experiments is to set up some infrastructure to make sure we can generate as efficiently and scalably as possible. Let's have a look at what is involved!
136
-
137
- Synthetic data has emerged as a key ingredient in training modern LLMs, providing a path past the pretraining data wall, where high-quality text (or ["fossil fuel"](https://youtu.be/1yvBqasHLZs?si=YgaaCSfngJNi3OSb&t=475)) becomes scarce and collecting more internet data yields diminishing returns. For example, NVIDIA used LLMs to rephrase around 2 trillion tokens (!) of web text in their [Nemotron-CC dataset](https://huggingface.co/datasets/nvidia/Nemotron-CC-v2), while Z.ai generated 500 billion reasoning tokens to mid-train the [GLM-4.5 series of models](https://huggingface.co/collections/zai-org/glm-45):
138
-
139
- <Image src={SyDLepVveg_2f81384e_bcac_806f_acb7_fd65c71dd9df} alt="Image" />
140
-
141
- Synthetic data also plays a central role in post-training via *distillation*, where a capable model is used to generate high-quality responses for targeted domains such as reasoning, instruction-following, and tool-use. This data can then be used for supervised fine-tuning or preference optimization, allowing developers to shape a model's behaviour with labels that would be expensive or impractical to obtain from humans. For example, [SmolLM3](https://huggingface.co/spaces/HuggingFaceTB/smol-training-playbook) was post-trained almost entirely on a few billion tokens of data generated from models like DeepSeek-R1 and Qwen3.
142
-
143
- So what does it actually take to generate a trillion tokens of synthetic data? Thanks to fast inference engines like [vLLM](https://github.com/vllm-project/vllm) and [SGLang](https://github.com/sgl-project/sglang), it turns out that the bottleneck isn't the generation itself but the *infrastructure* around it: orchestrating thousands of prompts, keeping GPUs saturated, checkpointing outputs, and pushing everything to storage without losing progress when a worker crashes.
144
-
145
- Today we're excited to announce major extensions to [DataTrove](https://github.com/huggingface/datatrove) to manage this entire process. These extensions package the scaffolding we built for our own synthetic data pipelines and make it accessible to anyone who wants to generate high-quality datasets at scale. DataTrove supports both local generation and large-scale distributed runs on Slurm clusters, handling chunking, checkpointing, distributed queueing, and Hugging Face dataset management so you can focus on synthetic data design rather than operational glue.
146
-
147
- In this blog post we show how DataTrove can be used to generate a billion tokens per hour across several model scales, ranging from 1 billion to 1 trillion parameters. Let's dive in!
148
-
149
- ### Generating synthetic data at scale
150
-
151
- At the core of the repo is `examples/inference/benchmark/generate_data.py`, a Typer-powered entry point that orchestrates the full synthetic data loop:
152
-
153
- 1. **Read**: pull any split/config from the Hugging Face Hub via `HuggingFaceDatasetReader`.
154
- 1. **Transform**: stream examples through `InferenceRunner`, which talks to vLLM (or another server type) and handles chunking, retries, and metric logging.
155
- 1. **Write**: push results back to the Hub with `ParquetWriter`.
156
-
157
- Because everything is declared as a DataTrove pipeline, you get deterministic checkpoints, resumability, and clean separation between each stage. No more bespoke scripts glued together with bash. The pipeline can easily scale to launch parallel generation jobs on a Slurm cluster, with automatic aggregation of generation metrics.
158
-
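If you prefer to see that loop as code rather than CLI flags, here is a minimal sketch of a Read, Transform, Write pipeline assembled by hand. The reader, writer, and executor classes are real DataTrove components, but the constructor arguments shown are simplified assumptions and the inference step is only stubbed out; check the repository examples for the authoritative signatures.

```python
# Minimal sketch of a Read -> Transform -> Write pipeline (simplified; see the
# DataTrove examples for exact, up-to-date signatures).
from datatrove.executor import LocalPipelineExecutor
from datatrove.pipeline.readers import HuggingFaceDatasetReader
from datatrove.pipeline.writers import ParquetWriter

pipeline = [
    # Read: stream a split of a Hub dataset as Documents
    HuggingFaceDatasetReader(
        "simplescaling/s1K-1.1",
        dataset_options={"split": "train"},
        text_key="question",
    ),
    # Transform: an InferenceRunner (configured with a rollout_fn and an
    # InferenceConfig) would go here; omitted for brevity.
    # Write: store the results as Parquet shards
    ParquetWriter("output/s1K-rephrased"),
]

# One task per disjoint slice of the dataset; bump `tasks` to parallelize.
LocalPipelineExecutor(pipeline=pipeline, tasks=1).run()
```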
159
- DataTrove provides two modes to generate synthetic data:
160
-
161
- - **Local execution**: Run on a single machine with multiple workers for development and small-scale generation
162
- - **Slurm cluster**: Distribute processing across multiple nodes for large-scale production workloads
163
-
164
- Here's a simple example of local execution on a node with 8 GPUs to generate solutions to math problems from the [s1K dataset](https://huggingface.co/datasets/simplescaling/s1K-1.1) using `Qwen3-4B-Thinking-2507`:
165
-
166
- ```shell
167
- python examples/inference/benchmark/generate_data.py \
168
- --input-dataset-name simplescaling/s1K-1.1 \
169
- --input-dataset-split train \
170
- --prompt-column question \
171
- --model-name-or-path Qwen/Qwen3-4B-Thinking-2507 \
172
- --model-max-context 32768 \
173
- --output-dataset-name s1K-datatrove \
174
- --tasks 1 \
175
- --examples-per-chunk 50 \
176
- --dp 8 \
177
- --tp 1 \
178
- --local-execution
179
- ```
180
- Most arguments are self-explanatory, but let's take a look at the main ones that control the behavior of DataTrove pipelines:
181
-
182
- - `tasks`: controls how many tasks the executor spawns. Each task processes a disjoint slice of the dataset.
183
- - `examples-per-chunk`: controls how many prompts are batched before checkpointing.
184
- - `dp`: controls the data parallel size.
185
- - `tp`: controls the tensor parallel size.
186
-
187
- Bigger chunks improve throughput but increase the work lost if you need to resume, so tune `examples-per-chunk` accordingly while using `tasks` mainly to spread the workload across independent jobs.
188
-
189
- Local execution is handy for small-scale datasets or models, but what if you want to generate data from a trillion-parameter model like Kimi K2 😱? For that we use the built-in Slurm executor to scale the job across multiple nodes and tasks:
190
-
191
- ```shell
192
- python examples/inference/benchmark/generate_data.py \
193
- --input-dataset-name simplescaling/s1K-1.1 \
194
- --prompt-column question \
195
- --model-name-or-path moonshotai/Kimi-K2-Instruct \
196
- --model-max-context 1024 \
197
- --max-tokens 8 \
198
- --trust-remote-code \
199
- --output-dataset-name s1K-1.1-benchmark-Kimi-K2-Instruct \
200
- --examples-per-chunk 10 \
201
- --tasks 1 \
202
- --workers 1 \
203
- --max-examples 100 \
204
- --nodes-per-task 2 \
205
- --tp 8 \
206
- --pp 2 \
207
- --optimization-level 0 \
208
- --max-num-seqs=16
209
- ```
210
- ### Custom Rollouts: Flexible LLM Inference Orchestration
211
-
212
- At the heart of our inference system lies a powerful abstraction: the **rollout function**. A rollout is simply an async callable that receives a `Document`, a `generate(payload)` callback, and any extra resources you've configured. Inside the rollout, you have complete freedom to orchestrate one or many `generate` calls: sequentially, in parallel, or any combination.
213
-
214
- This design separates *what* you want to generate from *how* the inference engine batches and executes requests. You focus on your application logic; the runner handles efficient GPU utilization.
215
-
216
- #### Example 1: Simple Single-Request Rollout
217
-
218
- The simplest rollout sends one request per document and returns the result directly:
219
-
220
- ```python
221
- async def simple_rollout(
222
- document: Document,
223
- generate: Callable[[dict[str, Any]], Awaitable[InferenceResult]],
224
- ) -> InferenceResult:
225
- payload = {
226
- "messages": [{"role": "user", "content": document.text}],
227
- "max_tokens": 2048,
228
- }
229
- return await generate(payload)
230
-
231
- ```
232
- The returned `InferenceResult` is automatically stored under `document.metadata["rollout_results"]`.
233
-
234
- **Use case: Rephrasing web documents for LLM training.** You're building a training corpus by rephrasing web documents into cleaner, more consistent prose. Most documents fit within context, outputs stay under 4k tokens, and you want minimal overhead—one request per document, no chunking logic, no coordination. The rollout wraps each document in a rephrasing prompt and returns the rewritten text directly.
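Concretely, that use case is just the simple pattern above with a rephrasing instruction wrapped around the document text. The prompt wording below is purely illustrative, not the template used for any dataset mentioned in this post.

```python
async def rephrase_rollout(
    document: Document,
    generate: Callable[[dict[str, Any]], Awaitable[InferenceResult]],
) -> InferenceResult:
    # Illustrative prompt only; a real pipeline would use a carefully tuned template.
    prompt = (
        "Rewrite the following web page as clean, well-structured prose. "
        "Preserve all facts and remove boilerplate:\n\n" + document.text
    )
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 4096,  # outputs for this use case stay well under 4k tokens
    }
    return await generate(payload)
```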
235
-
236
- #### Example 2: Chunked Rollout for Long Documents
237
-
238
- When documents exceed your model's context window, you can split them into chunks and stitch generations together:
239
-
240
- ```python
241
- async def chunked_rollout(
242
- document: Document,
243
- generate: Callable[[dict[str, Any]], Awaitable[InferenceResult]],
244
- ) -> str:
245
- max_chars = 4000
246
- text = document.text
247
- chunks = [text[i : i + max_chars] for i in range(0, len(text), max_chars)]
248
-
249
- generations = []
250
- for chunk in chunks:
251
- payload = {
252
- "messages": [
253
- {"role": "user", "content": f"Rewrite formally:\\n\\n{chunk}"},
254
- {"role": "assistant", "content": generations[-1] if generations else ""},
255
- ],
256
- "continue_final_message": True,
257
- }
258
- result = await generate(payload)
259
- generations.append(result.text)
260
-
261
- return "\\n".join(generations)
262
-
263
- ```
264
- Each chunk builds on the previous generation, allowing the model to maintain coherence across the entire document.
265
-
266
- **Use case: Translating long web documents.** You're translating multilingual web content into English at massive scale. Many documents exceed context limits, so you split them into 512-token chunks and translate with a sliding window—each chunk is translated while keeping the previous (already translated) chunk in the prompt for context. This maintains coherence across chunk boundaries. The [FineTranslations](https://huggingface.co/datasets/HuggingFaceFW/finetranslations) project used this approach to translate over 1 trillion tokens across 500+ languages.
267
-
268
- #### Example 3: CPU-Heavy Preprocessing with Process Pools
269
-
270
- For rollouts that require expensive CPU work (parsing, image processing, etc.), you can offload preprocessing to a process pool via `shared_context` :
271
-
272
- ```python
273
- def cpu_heavy_build_payload(doc: Document, page: int) -> dict[str, Any]:
274
- # Expensive preprocessing here (e.g., PDF parsing, OCR)
275
- return {"messages": [{"role": "user", "content": f"[page {page}] {doc.text}"}]}
276
-
277
- async def heavy_cpu_rollout(
278
- document: Document,
279
- generate: Callable[[dict[str, Any]], Awaitable[InferenceResult]],
280
- process_pool: ProcessPoolExecutor, # Injected via shared_context
281
- ) -> list[InferenceResult]:
282
- loop = asyncio.get_running_loop()
283
-
284
- async def process_page(page: int) -> InferenceResult:
285
- payload = await loop.run_in_executor(process_pool, cpu_heavy_build_payload, document, page)
286
- return await generate(payload)
287
-
288
- return await asyncio.gather(*[process_page(p) for p in [1, 2]])
289
-
290
- ```
291
- Configure the shared context when creating the runner:
292
-
293
- ```python
294
- @contextmanager
295
- def process_pool_context(max_workers: int = 100):
296
- with ProcessPoolExecutor(max_workers=max_workers) as pool:
297
- yield {"process_pool": pool}
298
-
299
- InferenceRunner(
300
- rollout_fn=heavy_cpu_rollout,
301
- shared_context=partial(process_pool_context, max_workers=100),
302
- ...
303
- )
304
-
305
- ```
306
- The pool is initialized lazily and shared across all rollout invocations, keeping CPU-bound work off the async event loop.
307
-
308
- **Use case: PDF document understanding.** You're building a pipeline to extract structured information from scanned PDFs. Each document requires CPU-intensive OCR preprocessing before the text can be sent to the LLM for extraction. By offloading the OCR to a process pool, you keep the GPU fed with generation requests while workers handle the parsing in parallel.
309
-
310
- #### Running Multiple Rollouts per Document
311
-
312
- Need multiple samples per document? Set `rollouts_per_document` in your `InferenceConfig`. All successful outputs are collected under `document.metadata["rollout_results"]` as a list.
313
-
314
- **Use case: Best-of-N sampling for code generation.** When generating code solutions, you want multiple attempts per problem to increase the chance of a correct answer. Set `rollouts_per_document=10` and later filter for solutions that pass your test suite.
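As a sketch, the configuration and the later filtering step could look roughly like this. Only `rollouts_per_document` and the `rollout_results` metadata key come from the description above; the other `InferenceConfig` fields and the `passes_tests` helper are placeholders you would supply yourself.

```python
# Hypothetical sketch: generate 10 candidate solutions per problem, then keep
# the ones that pass a test suite. `passes_tests` is a user-supplied placeholder.
config = InferenceConfig(
    model_name_or_path="Qwen/Qwen3-4B-Thinking-2507",  # assumed field name
    rollouts_per_document=10,  # N independent samples per document
)

def filter_best_of_n(document) -> list[str]:
    candidates = document.metadata["rollout_results"]  # list of successful outputs
    return [r.text for r in candidates if passes_tests(r.text)]
```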
315
-
316
- ### Automatic HF Upload
317
-
318
- We want you to be able to just press a button, let the GPUs go brrrr, and come back to a finished dataset. Therefore, DataTrove continuously uploads data to your specified Hugging Face dataset repo whenever a chunk is finished. At the end, the `InferenceDatasetCardGenerator` pipeline step checks the logs directory, collects information about the throughput, and uploads a dataset card to document your new synthetic dataset (see an [example](https://huggingface.co/datasets/joelniklaus/s1K-1.1-datatrove) below).
319
-
320
- <Image src={Screenshot_2026_01_20_at_09_42_21_2f81384e_bcac_80e6_b3fa_d06567e56b15} alt="Image" />
321
-
322
- ### Progress Monitoring
323
-
324
- For long-running inference jobs, you can use `InferenceProgressMonitor` to periodically update a HuggingFace dataset card with a progress bar and ETA. After inference completes, `InferenceDatasetCardGenerator` creates a final dataset card with statistics.
325
-
326
- ```python
327
- from datatrove.pipeline.inference import InferenceDatasetCardParams, InferenceProgressMonitor, InferenceDatasetCardGenerator
328
-
329
- params = InferenceDatasetCardParams(
330
- output_repo_id="your-username/output-dataset",
331
- input_dataset_name="simplescaling/s1K-1.1",
332
- input_dataset_split="train",
333
- model_name="Qwen/Qwen3-0.6B",
334
- # ... other params
335
- )
336
-
337
- # Monitor pipeline (runs in parallel with inference on Slurm)
338
- monitor_pipeline = [InferenceProgressMonitor(params=params, update_interval=3600)]
339
-
340
- # Final card generation (runs after inference completes)
341
- datacard_pipeline = [InferenceDatasetCardGenerator(params=params)]
342
- ```
343
- ### Scaling Throughput from 1B to 1T parameters
344
-
345
- For synthetic data generation, we may run language model inference for millions of GPU hours. Finding a configuration that maximizes throughput is critical, as it could accelerate generation by days and save thousands of dollars. In this section, we describe our experiments to identify optimal parameters for a selection of popular models. We run the experiments once for a pre-training dataset and once for a post-training example.
346
-
347
- The FlashAttention vLLM backend is more than 50% faster than FlashInfer across setups.
348
-
349
- #### Pre-training
350
-
351
- For pre-training, we know that 1B models are enough, so we are less interested in testing different sizes and more in optimizing the setup for the specific dataset-prompt-model combinations we know work well.
352
-
353
- We also experimented with different values of `gpu-memory-utilization` , but they didn't make a substantial difference so we don't report them here and exclude them from the main benchmarking code for simplicity.
354
-
355
- Similar to [prior experiments](https://github.com/vllm-project/vllm/issues/6868), we also did not find significant throughput differences with non-standard block sizes (not 16).
356
-
357
- We experimented with fp8 kv-cache quantization and 4-bit model quantization using BitsandBytes but did not see consistent throughput improvements (e.g., positive improvements for Qwen3-0.6B but not for Gemma-3-270M for fp8-kv-cache). In the case of SmolLM2-135M-Instruct, we saw the model degrading into many repetitions for both settings.
358
-
359
- #### Post-training
360
-
361
- For post-training we are interested in scaling up the model size since we expect much higher-quality data from larger models. It is worth generating from larger models since we need much less data than in pre-training.
362
-
363
- We used the `simplescaling/s1K-1.1` dataset as input. Since it rarely contains more than 500 input tokens, we have ample room for generating thinking traces and output. We set the `model-max-context` to 2048 and the `max-tokens` to 1024 to control sequence length. At larger sequence lengths, thinking models generate an order of magnitude more completion tokens compared to instruct models, which skews throughput metrics. We tested four compact models ({'<'}4B params), four medium (10B-100B total params) MoEs, two large (100B-500B total params) MoEs, and one enormous ({'>'}500B total params) MoE using different TP and PP configurations. We measured output tokens per second per TP (Output TPS) from vLLM server logs, then calculated the number of GPUs required to generate 1B tokens per hour. We conducted our experiments using 80GB NVIDIA H100 GPUs with default vLLM parameters (e.g., `max-concurrent-requests` 500, `gpu-memory-utilization` 0.9).
364
-
365
- Below we present our results scaling from 1B to 1T parameters.
366
-
367
- | Model | Size | TP | PP | Output TPS | GPUs/1B/h |
368
- | --- | --- | --- | --- | --- | --- |
369
- | Gemma-3-1B | Compact | 1 | 1 | 16616 | 17 |
370
- | Qwen3-1.7B | Compact | 1 | 1 | 15397 | 18 |
371
- | Gemma-3-4B | Compact | 1 | 1 | 10429 | 51 |
372
- | Qwen3-4B-Thinking-2507 | Compact | 1 | 1 | 12515 | 39 |
373
- | GPT-OSS-20B (21B-A4B) | Medium | 1 | 1 | 12194 | 40 |
374
- | Nemotron-3-Nano-30B-A3B BF16 | Medium | 1 | 1 | 5490 | 51 |
375
- | Nemotron-3-Nano-30B-A3B FP8 | Medium | 1 | 1 | 9274 | 30 |
376
- | Qwen3-30B-A3B-Thinking-2507 | Medium | 2 | 1 | 6681 | 77 |
377
- | Qwen3-Next-80B-A3B-Thinking | Medium | 4 | 1 | 2910 | 273 |
378
- | GPT-OSS-120B (117B-A5B) | Large | 2 | 1 | 5187 | 103 |
379
- | Qwen3-235B-A22B-Thinking-2507 | Large | 8 | 1 | 732 | 1161 |
380
- | Kimi-K2-Instruct (1T-A32B) | Enormous | 8 | 2 | 26 | 10645 |
381
-
382
- We consistently achieve the highest throughput with the lowest tensor parallelism (TP) and pipeline parallelism (PP) that does not run out of memory (OOM). We hypothesize this occurs because, except for the largest Qwen model and Kimi-K2-Instruct, no model has more than 6B active parameters.
383
-
384
- Interestingly, model family appears to significantly impact performance. At the same 4B scale, Qwen3 achieves nearly 20% higher throughput than Gemma-3. GPT-OSS-20B nearly matches Qwen3-4B's throughput despite having 5x the total parameters (21B vs 4B) and slightly fewer active parameters (3.6B vs 4B). Even more notably, GPT-OSS-120B nearly doubles the throughput of Qwen3-Next-80B-A3B despite having both more total and more active parameters. This performance difference, along with the fact that GPT-OSS-120B runs on TP2 while Qwen3-Next-80B-A3B OOMs, is likely attributable to GPT-OSS being loaded in weight-quantized mode (mxfp4) by default, compared to bf16 for the other models.
385
-
386
- We also explored what would be required to generate 1T tokens in a day. We believe GPT-OSS-120B offers a strong balance between quality and throughput. Generating 1T tokens in a day would require 279 nodes, resulting in a cost of approximately $161K at roughly $3 per H100 hour. For a slightly lower quality option using GPT-OSS-20B, we would need 119 nodes at a cost of $69K.
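The cost figures above follow directly from the node counts; a quick back-of-the-envelope check, assuming 8 H100s per node and roughly $3 per GPU-hour as stated:

```python
def daily_cost(nodes: int, gpus_per_node: int = 8, usd_per_gpu_hour: float = 3.0) -> float:
    """Cost of running `nodes` full nodes for 24 hours."""
    return nodes * gpus_per_node * 24 * usd_per_gpu_hour

print(daily_cost(279))  # GPT-OSS-120B: 160,704 USD, i.e. roughly $161K for 1T tokens/day
print(daily_cost(119))  # GPT-OSS-20B:   68,544 USD, i.e. roughly $69K
```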
387
-
388
- You can find the benchmarking code [here](https://github.com/huggingface/datatrove/tree/main/examples/inference/benchmark) together with the [yaml config](https://github.com/huggingface/datatrove/blob/main/examples/inference/benchmark/sample_benchmark_config.yaml).
389
-
390
- We experimented with speculative decoding using the [ngram method](https://docs.vllm.ai/en/stable/features/spec_decode.html?h=specula#speculating-by-matching-n-grams-in-the-prompt) but found no consistent speedups. We hypothesize this approach is unhelpful because the input in our benchmarking dataset is relatively short compared to the thinking tokens and output. We expect greater gains for tasks involving more copying from the input.
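For reference, enabling n-gram speculative decoding in vLLM's offline API looks roughly like the sketch below. Parameter names have changed across vLLM releases (older versions used separate flags such as `speculative_model="[ngram]"`), so treat this as an illustration and check the linked docs for your version.

```python
from vllm import LLM

# Sketch of n-gram prompt-lookup speculative decoding; values are illustrative
# and field names may differ between vLLM versions.
llm = LLM(
    model="Qwen/Qwen3-4B-Thinking-2507",
    speculative_config={
        "method": "ngram",
        "num_speculative_tokens": 7,  # draft length in the range we experimented with
        "prompt_lookup_max": 4,       # longest n-gram matched in the prompt
    },
)
```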
391
-
392
- TODO: Optimize this section for pretraining: use that prompt and seq length configuration but mention in the end that for post training we can easily rerun this experiment with different prompts and datasets
393
-
394
- ## Synthetic Data for Pretraining
395
-
396
- Language model development has encountered a fundamental data wall as high-quality web data becomes increasingly scarce, pushing researchers toward synthetic data generation as a complement to traditional internet-scraped datasets. Recent work has demonstrated that synthetic data can dramatically improve model quality: approaches like WRAP, Nemotron-CC, and BeyondWeb show that rephrasing existing web content into higher-quality formats can outperform training on raw data alone. Yet the field lacks both a clear conceptual framework for what "synthetic data" and "rephrasing" actually mean and systematic investigations of the factors that determine their effectiveness.
397
-
398
- **What is Rephrasing?** At its core, rephrasing involves transforming existing documents through language models to produce variants that preserve semantic content while modifying presentation, structure, or style. However, this simple definition masks considerable complexity. Rephrasing exists along a spectrum from conservative transformations (style transfer, format conversion) to more aggressive interventions (content expansion, pedagogical restructuring, knowledge extraction). A document might be reformatted as a tutorial with worked examples, restructured as FAQ pairs, expanded with explanatory commentary, condensed into knowledge lists, or rewritten in Wikipedia style. Each transformation targets different downstream objectives: tutorials may enhance step-by-step reasoning, FAQs might improve question-answering capabilities, and mathematical reformulations could strengthen quantitative skills. Understanding which transformations work, when they work, and why they work remains an open challenge.
399
-
400
- **Three Critical Dimensions.** We argue that synthetic data generation must be understood along three fundamental axes: (1) **Rephrasing strategies** : the specific prompts, formats, and transformation types that convert source documents into synthetic variants; (2) **Generator model characteristics** : the size, architecture, training, and capabilities of models performing the rephrasing; and (3) **Source data quality** : the characteristics of seed documents being transformed, from high-quality filtered corpora to noisy web text. Prior work has explored these dimensions in isolation, but their interactions remain poorly understood. Does the optimal rephrasing strategy depend on source quality? Can small models effectively rephrase high-quality data, or do we need larger models to salvage value from noisy documents? When does aggressive transformation help versus hurt?
401
-
402
- **Our Investigation.** FinePhrase addresses these questions through systematic experimentation across all three axes. We investigate several guiding research questions:
403
-
404
- 1. **Which rephrasing strategies are most effective?** We compare prompts from prior work (REWIRE's guided rewriting, Nemotron's diverse QA pairs and knowledge extraction) against novel formats (tutorials, FAQs, tables, mathematical reformulations), identifying which transformations consistently improve downstream performance.
405
- 1. **How do generator model properties affect synthetic data quality?** We examine model family (Gemma, Llama, Qwen, Granite, Falcon, SmolLM), model generation (Qwen 1.5 through 3), and model scale (270M to 27B parameters), investigating whether rephrasing requires large, capable models or whether smaller models suffice.
406
- 1. **When does source data quality matter?** We rephrase both high-quality (FineWeb-Edu-HQ, DCLM) and low-quality (FineWeb-Edu-LQ, Cosmopedia) sources, testing whether rephrasing can recover value from noisy documents or whether it amplifies existing quality differences.
407
- 1. **How do synthetic and original data interact?** We explore training regimes that use synthetic data alone, mix synthetic with original data, or decay from one to the other, investigating whether synthetic data complements or replaces traditional web corpora.
408
-
409
- Our experiments reveal that the mix-in dataset (what synthetic data is combined with) often matters more than the source dataset (what gets rephrased), that 1B models generally suffice for effective rephrasing across quality tiers, and that certain prompts (math, table, FAQ, tutorial) consistently outperform both high-quality web data and prior synthetic approaches. We identify SmolLM2's surprising effectiveness as likely stemming from explicit rewriting tasks in its training data, demonstrate that model generation matters more than model family, and show that mixing multiple diverse synthetic datasets can reduce or eliminate the need for original data mixing.
410
-
411
- By systematically exploring these dimensions and documenting the design decisions, failure modes, and practical considerations necessary for generating high-quality synthetic pretraining data at scale, we establish a shared vocabulary and conceptual framework for synthetic data generation. Our goal is to clarify the fundamental principles that determine when, how, and why synthetic data works, provide the detailed recipes the community needs to reproduce and extend this work, and accelerate research by moving beyond ad-hoc approaches toward principled understanding of data rephrasing.
412
-
413
- ## Rephrasing Setup
414
-
415
- We conduct large-scale document rephrasing experiments using instruction-tuned language models ranging from 270M to 27B parameters (primarily Gemma-3 variants) on filtered web corpora including FineWeb-Edu and DCLM-baseline-1.0, processing approximately 20 billion input tokens per quality tier. Our pipeline processes documents through customizable prompt templates that transform raw web text into various structured formats (articles, tutorials, FAQs, discussions, commentaries) as well as distillation and continuation tasks inspired by prior work, yielding between ~2 billion and 20 billion output tokens depending on the prompting strategy. We employ vLLM for efficient inference with tensor parallelism, chunked prefill, and speculative decoding (n-gram prompt lookup with ~7 draft tokens achieving acceptance rates around 0.7). Each rephrased document is evaluated with both the FineWeb-Edu classifier and DCLM quality scorer, tracking token counts, quality score deltas, and comprehensive metadata including thinking traces when available, enabling systematic analysis of how different rephrasing strategies affect both output quality and token efficiency across model scales and data quality tiers. The experiments run distributed across 100 parallel tasks on a SLURM cluster with checkpointing support, targeting 10B tokens of synthetic data for downstream data ablations.
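To make the prompt-template part of this setup concrete, here is the general shape of such a template. The wording below is an invented placeholder for illustration; the actual tutorial, faq, math, and table prompts are listed in the Appendix.

```python
# Illustrative template shape only; the real prompts are given in the Appendix.
TUTORIAL_TEMPLATE = (
    "Below is an excerpt from a web page. Rewrite it as a step-by-step tutorial "
    "with a short introduction, numbered steps, and a worked example. "
    "Keep every fact from the source.\n\n"
    "Web page:\n{document}\n\nTutorial:"
)

def build_payload(document_text: str, max_tokens: int = 2048) -> dict:
    """Turn one source document into a chat payload for the rephrasing model."""
    return {
        "messages": [{"role": "user", "content": TUTORIAL_TEMPLATE.format(document=document_text)}],
        "max_tokens": max_tokens,
    }
```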
416
-
417
- ## Source Datasets
418
-
419
- TODO: in the blog, we could make this into a widget where you have a tab for each dataset and then if you click on the tab you can see the description (maybe even some samples).
420
-
421
- We compare against several baseline datasets for pretraining and data rephrasing:
422
-
423
- [ **DCLM (DataComp-LM)** ](https://arxiv.org/abs/2406.11794) **:** A standardized benchmark providing a 240T token corpus from Common Crawl with model-based filtering as a key curation strategy. DCLM-Baseline enables training a 7B parameter model to 64% accuracy on MMLU with 2.6T tokens.
424
-
425
- [ **Fineweb-Edu-HQ and Fineweb-Edu-LQ** ](https://arxiv.org/html/2406.17557v1) **:** Subsets of FineWeb-Edu, a 1.3T token educational dataset filtered using Llama-3-70B-Instruct scoring samples on educational quality from 0 to 5. We use HQ (scores 4 or 5) and LQ (scores 0 or 1) to investigate the impact of seed data quality on rephrasing.
426
-
427
- [ **Ultra-Fineweb-1.4** ](https://arxiv.org/abs/2505.05427) **:** A 1T English token and 120B Chinese token dataset created by applying efficient verification-based filtering to FineWeb. Uses a lightweight fastText classifier and optimized seed data selection to improve data quality.
428
-
429
- [ **Nemotron-HQ-Synth** ](https://arxiv.org/abs/2412.02595) **:** Part of Nemotron-CC, a 6.3T token dataset using classifier ensembling and synthetic data rephrasing. The High-Quality-Synthetic subset contains synthetically rephrased data using Qwen3-30B-A3B.
430
-
431
- [ **Cosmopedia** ](https://huggingface.co/blog/cosmopedia) **:** A 30 million file synthetic dataset with 25 billion tokens generated by Mixtral-8x7B-Instruct, containing textbooks, blog posts, and stories across diverse topics. Created through careful prompt engineering conditioning on curated educational sources and web data clusters.
432
-
433
- [ **SYNTH** ](https://pleias.fr/blog/blogsynth-the-new-data-frontier) **:** A fully synthetic dataset built from 50,000 Wikipedia articles expanded into problems and resolution paths including math exercises, creative writing, and information extraction. Uses multiple specialized synthetic pipelines with fine-tuned models and grounding in encyclopedic content.
434
-
435
- [ **REWIRE** ](https://arxiv.org/abs/2506.04689) **:** A method for recycling the web with guided rewrite that enriches low-quality documents discarded by filtering pipelines to make them useful for training. Experiments show that mixing high-quality raw texts with rewritten texts leads to 1.0, 1.3, and 2.5 percentage point improvements at 1B, 3B, and 7B scales respectively across 22 tasks.
436
-
437
- We use source data and seed data interchangeably.
438
-
439
- TODO: put this where we first mention source/seed data
440
-
441
- ## Ablation Setup
442
-
443
- For our ablations we train a 1.2B parameter language model using a Qwen2-style architecture (see more details in the Appendix). We evaluate our model on a diverse set of 12 benchmark tasks spanning multiple reasoning and knowledge domains. For reasoning capabilities, we assess performance on ARC, HellaSwag, MMLU Redux, Cross-lingual CommonsenseQA (XCSQA), OpenBookQA, Winogrande, and PIQA. Question answering capabilities are evaluated using SQuAD v2, DROP, WikiTableQuestions, and TriviaQA. Mathematical reasoning is assessed via GSM8K. Given that our model is relatively small and trained on only 20 billion tokens, we employ the continuation format (CF) for most tasks rather than the standard multiple-choice format. The CF setup, which frames evaluation as a next-token prediction task, has been shown to provide more reliable assessments for smaller or less extensively trained models that may struggle with complex instruction following or multiple-choice formatting conventions. All evaluations are conducted using 3-shot prompting with a single seed to ensure reproducibility.
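As a rough illustration of the continuation-format (CF) setup: each answer option is scored by the likelihood the model assigns to it as a continuation of the prompt, and the highest-scoring option is taken as the prediction. The snippet below is a simplified sketch using Hugging Face transformers with a placeholder model, not the evaluation harness we actually used, and it assumes the prompt tokenization is a prefix of the full-sequence tokenization.

```python
# Simplified sketch of continuation-format (CF) scoring with transformers;
# our actual evaluations use a dedicated harness with 3-shot prompts.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")  # placeholder model
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")

def continuation_logprob(prompt: str, continuation: str) -> float:
    """Sum of log-probabilities of the continuation tokens given the prompt."""
    prompt_ids = tok(prompt, return_tensors="pt").input_ids
    full_ids = tok(prompt + continuation, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = model(full_ids).logits
    log_probs = torch.log_softmax(logits[:, :-1], dim=-1)  # position i predicts token i+1
    cont_start = prompt_ids.shape[1]
    targets = full_ids[:, cont_start:]
    scores = log_probs[:, cont_start - 1 :, :].gather(-1, targets.unsqueeze(-1))
    return scores.sum().item()

def predict(prompt: str, choices: list[str]) -> str:
    return max(choices, key=lambda c: continuation_logprob(prompt, " " + c))
```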
444
-
445
- ### Naming
446
-
447
- The experiment names follow a systematic structure that encodes training regime, data sources, and synthetic data characteristics:
448
-
449
- #### **Training Regimes**
450
-
451
- - **Baseline** : Single dataset (e.g., fw_edu_hq, cosmopedia)
452
- - **Mix** : mix-{'{'}dataset1{'}'}-{'{'}dataset2{'}'}-... - uniform mixture of datasets (used for both from-scratch training and as decay targets)
453
- - **Decay** : {'{'}base{'}'}-decay-{'{'}target{'}'} - starts with base dataset, exponentially decays to target dataset or mixture
454
-
455
- #### **Dataset Components**
456
-
457
- - **Base corpora** : fw_edu (FineWeb-Edu), dclm, cosmopedia
458
- - **Synthetic types** : continue, rephrase, summarize, tutorial, article, commentary, discussion, faq, math, table (see the complete prompts in the Appendix)
459
- - **Model & Scale** : 270m, 1b, 4b, 12b, 27b, qwen3-1.7b, etc. (model name and model size in billions of parameters used for generation; no model name corresponds to Gemma-3).
460
- - **Source** : hq (FineWeb-Edu-HQ), lq (FineWeb-Edu-LQ), dclm (DCLM dataset), cosmopedia
461
-
462
- #### **Examples**
463
-
464
- - mix-fw_edu_hq-continue_1b_hq - from-scratch training on a mixture of FineWeb-Edu-HQ and continuations generated by a 1B parameter model from FineWeb-Edu-HQ
465
- - fw_edu_lq-decay-mix-fw_edu_hq-tutorial_1b_hq - decay from FineWeb-Edu-LQ to a mixture of FineWeb-Edu-HQ + tutorial-formatted synthetic data generated by a 1B model from FineWeb-Edu-HQ
466
-
467
- TODO: Radically simplify the naming in each ablation section and also to clearly separate baselines:
468
-
469
- 1. baselines: have a different linestyle (e.g. dashed and with less alpha)
470
- 1. experiment names: i think within each ablation section it's clear what you are doing (ie you are only changing 1 or 2 of the components listed here), so the experiments could have a short natural language + a tooltip with the full information.
471
-
472
- I am pretty sure you can use this as a prompt for claude to help rename things.
473
-
474
- ## A Note on Synthetic Data and Model Collapse
475
-
476
- A common misconception about model collapse is that any use of synthetic data in training will inevitably degrade model performance, leading many to view AI-generated training data with blanket suspicion. This misunderstanding stems from [influential ](https://www.nature.com/articles/s41586-024-07566-y)[research](https://www.ft.com/content/ae507468-7f5b-440b-8512-aea81c6bf4a5) that demonstrated severe degradation when models were trained exclusively and iteratively on outputs from previous model generations, without any injection of new information or human-generated content. In practice, however, the AI research community doesn't train models this way. Real-world applications of synthetic data typically involve mixing it with genuine human data, using diverse reference materials in prompts to ensure variety, and employing synthetic data strategically for specific purposes like domain adaptation or augmenting limited datasets rather than replacing entire training corpora. The key distinction is that model collapse occurs specifically when models are trained in a closed loop on their own outputs without introducing new signal or information, a scenario that practitioners actively avoid. The concern should be focused on frontier models generating training data for other frontier models in isolation, not on the thoughtful integration of synthetic data that introduces new knowledge or perspectives into the training process. In [Fineweb v1](https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1) we also did not find degradation from naturally occurring data on the web likely created by ChatGPT.
477
-
478
- ## Experiments
479
-
480
- TODO: experiments: go one by one as you currently have and write down the initial hypothesis, the results, if this was surprising or not, if it prompted another experiment (which then leads to a nice flow between experiments)
481
-
482
- TODO: Benchmarking: plot compare against default, mention how expensive one sweep is, automatically produce plot from baseline to be optimized and spit out the result
483
-
484
- TODO: With Thibaud look how to visualize the 1T tokens of ablation data (similar to fineweb visualization)
485
-
486
- TODO: rename the experiment names so it is easier to understand
487
-
488
- TODO: think about what dataset to build and release as artifact: do more rephrasing with smollm2
489
-
490
- TODO: larger ablation with 100b tokens (how little synthetic data can we get away with)
491
-
492
- TODO: more conversational style (less condensed than papers)
493
-
494
- TODO: read recent long blog posts as inspiration
495
-
496
- TODO: add a visualization for the infrastructure
497
-
498
- TODO: add a plot for the table with the benchmark results
499
-
500
- TODO: Analyze if certain models are more verbose than others (how many tokens did they produce per prompt?)
501
-
502
- TODO: Add appendix section of weird unexplainable results?
503
-
504
- ### FinePhrase vs Synthetic Baselines
505
-
506
- We see that FinePhrase clearly outperforms the synthetic baselines.
507
-
508
- TODO: call the best one FinePhrase in the blog post and just show that one
509
-
510
- <Image src={newplot_2f81384e_bcac_804d_b760_e8611cc0302b} alt="Image" />
511
-
512
- ### Baselines
513
-
514
- DCLM, REWIRE and Nemotron-HQ-Synth are the strongest baselines in our setup by a significant margin. For this reason, we use DCLM as the reference baseline in the following experiments.
515
-
516
- <Image src={newplot_2e21384e_bcac_801e_b0b1_da03761b1dc6} alt="Image" />
517
-
518
- #### Dissecting the synthetic baselines
519
-
520
- Using gemma-3-1b, the prompt from REWIRE (guided_rewrite_original) is on par with DCLM in our setup. Nemotron-HQ-Synth was created using five prompts: diverse_qa_pairs, extract_knowledge, distil, wikipedia_style_rephrasing and knowledge_list. The only prompt that really works well in our setup is diverse_qa_pairs, mainly due to very strong performance on SQuAD. We used fineweb-edu-hq as the source dataset for all prompts.
521
-
522
- <Image src={newplot_2c41384e_bcac_8073_9395_cf2d0e901187} alt="Image" />
523
-
524
- We see that dclm is a very strong baseline: apart from the diverse_qa_pairs prompt from the Nemotron-HQ-Synth dataset, no other open prior work outperforms dclm. Can we do better with different prompts?
525
-
526
- ### Which new prompts work well?
527
-
528
- We found four prompts that outperform both fw-edu-hq and the challenging dclm baseline: math, table, faq and tutorial.
529
-
530
- <Image src={newplot_2c31384e_bcac_800b_82e8_ff44228f7720} alt="Image" />
531
-
532
- For now we just used the Gemma-3-1b model, but can we do better by changing the rephrasing model?
533
-
534
- <details>
535
- <summary>
536
- <h2>Impact of the Rephrasing Model </h2>
537
- </summary>
538
-
539
- In general, we want to know whether using a stronger model leads to better synthetic data. We look at this dimension from three angles: model size, model family and model generation.
540
- #### Does the model size matter?
541
-
542
- We compare rephrasing with all Gemma-3 sizes (270m, 1b, 4b, 12b, 27b) using the tutorial prompt. We find that the 270m model underperforms but otherwise there is no significant difference.
543
-
544
- <Image src={newplot_2e11384e_bcac_800a_abc6_d0690da3f955} alt="Image" />
545
-
546
- Potentially, writing a tutorial is easy enough, and we only need larger models for harder prompts such as math. So we tested that too, but found similar results: the 270m underperforms, and there is no large difference between 1b, 4b, 12b and 27b.
547
-
548
- <Image src={newplot_2e21384e_bcac_80a2_9bac_c543304d926e} alt="Image" />
549
-
550
- TODO: also run this experiment for the REWIRE prompt since the original authors claim that larger models are necessary there
551
-
552
- **Do we need better models for rephrasing low-quality data?**
553
- The [REWIRE](https://arxiv.org/abs/2506.04689) paper claims that for upcycling low-quality data we need large models (Llama-3.3 70B in their case). Is this true?
554
- Continue prompt: For the 1b model the source data does not seem to matter, but the 12b model can make better use of the hq data.
555
-
556
- <Image src={newplot_2e11384e_bcac_80dd_972d_cf77d9c3b004} alt="Image" />
557
-
558
- Tutorial prompt: For the hq data the model size does not seem to matter whereas for the lq data the larger model is slightly better.
559
-
560
- <Image src={newplot_2e11384e_bcac_80a3_a6fa_e8634e0e2206} alt="Image" />
561
-
562
- FAQ prompt: Surprisingly, the 1b model is better for both lq and hq data.
563
-
564
- <Image src={newplot_2e41384e_bcac_80c0_aef5_e71fdbaccd8d} alt="Image" />
565
-
566
- In general, we cannot reproduce REWIRE's claim that large models are needed for lq data. Overall, we rarely see benefits from using models larger than 1b. As long as the model has some baseline level of capability (already reached at the 1b scale in our experiments), we see no evidence of a clear benefit from using larger models for rephrasing. For these reasons, we default to the 1b size for maximum throughput from here on. We hypothesize that most rephrasing tasks are simple enough for smaller models to handle sufficiently well.
567
- #### Does the model family matter?
568
-
569
- Some model families may be better suited for rephrasing than others based on their training data. This is why we test top families at the 1B scale on the four top-performing prompts: tutorial, faq, table and math. We find that for the tutorial prompt, Llama-3.2, Granite-3, Gemma-3, Qwen3 and Falcon3 perform roughly at the same level, while SmolLM2 clearly outperforms them.
570
-
571
- <Image src={newplot_2da1384e_bcac_80d6_a8b9_da80324f8fef} alt="Image" />
572
-
573
- In the faq prompt SmolLM2 again clearly outperforms the others. Here Qwen3 underperforms.
574
-
575
- <Image src={newplot_2e71384e_bcac_8027_ae32_c133627ede4a} alt="Image" />
576
-
577
- For the table prompt we again see SmolLM2 and to some degree Falcon3 outperform.
578
-
579
- <Image src={newplot_2f71384e_bcac_80c6_a99e_f52084fc497b} alt="Image" />
580
-
581
- Finally, math is again a clear win for SmolLM2 with Qwen3 underperforming.
582
-
583
- <Image src={newplot_2f71384e_bcac_80d8_9985_e195d39f1e70} alt="Image" />
584
-
585
- We hypothesize that the consistently strong performance of SmolLM2 originates from [rewrite tasks](https://huggingface.co/datasets/HuggingFaceTB/smoltalk/viewer/smol-rewrite?row=0&views%5B%5D=smol_rewrite_train) in the training data.
586
- So the model family clearly seems to matter. However, SmolLM2 is already a year old. Are newer models better than older ones?
587
- #### Does the model generation matter?
588
-
589
- We compare rephrasing with Qwen models from versions 1.5, 2, 2.5 and 3 using the tutorial prompt, one of the prompts that outperformed the DCLM baseline. While the differences are small we find a trend that newer versions lead to higher evaluation performance.
590
-
591
- <Image src={newplot_2d21384e_bcac_80ab_a6dd_e31a6c150e61} alt="Image" />
592
-
593
- So now we know that certain models are better than others, newer models tend to outperform older models and usually rephrasing models can be as small as 1B parameters. What difference do the dataset choices make?
594
-
595
- </details>
596
-
597
- <details>
598
- <summary>
599
- <h2>Impact of the Dataset Choices</h2>
600
- </summary>
601
-
602
- ### Does the mix-in dataset matter?
603
-
604
- To test the effect of the mix-in dataset we apply the tutorial prompt using Gemma-3-1b on fw_edu_hq and mix in dclm, cosmopedia, fw_edu_hq and fw_edu_lq. We find that the mix-in dataset makes a substantial difference, with cosmopedia and fw_edu_lq underperforming dclm and fw_edu_hq. fw_edu_hq and dclm achieve very similar accuracy even though dclm is much better by itself. We see that mixing in the synthetic data improves performance for all mix-in datasets. The effect is more pronounced for the worse datasets fw_edu_lq and cosmopedia.
605
-
606
- <Image src={newplot_2e11384e_bcac_80ea_88cc_c971b2816596} alt="Image" />
607
-
608
- Does this trend hold for other source datasets? We ran the experiment for fw_edu_lq as source and find similar results: fw_edu_hq and dclm outperform both cosmopedia and fw_edu_lq. For all mix-in datasets except dclm, adding synthetic data is beneficial.
609
-
610
- <Image src={newplot_2e11384e_bcac_8032_9835_e1407f4d780d} alt="Image" />
611
-
612
- So we know that the mix-in dataset plays a large role. What about the source dataset used for rephrasing?
613
- #### Does the source dataset matter?
614
-
615
- To investigate to what extent the source dataset for rephrasing matters we rephrased dclm, cosmopedia, fw_edu_hq and fw_edu_lq using the Gemma-3-1B model and the tutorial and faq prompts. When we mix in the source dataset with the rephrased data we find fw_edu_hq and dclm clearly outperforming fw_edu_lq and cosmopedia for both prompts.
616
-
617
- <Image src={newplot_2df1384e_bcac_80bc_b93c_ee8e9cfd5529} alt="Image" />
618
-
619
- <Image src={newplot_2df1384e_bcac_8018_b1f6_da1dcde1f90a} alt="Image" />
620
-
621
- When we fix the mix-in dataset to fw_edu_hq, the difference shrinks drastically for the tutorial prompt and even more for the faq prompt. This corroborates our finding that the mix-in dataset seems to matter much more than the source rephrasing dataset.
622
-
623
- <Image src={newplot_2e01384e_bcac_8017_9829_cd0c1db928c6} alt="Image" />
624
-
625
- <Image src={newplot_2e01384e_bcac_806f_8bf1_f7e5405a2ff9} alt="Image" />
626
-
627
- #### Is synthetic data enough?
628
-
629
- We were wondering whether training on synthetic data alone works. While it improves over fw-edu-hq, it does not match the performance of the original dataset (DCLM) and is also clearly below the performance of the original dataset mixed with the rephrased one, for both the tutorial and faq prompts. We get the same result when we rephrase fw_edu_hq instead of dclm.
630
-
631
- <Image src={newplot_2d61384e_bcac_8092_baca_c17346b95734} alt="Image" />
632
-
633
- <Image src={newplot_2e41384e_bcac_8065_b313_c38a6db4ac31} alt="Image" />
634
-
635
- #### Does increased diversity help?
636
-
637
- There are multiple ways of increasing diversity. We can think of mixing rephrasing approaches, mixing model families or both at the same time.
638
- **Mixing rephrasing approaches**
639
- We were wondering whether mixing the best-performing rephrasing approaches can improve over the individual approaches. We find no significant increase over the best-performing approach (mix-fw_edu_hq-math_1b_hq). It seems that when we mix together enough different prompts (mix-tutorial_1b_hq-faq_1b_hq-table_1b_hq-math_1b_hq), we don't necessarily need the source dataset (fw_edu_hq) for good performance. This could mean that when training on just one synthetic dataset we need the original dataset for diversity, but when we mix multiple ones it is not necessary. However, it does not hurt and is an easy way of increasing the dataset size while keeping performance high. To follow up, it would be interesting to study how little synthetic data we can get away with without performance drops.
640
-
641
- <Image src={newplot_2df1384e_bcac_8010_abe7_cf477262b8d6} alt="Image" />
642
-
643
- **Mixing model families**
644
- We rephrased using different model families and saw SmolLM2 and Falcon3 clearly outperform Llama3.2 and Granite3. Now we wonder whether mixing the rephrased outputs of multiple models improves performance through increased diversity.
645
-
646
- <Image src={newplot_2e11384e_bcac_80bc_810d_d13554c628dc} alt="Image" />
647
-
648
- It turns out that benchmark performance does not improve through increased rephrasing model diversity but is largely an average of the mixed datasets performance (smollm2 and falcon3 are similar to just smollm2, smollm2 and llama3.2 lie in between smollm2 and llama3.2, llama3.2 and granite3 are similar to just llama3.2).
649
- **Mixing both rephrasing approaches and model families**
650
- Maybe we need more diversity by mixing both rephrasing approaches and model families?
651
-
652
- <Image src={newplot_2f61384e_bcac_80d9_ab81_d57a228847cf} alt="Image" />
653
-
654
- No, we get the same results as for mixing just rephrasing approaches or just model families: the mix lands around the average of the individual datasets' performance instead of resulting in a gain.
655
-
656
- </details>
657
-
658
- ### Do typos in the prompt hurt?
659
-
660
- The original REWIRE prompt contains many typos and grammar errors. To what extent do typos in the prompt hurt performance?
661
-
662
- <Image src={newplot_2ee1384e_bcac_80da_82cd_df97247e2e72} alt="Image" />
663
-
664
- Surprisingly, typos don't have a negative effect on downstream model performance. For the 1B model, the opposite is even the case.
665
-
666
- ### Does edu-score or dclm-score predict model performance?
667
-
668
- Running these ablations is very expensive, so we were looking for informative proxies that can predict whether a certain dataset will result in better downstream benchmark performance. Since the fw-edu-score and dclm-score work well for human-written data, we surmised they could also work for synthetic data.
669
-
670
- TODO: Run this analysis and add a small report
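-
- As a starting point, one could score a sample of rephrased documents with the publicly released FineWeb-Edu classifier and compare the score distributions across rephrasing approaches. A minimal sketch (standard `transformers` usage; the sampling and aggregation choices are placeholders):
-
- ```python
- import torch
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
-
- # The FineWeb-Edu classifier regresses an educational-quality score (roughly 0 to 5).
- name = "HuggingFaceFW/fineweb-edu-classifier"
- tokenizer = AutoTokenizer.from_pretrained(name)
- model = AutoModelForSequenceClassification.from_pretrained(name)
-
- def edu_score(text: str) -> float:
-     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
-     with torch.no_grad():
-         logits = model(**inputs).logits
-     return logits.squeeze().item()
-
- # Hypothetical usage: mean score over a small sample of rephrased documents.
- sample = ["A step-by-step tutorial on computing compound interest ..."]
- print(sum(edu_score(t) for t in sample) / len(sample))
- ```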
671
-
672
- ### Math Rephrasing: When "Worse" Outputs Win
673
-
674
- We compared two ~1.7B parameter models for generating math word problems: SmolLM2 and Qwen3. At first glance, SmolLM2's outputs looked significantly worse—shorter, inconsistent, often missing solutions entirely. So when models trained on SmolLM2 data outperformed those trained on Qwen3 data, we investigated why.
675
-
676
- **Qwen3 produced beautiful, structured outputs:**
677
-
678
- - 100% had proper Problem/Solution sections
679
- - 99% had step-by-step formatting
680
- - 60% included LaTeX math notation
681
-
682
- Here's a typical Qwen3 output:
683
-
684
- ```
685
- **Problem:**
686
- A disc rotates at 120 rpm. How many revolutions in 5 minutes?
687
-
688
- **Solution:**
689
- 1. Revolutions per minute = 120
690
- 2. Number of minutes = 5
691
- 3. Total revolutions = 120 × 5
692
-
693
- $$120 \times 5 = 600$$
694
-
695
- The disc makes 600 revolutions in 5 minutes.
696
-
697
- ```
698
- **SmolLM2 was messier:**
699
-
700
- - Only 68% had complete solutions
701
- - Wide variance in output length (4 to 4,000 tokens)
702
- - Mix of formats: questions, partial answers, full solutions
703
-
704
- SmolLM2 outputs ranged from proper solutions to just questions like *"What is the difference between X and Y?"* or even 4-token fragments like *"Areas Where We Service"*.
705
-
706
- Yet models trained on SmolLM2's data **outperformed** those trained on Qwen3's data on downstream benchmarks.
707
-
708
- #### Our Hypothesis: Template Collapse
709
-
710
- We suspect Qwen3's outputs were *too* consistent. 115 out of 1,000 samples started with identical text, while SmolLM2's most common pattern appeared only 3 times.
711
-
712
- | Metric | SmolLM2 | Qwen3 |
713
- | --- | --- | --- |
714
- | Most common start | 3/1000 | 115/1000 |
715
- | Output length range (tokens) | 4-4,000 | 100-2,600 |
716
- | Unique patterns | High | Low |
717
-
718
- SmolLM2's quality distribution was actually reasonable:
719
-
720
- | Quality | Criteria | Share |
721
- | --- | --- | --- |
722
- | Excellent | Has "solution" + numbered steps + 80+ tokens | 45% |
723
- | Good | Has "solution" + 50+ tokens | 22% |
724
- | Partial | 30+ tokens but missing structure | 25% |
725
- | Poor | {'<'}30 tokens | 8% |
726
-
727
- This natural variance may have created more diverse training signal than Qwen3's uniformly formatted outputs.
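-
- The numbers above come from simple heuristics over the generated samples. A rough sketch of this kind of analysis, with thresholds matching the quality table (the exact scripts we used may differ, and whitespace tokens serve as a cheap proxy for length):
-
- ```python
- from collections import Counter
-
- def analyze(samples: list[str]) -> dict:
-     """Rough diversity and quality heuristics for rephrased math outputs."""
-     # Template collapse: how many outputs share the same opening characters?
-     starts = Counter(s[:50] for s in samples)
-     most_common_start = starts.most_common(1)[0][1]
-
-     lengths = [len(s.split()) for s in samples]
-
-     def quality(s: str) -> str:
-         n = len(s.split())
-         has_solution = "solution" in s.lower()
-         has_steps = any(line.strip()[:2] in {"1.", "2.", "3."} for line in s.splitlines())
-         if has_solution and has_steps and n >= 80:
-             return "excellent"
-         if has_solution and n >= 50:
-             return "good"
-         if n >= 30:
-             return "partial"
-         return "poor"
-
-     return {
-         "most_common_start": most_common_start,
-         "length_range": (min(lengths), max(lengths)),
-         "quality_share": Counter(quality(s) for s in samples),
-     }
- ```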
728
-
729
- **Takeaway:** For pretraining data, diversity might beat consistency. Models that don't follow instructions perfectly could produce better training data than those that do—though more experiments are needed to confirm this.
730
-
731
- ## Conclusions
732
-
733
- TODO: Table with answers to the questions (ablation sections)
734
-
735
- ## Next Steps
736
-
737
- The main bottleneck to scaling experimentation on synthetic data for pretraining is the compute needed to create the data (for reference, it takes roughly 3800 H100 GPU hours to generate the 10B tokens needed for one of these ablations with Gemma-3-1B-IT). Diffusion language models offer substantial promise for synthetic data generation through their parallel generation capabilities and reported 2-10x inference speedups compared to autoregressive approaches. However, their current experimental status and the absence of native support in production serving frameworks (e.g., vLLM, SGLang) present significant barriers to adoption that must be addressed before diffusion-based methods can be practically deployed at scale.
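-
- For a rough sense of scale, the generation throughput implied by these numbers works out to a few million tokens per GPU-hour:
-
- ```python
- # Back-of-the-envelope generation cost for one ablation (numbers from the text above).
- tokens_needed = 10e9  # ~10B synthetic tokens per ablation
- gpu_hours = 3800      # H100 GPU hours with Gemma-3-1B-IT
-
- tokens_per_gpu_hour = tokens_needed / gpu_hours  # ~2.6M tokens per H100-hour
- # Assuming the same 64-GPU setup used for training, this is roughly 2.5 days of wall-clock generation time.
- hours_on_64_gpus = gpu_hours / 64
- print(f"{tokens_per_gpu_hour:,.0f} tokens/GPU-hour, {hours_on_64_gpus:.0f} wall-clock hours on 64 H100s")
- ```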
738
-
739
- While we answered some questions in this work, many still remain such as:
740
-
741
- - Can you "repeat" data more without performance loss if the repetitions are rephrased?
742
- - We mixed unrephrased source data with synthetic data in our experiments to equal proportions. How little synthetic data can we get away with: 50%, 20%, 5%?
743
- What influence do generation parameters such as temperature or top_p have on rephrasing performance?
744
- Explore [https://z-lab.ai/projects/dflash/](https://z-lab.ai/projects/dflash/) as future work for speeding up inference further: it is currently still a bit cumbersome to use and has limited model support
745
- Experiment with chunked rollouts for context extension in mid-training
746
- - Experiment with multiple rollouts per example and filtering for the highest quality one
747
- - In REWIRE, they show larger gains for bigger models trained on their data. Can we reproduce this?
748
- - Does automatic prompt optimization with tools like dspy improve rephrasing performance?
749
-
750
- ## Appendix
751
-
752
- <details>
753
- <summary>
754
- <h2>Details on the experiments</h2>
755
- </summary>
756
-
757
- For our ablations we train a 1.2B parameter language model using a Qwen2-style architecture with 28 layers, a hidden dimension of 2048, 16 attention heads with 8 key-value heads (grouped-query attention), and an intermediate size of 6144. The model uses the Llama 3.2 tokenizer (`hynky/Llama-3.2-1B-no-bos`) with a vocabulary size of 128,256 tokens. Training is conducted on 64 NVIDIA H100 80GB GPUs across 8 nodes using pure data parallelism (DP=64) with a global batch size of 512 and a sequence length of 4,096 tokens, accumulating to approximately 21 billion tokens total over 10,000 steps. We employ the AdamW optimizer with a learning rate of 5×10⁻⁴, β₁=0.9, β₂=0.95, weight decay of 0.1, and gradient clipping at 1.0. All training uses bfloat16 precision with Flash Attention 2, fused operations (RMS normalization and rotary embeddings), and document masking to prevent cross-document attention. We aim to rephrase at least 10B tokens per experiment, but due to the wildly varying number of completion tokens per prompt, we sometimes get fewer than that. In these cases we train on some of the data twice.
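-
- In `transformers` terms, the ablation model corresponds roughly to the configuration below. This is a sketch for orientation rather than the exact training configuration, and the token-budget check simply multiplies the numbers quoted above.
-
- ```python
- from transformers import Qwen2Config
-
- # ~1.2B parameter ablation model described above (the actual training stack may differ).
- config = Qwen2Config(
-     vocab_size=128_256,        # Llama 3.2 tokenizer (hynky/Llama-3.2-1B-no-bos)
-     hidden_size=2048,
-     num_hidden_layers=28,
-     num_attention_heads=16,
-     num_key_value_heads=8,     # grouped-query attention
-     intermediate_size=6144,
-     max_position_embeddings=4096,
- )
-
- # Sanity check on the token budget: 512 sequences x 4,096 tokens x 10,000 steps ~= 21B tokens.
- tokens_total = 512 * 4096 * 10_000
- print(f"{tokens_total / 1e9:.1f}B tokens")
- ```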
758
-
759
- </details>
760
-
761
- <details>
762
- <summary>
763
- <h2>Prompts</h2>
764
- </summary>
765
-
766
- ### BeyondWeb
767
-
768
- #### continue
769
-
770
- Continue the following text in the same style as the original. Start with the continuation directly.
771
- Text:
772
- [TEXT]
773
- #### summarize
774
-
775
- Summarize the following text. Write a standalone summary without referencing the text. Directly start with the summary. Do not say anything else.
776
- Text:
777
- [TEXT]
778
- Summary:
779
- #### Format
780
-
781
- #### article
782
-
783
- Transform the document into a magazine-style feature article. Open with an engaging lead, then blend narrative storytelling with factual explanation. Maintain an accessible yet polished tone suitable for a general but informed readership. Output only the feature article, nothing else.
784
- Document:
785
- [TEXT]
786
- #### commentary
787
-
788
- Summarize the document in a concise paragraph that captures its central arguments or findings. Then, write an expert commentary that critically reflects on its implications, limitations, or broader context. Maintain an analytical and professional tone throughout. Output only the summary and the commentary, nothing else.
789
- Document:
790
- [TEXT]
791
- #### discussion
792
-
793
- Reformulate the document as a dialogue between a teacher and a student. The teacher should guide the student toward understanding the key points while clarifying complex concepts. Keep the exchange natural, informative, and faithful to the original content. Output only the dialogue, nothing else.
794
- Document:
795
- [TEXT]
796
- #### faq
797
-
798
- Rewrite the document as a comprehensive FAQ (Frequently Asked Questions). Extract or infer the key questions a reader would have about this topic, then provide clear, direct answers. Order questions logically—from foundational to advanced, or by topic area. Each answer should be self-contained and understandable without reference to other answers. Ensure the FAQ works as a standalone document. Output only the FAQ, nothing else.
799
- Document:
800
- [TEXT]
801
- #### math
802
-
803
- Rewrite the document to create a mathematical word problem based on the numerical data or relationships in the text. Provide a step-by-step solution that shows the calculation process clearly. Create a problem that requires multi-step reasoning and basic arithmetic operations. It should include the question followed by a detailed solution showing each calculation step. Output only the problem and solution, nothing else.
804
- Document:
805
- [TEXT]
806
- #### table
807
-
808
- Rewrite the document as a structured table that organizes the key information, then generate one question-answer pair based on the table. First extract the main data points and organize them into a clear table format with appropriate headers using markdown table syntax with proper alignment. After the table, generate one insightful question that can be answered using the table data. Provide a clear, concise answer to the question based on the information in the table. Output only the table followed by the question-answer pair, nothing else.
809
- Document:
810
- [TEXT]
811
- #### tutorial
812
-
813
- Rewrite the document as a clear, step-by-step tutorial or instructional guide. Use numbered steps or bullet points where appropriate to enhance clarity. Preserve all essential information while ensuring the style feels didactic and easy to follow. Output only the tutorial, nothing else.
814
- Document:
815
- [TEXT]
816
- ### Nemotron
817
-
818
- #### distill
819
-
820
- Your task is to read and paraphrase the provided text following these instructions:
821
- - Aim to create a condensed but accurate and informative version of the original text, not a simplistic summary.
822
- - Capture and preserve the crucial information, key concepts, important values, and factual details in the original text, while making it more readable and accessible.
823
- - Retain technical terms, specialized vocabulary, and complex concepts.
824
- - Retain examples, explanations of reasoning processes, and supporting evidence to maintain the text's depth and context.
825
- - Only include information that is present in the original text. Do not adding new or unsubstantiated claims.
826
- - Write in plain text.
827
-
828
- Here is the text:
829
- [TEXT]
830
- Task:
831
- After thoroughly reading the above text, paraphrase it in high-quality and clear English following the instructions.
832
- #### diverse_qa_pairs
833
-
834
- Task: Read the text, ask questions and answer them.
835
- Follow these instructions:
836
- 1. Ask diverse questions that require different cognitive skills or cover different aspects of the text.
837
- 1. Ask questions in various forms such as:
838
- - Yes/No questions that require determining whether a statement is true or false.
839
- - Open-ended questions that begin with words like what, how, when, where, why and who.
840
- - Multi-choice questions that offers two or more options to choose from. Include the options in the question.
841
- - Comparison questions that compare two quantities or objects and determine the relationship between them.
842
- - Reading comprehension questions that test the ability to understand and analyze the text.
843
- - Problem-solving questions that test the ability to solve mathematical, physical, or logical problems.
844
-
845
- 1. Focus on asking questions about factual information, important knowledge, or concrete details in the text.
846
- 1. Write questions and answers using clear and concise language.
847
- 1. Use plain text. Do not use Markdown.
848
- 1. Each question and answer pair should be on a separate line. Tag the question with "Question:" and the answer with "Answer:".
849
-
850
- Text:
851
- [TEXT]
852
- Task:
853
- After reading the above text, ask up to 8 questions and provide the correct answers following the instructions. Give your response in this format:
854
- Here are the questions and answers based on the provided text:
855
- - Question: [first question] Answer: [first answer]
856
- - Question: [second question] Answer: [second answer]
857
-
858
- ....
859
- #### extract_knowledge
860
-
861
- Your task is to rewrite knowledge from the provided text following these instructions:
862
- - Rewrite the text as a passage or passages using easy-to-understand and high-quality English like sentences in textbooks and Wikipedia.
863
- - Focus on content in disciplines such as humanities, social sciences, natural sciences, technology, engineering, math, law and legal, business, management, art, education, agricultural sciences, politics, and history.
864
- - Disregard content that does not contain useful facts or knowledge.
865
- - Retain examples, explanations of reasoning processes, and supporting evidence to maintain the text's depth and context.
866
- - Do not add or alter details. Only restate what is already in the text.
867
- - Write in plain text.
868
- - Do not add titles, subtitles, note, or comment.
869
-
870
- Text:
871
- [TEXT]
872
- Task:
873
- Rewrite facts and knowledge from the above text as a passage or passages following the instructions.
874
- #### knowledge_list
875
-
876
- Review the text and extract the key information. Follow these instructions:
877
- - Carefully read the above text and provide a concise and organized list of factual information, concrete details, key concepts, and important numbers and statistics extracted from the text.
878
- - Ensure each point is clear, specific, and supported by the original text.
879
- - Ensure the extract text is information-dense and easier to learn from.
880
- - Do not add titles or headings.
881
-
882
- Text:
883
- [TEXT]
884
- Task:
885
- Extract the factual information, concrete details, and key concepts from the above text following the instructions.
886
- #### wikipedia_style_rephrasing
887
-
888
- For the following paragraph give me a diverse paraphrase of the same in high quality English language as in sentences on Wikipedia. Begin your answer on a separate line with "Here is a paraphrased version:".
889
- Text:
890
- [TEXT]
891
- ### REWIRE
892
-
893
- #### guided_rewrite_improved
894
-
895
- Below is a draft from an AI Assistant when trying to accomplish a task or solve a problem. Analyze and understand the task and problem(s) to be solved. Then pretend to be the expert who is most skillful to accomplish this task, and use detailed thinking and internal reasoning to identify a strategy and develop a plan about how to solve this problem. Experts usually apply meta-reasoning and planning to reason about how to best accomplish the task before jumping to a solution.
896
- Deliberate meta-reasoning also involves reflection which can help identify issues and take a step back to explore other paths. Below are some generic examples of starting questions experts could ask themselves during the meta-reasoning process. The expert will come up with the most relevant questions that can help with their thinking process, which are also very specific to the task.
897
- Consider these questions during your internal reasoning process:
898
- - What is the core issue or problem that needs to be addressed? What are the key assumptions underlying this problem?
899
- - How can I break down this problem into smaller, more manageable parts? How can I simplify the problem so that it is easier to solve?
900
- - What kinds of solutions are typically produced for this kind of problem specification? Given the problem specification and the current best solution, what other possible solutions exist? If the current best solution is totally wrong, what other ways are there to think about the problem specifically?
901
- - What is the best way to modify this current best solution, given what you know about these kinds of problem specifications?
902
- - Am I on the right track? Check your progress so far.
903
- - Develop a step by step plan internally.
904
-
905
- Finally, rewrite the original content from the author's perspective, maintaining their voice and intent while making substantial improvements. Take information and details from the original draft whenever they are useful. The rewritten content should not be shorter than the original response. The improved version should have significantly better formatting and readability, with more coherent and in-depth reasoning, enhanced clarity, stronger structure, and removal of any noise or digression. Write as if you are the original author meaningfully improving their own work - not just making minor edits.
906
- IMPORTANT: Your output must be ONLY the actual rewritten content itself - nothing else. Do NOT include any analysis, commentary, description, summary, or explanation about the improvements made. Do NOT add any meta-commentary like "This version improves..." or similar statements. Do NOT reference "the original draft" or "the draft" in your output. Output ONLY the content as if it were the final published piece that readers would see, with absolutely no additional text before or after it.
907
- Original Draft:
908
- [TEXT]
909
- #### guided_rewrite_original
910
-
911
- Below is a draft from an AI Assistant when trying to accomplish task or solving a problem. Analyze and understand the task and problem(s) to be solved. Then pretend to be the expert who is most skillful to acomplish this task, write down the detailed thinking process and internal monologue that went into identifying a strategy and lay out a plan about how to solve this problem. Experts usually apply meta-reasoning and planning to reason about how to best accomplish the task before jumping to solution.
912
- Deliberate meta-reasoning also involves reflection which can help identify issues and take a step back to explore other paths. Below are some generic examples of starting questions experts could ask themselves during meta-reasoning process. The expert will come up with the most relevant questions that can help with their thinking process, which are also very specific to the task.
913
- Let's first try to understand the task and exactly what problem(s) to be solved. What is the core issue or problem that needs to be addressed? What are the key assumptions underlying this problem?
914
- How can I break down this problem into smaller, more manageable parts? How can I simplify the problem so that it is easier to solve?
915
- What kinds of solution typically are produced for this kind of problem specification? Given the problem specification and the current best solution, have a guess about other possible solutions. Let's imagine the current best solution is totally wrong, what other ways are there to think about the problem specific
916
- What is the best way to modify this current best solution, given what you know about these kinds of problem specification?
917
- Am I on the right track? Let's check our progress so far.
918
- Let's make a step by step plan and implement it with good notion and explanation.
919
- Finally, write an improved response after thinking about how to accomplish the task. Take information and details from the original draft whenever they are useful. Therefore, the improved response should not be shorter than the original response. The improved response should have better formatting and readability, with more coherent and in-depth reasoning, while removing any noise or digression. Note that the best experts chosen to answer each prompt may be different, so please make sure the you do not sound like the same expert for all tasks.
920
- IMPORTANT: Start your analysis and thinking right away. DO NOT add any filler text, explanations or notes about your response. Put the thinking and planning between {'<'}thinking starts{'>'} and {'<'}thinking ends{'>'}, and the improved response between {'<'}improved response starts{'>'} and {'<'}improved response ends{'>'}.
921
- Original Draft: [TEXT]
922
-
923
- </details>
924
-
925
- <details>
926
- <summary>
927
- <h2>Decay vs Scratch</h2>
928
- </summary>
929
-
930
- We explored two distinct training paradigms. In the **from-scratch** setup (`decay_exp=false`), models were trained for the full 10,000 steps (~21B tokens) on a single dataset or mixture of datasets. In contrast, the **decay** experiments (`decay_exp=true`) aimed to obtain quicker signal with fewer rephrased tokens by leveraging a two-stage training approach. These decay experiments resumed training from a checkpoint at step 9,000 of a model previously trained on lower-quality data (`fw_edu_lq`), then continued training with a new dataset (or mixture) for the final 1,000 steps (~2B tokens) during the learning rate decay phase. We selected the low-quality fineweb dataset for the first training phase so that we can see the effects of the ablated data mixtures more clearly. This design allowed us to evaluate the impact of high-quality rephrased or synthetic data more efficiently, requiring around 2B rephrased tokens rather than the full 21B needed for from-scratch training, thus reducing computational costs by 90% per experimental condition while still providing meaningful signal about data quality effects. To enable the decay experiments, we used a warmup-stable-decay (WSD) learning rate schedule with 1% warmup (100 steps), 89% stable training, and 10% linear decay (1,000 steps) to a minimum of 5×10⁻⁵.
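-
- The WSD schedule used for these experiments can be written down directly. Below is a minimal sketch with the hyperparameters quoted above (peak learning rate 5×10⁻⁴, minimum 5×10⁻⁵, 10,000 total steps); the actual trainer implementation may differ in details.
-
- ```python
- def wsd_lr(step: int, total_steps: int = 10_000, peak: float = 5e-4, minimum: float = 5e-5) -> float:
-     """Warmup-stable-decay schedule: 1% linear warmup, 89% constant, 10% linear decay."""
-     warmup_steps = int(0.01 * total_steps)    # 100 steps
-     decay_steps = int(0.10 * total_steps)     # 1,000 steps
-     decay_start = total_steps - decay_steps   # step 9,000
-
-     if step < warmup_steps:
-         return peak * step / warmup_steps
-     if step < decay_start:
-         return peak
-     # Linear decay from the peak to the minimum over the last 1,000 steps.
-     progress = (step - decay_start) / decay_steps
-     return peak - (peak - minimum) * progress
-
- # The decay experiments resume from the step-9,000 checkpoint and only rerun this final phase.
- print(wsd_lr(9_000), wsd_lr(9_500), wsd_lr(10_000))  # 5e-4, 2.75e-4, 5e-5
- ```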
931
- #### Variance across seeds and data seeds
932
-
933
- The seed parameter sets the global random seed for the training experiment, ensuring reproducibility for model weight initialization and other global operations across different runs. The data-seed parameter specifically controls the randomness of the data pipeline, such as dataset shuffling and sampling, ensuring reproducible data ordering across different training runs.
934
- As a first validation of the decay experiment, we were interested in the variance across runs. So we ran a grid of 3x3 seeds (1, 2, 3) and data seeds (1, 2, 3) for 3 datasets: vanilla fw_edu_hq, mix-fw_edu_hq-continue_1b_hq and mix-fw_edu_hq-tutorial_12b_hq. Overall we found the variance to be fairly small, giving us early confidence in the setup.
935
- Decaying with fw_edu_hq, the macro-averaged score ranges from a minimum of 10.73 to a maximum of 11.05 across the 3x3 grid of seeds and data seeds. Decaying with mix-fw_edu_hq-continue_1b_hq, it ranges from 12.90 to 13.21, and decaying with mix-fw_edu_hq-tutorial_12b_hq from 13.25 to 13.43.
936
- #### Correlation to runs from scratch
937
-
938
- From scratch the ranking is dclm (13.77) > nemotron_hq_synth (13.54) > fw_edu_hq (11.82) > cosmopedia (10.33) > synth (10.03). For decay the ranking is nemotron_hq_synth (12.35) > dclm (11.80) > fw_edu_hq (10.66) > cosmopedia (10.57) > synth (10.50). So while we see a meaningful difference between fw_edu_hq and cosmopedia/synth from scratch, they are very close in the decay setting. Additionally, dclm and nemotron_hq_synth are flipped.
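-
- One way to quantify how well the decay setup preserves the from-scratch ranking is a rank correlation over these scores. Since only the top two entries swap, Spearman's rho comes out at 0.9; a small sketch using scipy:
-
- ```python
- from scipy.stats import spearmanr
-
- # Macro-averaged scores per dataset, taken from the text above.
- scratch = {"dclm": 13.77, "nemotron_hq_synth": 13.54, "fw_edu_hq": 11.82, "cosmopedia": 10.33, "synth": 10.03}
- decay = {"dclm": 11.80, "nemotron_hq_synth": 12.35, "fw_edu_hq": 10.66, "cosmopedia": 10.57, "synth": 10.50}
-
- names = list(scratch)
- rho, p = spearmanr([scratch[d] for d in names], [decay[d] for d in names])
- print(f"Spearman rank correlation: {rho:.2f} (p={p:.2f})")  # rho = 0.90
- ```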
939
- ⇒ The decay setup thus serves as a fast vibe-check of whether a dataset is useful or not (TODO: add rephrasing comparison rankings)
940
-
941
- </details>