joelniklaus HF Staff committed on
Commit
fdfc515
·
1 Parent(s): 621688d

prettified experiment plots

Browse files
app/src/content/chapters/experiments.mdx CHANGED
@@ -8,11 +8,10 @@ import FigRef from "../../components/FigRef.astro";
8
  {/* TODO: read through entire blog post and make improvements */}
9
  {/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
10
  {/* TODO: Integrate decay experiment as another analysis for proxy */}
11
- {/* TODO: ask elie about reddit post */}
12
- {/* TODO: draft tweet */}
13
- {/* TODO: ask kashif about hackernews posting */}
14
- {/* TODO: ask elie and merve to share it on discord channels and ask about comms in general */}
15
- {/* TODO: share on a bunch of discords/slacks */}
16
 
17
  {/*
18
  Notes:
@@ -40,8 +39,7 @@ We train on eight datasets under identical conditions and compare their final ev
40
  src="d3-benchmark-comparison.html"
41
  desc="Comparison of baseline datasets across different evaluation metrics. Use the dropdown to switch metrics."
42
  config={{
43
- baselines: [],
44
- datasetNames: {
45
  cosmopedia: "Cosmopedia",
46
  dclm: "DCLM",
47
  fw_edu_hq: "FineWeb-Edu-HQ",
@@ -67,33 +65,20 @@ The BeyondWeb dataset was never released and the paper omits key details, yet cl
67
  <HtmlEmbed
68
  id="dissecting-baselines"
69
  src="d3-benchmark-comparison.html"
70
- desc="Individual prompt performance from existing synthetic datasets compared to DCLM and FineWeb-Edu-HQ."
71
  config={{
72
- baselines: ["dclm", "nemotron_hq_synth", "rewire"],
73
- datasetNames: {
74
- "mix-fw_edu_hq-diverse_qa_pairs_1b_hq": "Diverse QA Pairs",
75
- dclm: "DCLM",
76
- "mix-fw_edu_hq-extract_knowledge_1b_hq": "Extract Knowledge",
77
- "mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Guided Rewrite",
78
- nemotron_hq_synth: "Nemotron-HQ-Synth",
79
- rewire: "REWIRE",
80
- "mix-fw_edu_hq-distill_1b_hq": "Distill",
81
- "mix-fw_edu_hq-wikipedia_style_rephrasing_1b_hq": "Wikipedia Rephrasing",
82
- "mix-fw_edu_hq-knowledge_list_1b_hq": "Knowledge List",
83
- "mix-fw_edu_hq-continue_1b_hq": "Continue",
84
- "mix-fw_edu_hq-summarize_1b_hq": "Summarize"
85
- },
86
- pinnedColors: {
87
- "Nemotron-HQ-Synth": "#76b900",
88
- "Diverse QA Pairs": "#c5e384",
89
- "Distill": "#a0c95c",
90
- "Wikipedia Rephrasing": "#7fb034",
91
- "Knowledge List": "#5e960e",
92
- "Extract Knowledge": "#3d6b00",
93
- "REWIRE": "#1877F2",
94
- "Guided Rewrite": "#6aabff",
95
- "Continue (BeyondWeb)": "#e8713a",
96
- "Summarize (BeyondWeb)": "#c4451c"
97
  }
98
  }}
99
  />
@@ -102,14 +87,14 @@ Can we design prompts that consistently beat DCLM?
102
 
103
  ### Can New Prompts Beat DCLM?
104
 
105
- Since most existing prompts fail to beat DCLM, we designed seven novel prompt formats targeting different skills ([math](#math), [table](#table), [faq](#faq), [tutorial](#tutorial), [article](#article), [commentary](#commentary), [discussion](#discussion)), all using Gemma-3-1B on FineWeb-Edu-HQ. Four prompts ([math](#math), [table](#table), [faq](#faq), [tutorial](#tutorial)) outperform both FineWeb-Edu-HQ and DCLM, while [article](#article), [commentary](#commentary), and [discussion](#discussion) are at or below DCLM level (see <FigRef target="new-prompts" />). The best-performing prompts all restructure the source content into pedagogically rich formats.
106
 
107
  <HtmlEmbed
108
  id="new-prompts"
109
  src="d3-benchmark-comparison.html"
110
- desc="Seven new prompts compared against DCLM and FineWeb-Edu-HQ."
111
  config={{
112
- datasetNames: {
113
  "mix-fw_edu_hq-math_1b_hq": "Math",
114
  "mix-fw_edu_hq-table_1b_hq": "Table",
115
  "mix-fw_edu_hq-faq_1b_hq": "FAQ",
@@ -117,8 +102,7 @@ Since most existing prompts fail to beat DCLM, we designed seven novel prompt fo
117
  "mix-fw_edu_hq-article_1b_hq": "Article",
118
  "mix-fw_edu_hq-commentary_1b_hq": "Commentary",
119
  "mix-fw_edu_hq-discussion_1b_hq": "Discussion",
120
- dclm: "DCLM",
121
- fw_edu_hq: "FineWeb-Edu-HQ"
122
  }
123
  }}
124
  />
@@ -149,45 +133,41 @@ It is possible that larger models produce richer or more nuanced rephrasings tha
149
  config={{
150
  setups: {
151
  "Gemma-3: Tutorial": {
152
- datasetNames: {
153
  "mix-fw_edu_hq-tutorial_27b_hq": "Gemma-3 27B",
154
  "mix-fw_edu_hq-tutorial_12b_hq": "Gemma-3 12B",
155
  "mix-fw_edu_hq-tutorial_4b_hq": "Gemma-3 4B",
156
  "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3 1B",
157
  "mix-fw_edu_hq-tutorial_270m_hq": "Gemma-3 270M",
158
- dclm: "DCLM",
159
- fw_edu_hq: "FineWeb-Edu-HQ"
160
  }
161
  },
162
  "Gemma-3: Math": {
163
- datasetNames: {
164
  "mix-fw_edu_hq-math_27b_hq": "Gemma-3 27B",
165
  "mix-fw_edu_hq-math_12b_hq": "Gemma-3 12B",
166
  "mix-fw_edu_hq-math_4b_hq": "Gemma-3 4B",
167
  "mix-fw_edu_hq-math_1b_hq": "Gemma-3 1B",
168
  "mix-fw_edu_hq-math_270m_hq": "Gemma-3 270M",
169
- dclm: "DCLM",
170
- fw_edu_hq: "FineWeb-Edu-HQ"
171
  }
172
  },
173
  "Gemma-3: REWIRE": {
174
- datasetNames: {
175
  "mix-fw_edu_hq-guided_rewrite_original_27b_hq": "Gemma-3 27B",
176
  "mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Gemma-3 12B",
177
  "mix-fw_edu_hq-guided_rewrite_original_4b_hq": "Gemma-3 4B",
178
  "mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Gemma-3 1B",
179
  "mix-fw_edu_hq-guided_rewrite_original_270m_hq": "Gemma-3 270M",
180
- dclm: "DCLM",
181
- fw_edu_hq: "FineWeb-Edu-HQ"
182
  }
183
  },
184
  "SmolLM2: Tutorial": {
185
- datasetNames: {
186
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2 1.7B",
187
  "mix-fw_edu_hq-tutorial_smollm2_360m_hq": "SmolLM2 360M",
188
  "mix-fw_edu_hq-tutorial_smollm2_135m_hq": "SmolLM2 135M",
189
- dclm: "DCLM",
190
- fw_edu_hq: "FineWeb-Edu-HQ"
191
  }
192
  }
193
  }
@@ -207,43 +187,39 @@ The REWIRE [@rewire] paper claims that upcycling low-quality data requires large
207
  config={{
208
  setups: {
209
  "Continue Prompt": {
210
- datasetNames: {
211
  "mix-fw_edu_hq-continue_12b_hq": "12B, HQ Source",
212
  "mix-fw_edu_hq-continue_1b_hq": "1B, HQ Source",
213
  "mix-fw_edu_hq-continue_1b_lq": "1B, LQ Source",
214
  "mix-fw_edu_hq-continue_12b_lq": "12B, LQ Source",
215
- dclm: "DCLM",
216
- fw_edu_hq: "FineWeb-Edu-HQ"
217
  }
218
  },
219
  "Summarize Prompt": {
220
- datasetNames: {
221
  "mix-fw_edu_hq-summarize_1b_hq": "1B, HQ Source",
222
  "mix-fw_edu_hq-summarize_12b_hq": "12B, HQ Source",
223
  "mix-fw_edu_hq-summarize_1b_lq": "1B, LQ Source",
224
  "mix-fw_edu_hq-summarize_12b_lq": "12B, LQ Source",
225
- dclm: "DCLM",
226
- fw_edu_hq: "FineWeb-Edu-HQ"
227
  }
228
  },
229
  "Tutorial Prompt": {
230
- datasetNames: {
231
  "mix-fw_edu_hq-tutorial_1b_hq": "1B, HQ Source",
232
  "mix-fw_edu_hq-tutorial_12b_hq": "12B, HQ Source",
233
  "mix-fw_edu_hq-tutorial_12b_lq": "12B, LQ Source",
234
  "mix-fw_edu_hq-tutorial_1b_lq": "1B, LQ Source",
235
- dclm: "DCLM",
236
- fw_edu_hq: "FineWeb-Edu-HQ"
237
  }
238
  },
239
  "FAQ Prompt": {
240
- datasetNames: {
241
  "mix-fw_edu_hq-faq_1b_hq": "1B, HQ Source",
242
  "mix-fw_edu_hq-faq_1b_lq": "1B, LQ Source",
243
  "mix-fw_edu_hq-faq_12b_hq": "12B, HQ Source",
244
  "mix-fw_edu_hq-faq_12b_lq": "12B, LQ Source",
245
- dclm: "DCLM",
246
- fw_edu_hq: "FineWeb-Edu-HQ"
247
  }
248
  }
249
  }
@@ -267,51 +243,47 @@ We hypothesize that SmolLM2's consistently strong rephrasing performance origina
267
  config={{
268
  setups: {
269
  "Tutorial Prompt": {
270
- datasetNames: {
271
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
272
  "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Falcon3",
273
  "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3",
274
  "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3",
275
  "mix-fw_edu_hq-tutorial_granite3_1b_hq": "Granite3",
276
  "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
277
- dclm: "DCLM",
278
- fw_edu_hq: "FineWeb-Edu-HQ"
279
  }
280
  },
281
  "FAQ Prompt": {
282
- datasetNames: {
283
  "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "SmolLM2",
284
  "mix-fw_edu_hq-faq_llama3.2_1b_hq": "Llama-3.2",
285
  "mix-fw_edu_hq-faq_falcon3_1b_hq": "Falcon3",
286
  "mix-fw_edu_hq-faq_1b_hq": "Gemma-3",
287
  "mix-fw_edu_hq-faq_granite3_1b_hq": "Granite3",
288
  "mix-fw_edu_hq-faq_qwen3_1.7b_hq": "Qwen3",
289
- dclm: "DCLM",
290
- fw_edu_hq: "FineWeb-Edu-HQ"
291
  }
292
  },
293
  "Table Prompt": {
294
- datasetNames: {
295
  "mix-fw_edu_hq-table_smollm2_1.7b_hq": "SmolLM2",
296
  "mix-fw_edu_hq-table_falcon3_1b_hq": "Falcon3",
297
  "mix-fw_edu_hq-table_granite3_1b_hq": "Granite3",
298
  "mix-fw_edu_hq-table_qwen3_1.7b_hq": "Qwen3",
299
  "mix-fw_edu_hq-table_llama3.2_1b_hq": "Llama-3.2",
300
  "mix-fw_edu_hq-table_1b_hq": "Gemma-3",
301
- dclm: "DCLM",
302
- fw_edu_hq: "FineWeb-Edu-HQ"
303
  }
304
  },
305
  "Math Prompt": {
306
- datasetNames: {
307
  "mix-fw_edu_hq-math_smollm2_1.7b_hq": "SmolLM2",
308
  "mix-fw_edu_hq-math_falcon3_1b_hq": "Falcon3",
309
  "mix-fw_edu_hq-math_granite3_1b_hq": "Granite3",
310
  "mix-fw_edu_hq-math_1b_hq": "Gemma-3",
311
  "mix-fw_edu_hq-math_llama3.2_1b_hq": "Llama-3.2",
312
  "mix-fw_edu_hq-math_qwen3_1.7b_hq": "Qwen3",
313
- dclm: "DCLM",
314
- fw_edu_hq: "FineWeb-Edu-HQ"
315
  }
316
  }
317
  }
@@ -329,13 +301,12 @@ We compare Qwen models from versions 1.5 [@qwen], 2 [@qwen2], 2.5 [@qwen25], and
329
  src="d3-benchmark-comparison.html"
330
  desc="Qwen model generations (1.5 to 3) on the tutorial prompt."
331
  config={{
332
- datasetNames: {
333
  "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3 (1.7B)",
334
  "mix-fw_edu_hq-tutorial_qwen2.5_1.5b_hq": "Qwen2.5 (1.5B)",
335
  "mix-fw_edu_hq-tutorial_qwen2_1.5b_hq": "Qwen2 (1.5B)",
336
- dclm: "DCLM",
337
- "mix-fw_edu_hq-tutorial_qwen1.5_1.8b_hq": "Qwen1.5 (1.8B)",
338
- fw_edu_hq: "FineWeb-Edu-HQ"
339
  }
340
  }}
341
  />
@@ -355,7 +326,7 @@ So far we've always mixed synthetic data with a <Glossary term="source dataset"
355
 
356
  #### Is synthetic data enough?
357
 
358
- We compare synthetic-only training vs mixed training (synthetic + source) for [tutorial](#tutorial) and [faq](#faq) prompts on DCLM and FineWeb-Edu-HQ sources. Synthetic-only training beats FineWeb-Edu-HQ but falls short of both DCLM and mixed training (see <FigRef target="synthetic-only" />). Mixed training consistently improves over both the synthetic-only and original-data-only baselines.
359
 
360
  <HtmlEmbed
361
  id="synthetic-only"
@@ -364,23 +335,21 @@ We compare synthetic-only training vs mixed training (synthetic + source) for [t
364
  config={{
365
  setups: {
366
  "DCLM Source": {
367
- datasetNames: {
368
  "mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
369
- dclm: "DCLM",
370
  "mix-dclm-tutorial_1b_dclm": "Mix: Tutorial + DCLM",
371
  faq_1b_dclm: "FAQ Only",
372
- tutorial_1b_dclm: "Tutorial Only",
373
- fw_edu_hq: "FineWeb-Edu-HQ"
374
  }
375
  },
376
  "FineWeb-Edu-HQ Source": {
377
- datasetNames: {
378
  "mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FineWeb-Edu-HQ",
379
  "mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FineWeb-Edu-HQ",
380
- dclm: "DCLM",
381
  faq_1b_hq: "FAQ Only",
382
- tutorial_1b_hq: "Tutorial Only",
383
- fw_edu_hq: "FineWeb-Edu-HQ"
384
  }
385
  }
386
  }
@@ -391,7 +360,7 @@ So synthetic data alone does not seem to be enough. But how much does the specif
391
 
392
  #### Does the mix-in dataset matter?
393
 
394
- We apply the [tutorial](#tutorial) prompt using Gemma-3-1B on FineWeb-Edu-HQ, then mix in one of four datasets: DCLM, Cosmopedia, FineWeb-Edu-HQ, or FineWeb-Edu-LQ. Use the Setup dropdown to also see results with LQ source data. DCLM and FineWeb-Edu-HQ outperform Cosmopedia and FineWeb-Edu-LQ as mix-in datasets. Adding synthetic data improves performance for all mix-in datasets, with the effect especially pronounced for the weaker ones (see <FigRef target="mixin-dataset" />). The mix-in dataset is a major performance driver, sometimes more important than the synthetic data itself.
395
 
396
  <HtmlEmbed
397
  id="mixin-dataset"
@@ -400,31 +369,30 @@ We apply the [tutorial](#tutorial) prompt using Gemma-3-1B on FineWeb-Edu-HQ, th
400
  config={{
401
  setups: {
402
  "HQ Source": {
403
- datasetNames: {
404
- "mix-dclm-tutorial_1b_hq": "Mix-in: DCLM",
405
- "mix-fw_edu_hq-tutorial_1b_hq": "Mix-in: FineWeb-Edu-HQ",
406
- dclm: "DCLM",
407
- "mix-fw_edu_lq-tutorial_1b_hq": "Mix-in: FineWeb-Edu-LQ",
408
- "mix-cosmopedia-tutorial_1b_hq": "Mix-in: Cosmopedia",
409
- fw_edu_hq: "FineWeb-Edu-HQ",
410
- cosmopedia: "Cosmopedia",
411
- fw_edu_lq: "FineWeb-Edu-LQ"
412
  }
413
  },
414
  "LQ Source": {
415
- datasetNames: {
416
- dclm: "DCLM",
417
- "mix-fw_edu_hq-tutorial_1b_lq": "Mix-in: FineWeb-Edu-HQ",
418
- "mix-dclm-tutorial_1b_lq": "Mix-in: DCLM",
419
- fw_edu_hq: "FineWeb-Edu-HQ",
420
- "mix-cosmopedia-tutorial_1b_lq": "Mix-in: Cosmopedia",
421
- cosmopedia: "Cosmopedia",
422
- "mix-fw_edu_lq-tutorial_1b_lq": "Mix-in: FineWeb-Edu-LQ",
423
- fw_edu_lq: "FineWeb-Edu-LQ"
424
  }
425
  }
426
- },
427
- baselines: ["dclm", "fw_edu_hq", "cosmopedia", "fw_edu_lq"]
428
  }}
429
  />
430
 
@@ -441,23 +409,21 @@ We rephrase four datasets (DCLM, Cosmopedia, FineWeb-Edu-HQ, FineWeb-Edu-LQ) wit
441
  config={{
442
  setups: {
443
  "Tutorial Prompt": {
444
- datasetNames: {
445
  "mix-fw_edu_hq-tutorial_1b_hq": "Source: FineWeb-Edu-HQ",
446
  "mix-dclm-tutorial_1b_dclm": "Source: DCLM",
447
  "mix-cosmopedia-tutorial_1b_cosmopedia": "Source: Cosmopedia",
448
  "mix-fw_edu_lq-tutorial_1b_lq": "Source: FineWeb-Edu-LQ",
449
- dclm: "DCLM",
450
- fw_edu_hq: "FineWeb-Edu-HQ"
451
  }
452
  },
453
  "FAQ Prompt": {
454
- datasetNames: {
455
  "mix-dclm-faq_1b_dclm": "Source: DCLM",
456
  "mix-fw_edu_hq-faq_1b_hq": "Source: FineWeb-Edu-HQ",
457
  "mix-fw_edu_lq-faq_1b_lq": "Source: FineWeb-Edu-LQ",
458
  "mix-cosmopedia-faq_1b_cosmopedia": "Source: Cosmopedia",
459
- dclm: "DCLM",
460
- fw_edu_hq: "FineWeb-Edu-HQ"
461
  }
462
  }
463
  }
@@ -471,23 +437,21 @@ We rephrase four datasets (DCLM, Cosmopedia, FineWeb-Edu-HQ, FineWeb-Edu-LQ) wit
471
  config={{
472
  setups: {
473
  "Tutorial Prompt": {
474
- datasetNames: {
475
  "mix-fw_edu_hq-tutorial_1b_dclm": "Source: DCLM",
476
  "mix-fw_edu_hq-tutorial_1b_hq": "Source: FineWeb-Edu-HQ",
477
  "mix-fw_edu_hq-tutorial_1b_cosmopedia": "Source: Cosmopedia",
478
  "mix-fw_edu_hq-tutorial_1b_lq": "Source: FineWeb-Edu-LQ",
479
- dclm: "DCLM",
480
- fw_edu_hq: "FineWeb-Edu-HQ"
481
  }
482
  },
483
  "FAQ Prompt": {
484
- datasetNames: {
485
  "mix-fw_edu_hq-faq_1b_dclm": "Source: DCLM",
486
  "mix-fw_edu_hq-faq_1b_hq": "Source: FineWeb-Edu-HQ",
487
  "mix-fw_edu_hq-faq_1b_lq": "Source: FineWeb-Edu-LQ",
488
  "mix-fw_edu_hq-faq_1b_cosmopedia": "Source: Cosmopedia",
489
- dclm: "DCLM",
490
- fw_edu_hq: "FineWeb-Edu-HQ"
491
  }
492
  }
493
  }
@@ -511,38 +475,35 @@ Interestingly, when mixing enough different prompts together, we don't seem to n
511
  config={{
512
  setups: {
513
  "Mixing Prompts": {
514
- datasetNames: {
515
  "mix-fw_edu_hq-tutorial_1b_hq-fw_edu_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts + FineWeb-Edu-HQ",
516
  "mix-fw_edu_hq-math_1b_hq": "Math + FineWeb-Edu-HQ",
517
  "mix-tutorial_1b_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts (No Source)",
518
  "mix-fw_edu_hq-table_1b_hq": "Table + FineWeb-Edu-HQ",
519
  "mix-fw_edu_hq-faq_1b_hq": "FAQ + FineWeb-Edu-HQ",
520
  "mix-fw_edu_hq-tutorial_1b_hq": "Tutorial + FineWeb-Edu-HQ",
521
- dclm: "DCLM",
522
- fw_edu_hq: "FineWeb-Edu-HQ"
523
  }
524
  },
525
  "Mixing Models": {
526
- datasetNames: {
527
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
528
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "SmolLM2 + Falcon3",
529
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_llama3.2_1b_hq": "SmolLM2 + Llama-3.2",
530
  "mix-fw_edu_hq-tutorial_llama3.2_1b_hq-tutorial_granite3_1b_hq": "Llama-3.2 + Granite3",
531
  "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
532
- dclm: "DCLM",
533
- fw_edu_hq: "FineWeb-Edu-HQ"
534
  }
535
  },
536
  "Mixing Both": {
537
- datasetNames: {
538
  "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "FAQ (SmolLM2)",
539
  "mix-fw_edu_hq-faq_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "FAQ (SmolLM2) + Tutorial (Falcon3)",
540
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "Tutorial (SmolLM2)",
541
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "Tutorial (SmolLM2) + Tutorial (Falcon3)",
542
  "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Tutorial (Falcon3)",
543
  "mix-fw_edu_hq-faq_falcon3_1b_hq": "FAQ (Falcon3)",
544
- dclm: "DCLM",
545
- fw_edu_hq: "FineWeb-Edu-HQ"
546
  }
547
  }
548
  }
@@ -568,13 +529,12 @@ We compare REWIRE's [original prompt](#guided_rewrite_original) (with typos) aga
568
  src="d3-benchmark-comparison.html"
569
  desc="REWIRE prompt with original typos vs improved version at 1B and 12B scale."
570
  config={{
571
- datasetNames: {
572
  "mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Original (12B)",
573
  "mix-fw_edu_hq-guided_rewrite_improved_12b_hq": "Improved (12B)",
574
- dclm: "DCLM",
575
  "mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Original (1B)",
576
- "mix-fw_edu_hq-guided_rewrite_improved_1b_hq": "Improved (1B)",
577
- fw_edu_hq: "FineWeb-Edu-HQ"
578
  }
579
  }}
580
  />
@@ -609,4 +569,3 @@ Here are the key takeaways from our experiments:
609
  A: No. Typos have no negative effect on downstream performance.
610
 
611
  The bottom line: the details of synthetic rephrasing matter a lot, and knowing which ones matter is the key to scaling it up. Prompt design is the single biggest lever, with structured formats like Math, Table, FAQ, and Tutorial consistently beating curated baselines. But equally important is knowing where you can cut corners without losing quality. You don't need a large rephrasing model (1B is enough for simple prompts, 4B for complex ones). You don't need pristine source data (even low-quality sources work with a strong mix-in). Smaller models generate faster, directly translating into higher throughput. And tolerating lower-quality sources opens up a much bigger and more diverse data pool to draw from. The practical recipe is straightforward: pick a strong structured prompt, use the smallest model that handles it, blend with high-quality original data, and spend your remaining compute on volume.
612
-
 
8
  {/* TODO: read through entire blog post and make improvements */}
9
  {/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
10
  {/* TODO: Integrate decay experiment as another analysis for proxy */}
11
+ {/* TODO: share on a bunch of discords/slacks/hackernews/locallama */}
12
+ {/* TODO: brainstorm better banner, be artsy */}
13
+ {/* TODO: only explain datatrove additions when we need them (for generating the final finephrase) */}
14
+ {/* TODO: move infrastructure section after analyses as precursor and explanation for finephrase */}
 
15
 
16
  {/*
17
  Notes:
 
39
  src="d3-benchmark-comparison.html"
40
  desc="Comparison of baseline datasets across different evaluation metrics. Use the dropdown to switch metrics."
41
  config={{
42
+ datasets: {
 
43
  cosmopedia: "Cosmopedia",
44
  dclm: "DCLM",
45
  fw_edu_hq: "FineWeb-Edu-HQ",
 
65
  <HtmlEmbed
66
  id="dissecting-baselines"
67
  src="d3-benchmark-comparison.html"
68
+ desc="Individual prompt performance from existing synthetic datasets compared to the DCLM baseline."
69
  config={{
70
+ datasets: {
71
+ "mix-fw_edu_hq-diverse_qa_pairs_1b_hq": { display: "Diverse QA Pairs", color: "#c5e384" },
72
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
73
+ "mix-fw_edu_hq-extract_knowledge_1b_hq": { display: "Extract Knowledge", color: "#3d6b00" },
74
+ "mix-fw_edu_hq-guided_rewrite_original_1b_hq": { display: "Guided Rewrite", color: "#6aabff" },
75
+ nemotron_hq_synth: { display: "Nemotron-HQ-Synth", color: "#76b900", shaded: true },
76
+ rewire: { display: "REWIRE", color: "#1877F2", shaded: true },
77
+ "mix-fw_edu_hq-distill_1b_hq": { display: "Distill", color: "#a0c95c" },
78
+ "mix-fw_edu_hq-wikipedia_style_rephrasing_1b_hq": { display: "Wikipedia Rephrasing", color: "#7fb034" },
79
+ "mix-fw_edu_hq-knowledge_list_1b_hq": { display: "Knowledge List", color: "#5e960e" },
80
+ "mix-fw_edu_hq-continue_1b_hq": { display: "Continue", color: "#e8713a" },
81
+ "mix-fw_edu_hq-summarize_1b_hq": { display: "Summarize", color: "#c4451c" }
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  }
83
  }}
84
  />
 
87
 
88
  ### Can New Prompts Beat DCLM?
89
 
90
+ Since most existing prompts fail to beat DCLM, we designed seven novel prompt formats targeting different skills ([math](#math), [table](#table), [faq](#faq), [tutorial](#tutorial), [article](#article), [commentary](#commentary), [discussion](#discussion)), all using Gemma-3-1B on FineWeb-Edu-HQ. Four prompts ([math](#math), [table](#table), [faq](#faq), [tutorial](#tutorial)) outperform DCLM, while [article](#article), [commentary](#commentary), and [discussion](#discussion) are at or below DCLM level (see <FigRef target="new-prompts" />). The best-performing prompts all restructure the source content into pedagogically rich formats.
91
 
92
  <HtmlEmbed
93
  id="new-prompts"
94
  src="d3-benchmark-comparison.html"
95
+ desc="Seven new prompts compared against the DCLM baseline."
96
  config={{
97
+ datasets: {
98
  "mix-fw_edu_hq-math_1b_hq": "Math",
99
  "mix-fw_edu_hq-table_1b_hq": "Table",
100
  "mix-fw_edu_hq-faq_1b_hq": "FAQ",
 
102
  "mix-fw_edu_hq-article_1b_hq": "Article",
103
  "mix-fw_edu_hq-commentary_1b_hq": "Commentary",
104
  "mix-fw_edu_hq-discussion_1b_hq": "Discussion",
105
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
106
  }
107
  }}
108
  />
 
133
  config={{
134
  setups: {
135
  "Gemma-3: Tutorial": {
136
+ datasets: {
137
  "mix-fw_edu_hq-tutorial_27b_hq": "Gemma-3 27B",
138
  "mix-fw_edu_hq-tutorial_12b_hq": "Gemma-3 12B",
139
  "mix-fw_edu_hq-tutorial_4b_hq": "Gemma-3 4B",
140
  "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3 1B",
141
  "mix-fw_edu_hq-tutorial_270m_hq": "Gemma-3 270M",
142
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
143
  }
144
  },
145
  "Gemma-3: Math": {
146
+ datasets: {
147
  "mix-fw_edu_hq-math_27b_hq": "Gemma-3 27B",
148
  "mix-fw_edu_hq-math_12b_hq": "Gemma-3 12B",
149
  "mix-fw_edu_hq-math_4b_hq": "Gemma-3 4B",
150
  "mix-fw_edu_hq-math_1b_hq": "Gemma-3 1B",
151
  "mix-fw_edu_hq-math_270m_hq": "Gemma-3 270M",
152
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
153
  }
154
  },
155
  "Gemma-3: REWIRE": {
156
+ datasets: {
157
  "mix-fw_edu_hq-guided_rewrite_original_27b_hq": "Gemma-3 27B",
158
  "mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Gemma-3 12B",
159
  "mix-fw_edu_hq-guided_rewrite_original_4b_hq": "Gemma-3 4B",
160
  "mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Gemma-3 1B",
161
  "mix-fw_edu_hq-guided_rewrite_original_270m_hq": "Gemma-3 270M",
162
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
163
  }
164
  },
165
  "SmolLM2: Tutorial": {
166
+ datasets: {
167
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2 1.7B",
168
  "mix-fw_edu_hq-tutorial_smollm2_360m_hq": "SmolLM2 360M",
169
  "mix-fw_edu_hq-tutorial_smollm2_135m_hq": "SmolLM2 135M",
170
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
171
  }
172
  }
173
  }
 
187
  config={{
188
  setups: {
189
  "Continue Prompt": {
190
+ datasets: {
191
  "mix-fw_edu_hq-continue_12b_hq": "12B, HQ Source",
192
  "mix-fw_edu_hq-continue_1b_hq": "1B, HQ Source",
193
  "mix-fw_edu_hq-continue_1b_lq": "1B, LQ Source",
194
  "mix-fw_edu_hq-continue_12b_lq": "12B, LQ Source",
195
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
196
  }
197
  },
198
  "Summarize Prompt": {
199
+ datasets: {
200
  "mix-fw_edu_hq-summarize_1b_hq": "1B, HQ Source",
201
  "mix-fw_edu_hq-summarize_12b_hq": "12B, HQ Source",
202
  "mix-fw_edu_hq-summarize_1b_lq": "1B, LQ Source",
203
  "mix-fw_edu_hq-summarize_12b_lq": "12B, LQ Source",
204
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
205
  }
206
  },
207
  "Tutorial Prompt": {
208
+ datasets: {
209
  "mix-fw_edu_hq-tutorial_1b_hq": "1B, HQ Source",
210
  "mix-fw_edu_hq-tutorial_12b_hq": "12B, HQ Source",
211
  "mix-fw_edu_hq-tutorial_12b_lq": "12B, LQ Source",
212
  "mix-fw_edu_hq-tutorial_1b_lq": "1B, LQ Source",
213
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
214
  }
215
  },
216
  "FAQ Prompt": {
217
+ datasets: {
218
  "mix-fw_edu_hq-faq_1b_hq": "1B, HQ Source",
219
  "mix-fw_edu_hq-faq_1b_lq": "1B, LQ Source",
220
  "mix-fw_edu_hq-faq_12b_hq": "12B, HQ Source",
221
  "mix-fw_edu_hq-faq_12b_lq": "12B, LQ Source",
222
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
223
  }
224
  }
225
  }
 
243
  config={{
244
  setups: {
245
  "Tutorial Prompt": {
246
+ datasets: {
247
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
248
  "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Falcon3",
249
  "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3",
250
  "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3",
251
  "mix-fw_edu_hq-tutorial_granite3_1b_hq": "Granite3",
252
  "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
253
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
254
  }
255
  },
256
  "FAQ Prompt": {
257
+ datasets: {
258
  "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "SmolLM2",
259
  "mix-fw_edu_hq-faq_llama3.2_1b_hq": "Llama-3.2",
260
  "mix-fw_edu_hq-faq_falcon3_1b_hq": "Falcon3",
261
  "mix-fw_edu_hq-faq_1b_hq": "Gemma-3",
262
  "mix-fw_edu_hq-faq_granite3_1b_hq": "Granite3",
263
  "mix-fw_edu_hq-faq_qwen3_1.7b_hq": "Qwen3",
264
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
265
  }
266
  },
267
  "Table Prompt": {
268
+ datasets: {
269
  "mix-fw_edu_hq-table_smollm2_1.7b_hq": "SmolLM2",
270
  "mix-fw_edu_hq-table_falcon3_1b_hq": "Falcon3",
271
  "mix-fw_edu_hq-table_granite3_1b_hq": "Granite3",
272
  "mix-fw_edu_hq-table_qwen3_1.7b_hq": "Qwen3",
273
  "mix-fw_edu_hq-table_llama3.2_1b_hq": "Llama-3.2",
274
  "mix-fw_edu_hq-table_1b_hq": "Gemma-3",
275
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
276
  }
277
  },
278
  "Math Prompt": {
279
+ datasets: {
280
  "mix-fw_edu_hq-math_smollm2_1.7b_hq": "SmolLM2",
281
  "mix-fw_edu_hq-math_falcon3_1b_hq": "Falcon3",
282
  "mix-fw_edu_hq-math_granite3_1b_hq": "Granite3",
283
  "mix-fw_edu_hq-math_1b_hq": "Gemma-3",
284
  "mix-fw_edu_hq-math_llama3.2_1b_hq": "Llama-3.2",
285
  "mix-fw_edu_hq-math_qwen3_1.7b_hq": "Qwen3",
286
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
287
  }
288
  }
289
  }
 
301
  src="d3-benchmark-comparison.html"
302
  desc="Qwen model generations (1.5 to 3) on the tutorial prompt."
303
  config={{
304
+ datasets: {
305
  "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3 (1.7B)",
306
  "mix-fw_edu_hq-tutorial_qwen2.5_1.5b_hq": "Qwen2.5 (1.5B)",
307
  "mix-fw_edu_hq-tutorial_qwen2_1.5b_hq": "Qwen2 (1.5B)",
308
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
309
+ "mix-fw_edu_hq-tutorial_qwen1.5_1.8b_hq": "Qwen1.5 (1.8B)"
 
310
  }
311
  }}
312
  />
 
326
 
327
  #### Is synthetic data enough?
328
 
329
+ We compare synthetic-only training vs mixed training (synthetic + source) for [tutorial](#tutorial) and [faq](#faq) prompts on DCLM and FineWeb-Edu-HQ sources. Synthetic-only training falls short of both DCLM and mixed training (see <FigRef target="synthetic-only" />). Mixed training consistently improves over both the synthetic-only and original-data-only baselines.
330
 
331
  <HtmlEmbed
332
  id="synthetic-only"
 
335
  config={{
336
  setups: {
337
  "DCLM Source": {
338
+ datasets: {
339
  "mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
340
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
341
  "mix-dclm-tutorial_1b_dclm": "Mix: Tutorial + DCLM",
342
  faq_1b_dclm: "FAQ Only",
343
+ tutorial_1b_dclm: "Tutorial Only"
 
344
  }
345
  },
346
  "FineWeb-Edu-HQ Source": {
347
+ datasets: {
348
  "mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FineWeb-Edu-HQ",
349
  "mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FineWeb-Edu-HQ",
350
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
351
  faq_1b_hq: "FAQ Only",
352
+ tutorial_1b_hq: "Tutorial Only"
 
353
  }
354
  }
355
  }
 
360
 
361
  #### Does the mix-in dataset matter?
362
 
363
+ We apply the [tutorial](#tutorial) prompt using Gemma-3-1B on FineWeb-Edu-HQ, then mix in one of four datasets: DCLM, Cosmopedia, FineWeb-Edu-HQ, or FineWeb-Edu-LQ. Use the Setup dropdown to also see results with LQ source data. DCLM outperforms other mix-in datasets. Adding synthetic data improves performance for all mix-in datasets, with the effect especially pronounced for the weaker ones (see <FigRef target="mixin-dataset" />). The mix-in dataset is a major performance driver, sometimes more important than the synthetic data itself.
364
 
365
  <HtmlEmbed
366
  id="mixin-dataset"
 
369
  config={{
370
  setups: {
371
  "HQ Source": {
372
+ datasets: {
373
+ "mix-dclm-tutorial_1b_hq": { display: "Mix-in: DCLM", color: "#4e79a7" },
374
+ "mix-fw_edu_hq-tutorial_1b_hq": { display: "Mix-in: FineWeb-Edu-HQ", color: "#59a14f" },
375
+ dclm: { display: "DCLM", color: "#4e79a7", shaded: true },
376
+ "mix-fw_edu_lq-tutorial_1b_hq": { display: "Mix-in: FineWeb-Edu-LQ", color: "#e15759" },
377
+ "mix-cosmopedia-tutorial_1b_hq": { display: "Mix-in: Cosmopedia", color: "#f28e2b" },
378
+ cosmopedia: { display: "Cosmopedia", color: "#f28e2b", shaded: true },
379
+ fw_edu_hq: { display: "FineWeb-Edu-HQ", color: "#59a14f", shaded: true },
380
+ fw_edu_lq: { display: "FineWeb-Edu-LQ", color: "#e15759", shaded: true }
381
  }
382
  },
383
  "LQ Source": {
384
+ datasets: {
385
+ dclm: { display: "DCLM", color: "#4e79a7", shaded: true },
386
+ "mix-fw_edu_hq-tutorial_1b_lq": { display: "Mix-in: FineWeb-Edu-HQ", color: "#59a14f" },
387
+ "mix-dclm-tutorial_1b_lq": { display: "Mix-in: DCLM", color: "#4e79a7" },
388
+ "mix-cosmopedia-tutorial_1b_lq": { display: "Mix-in: Cosmopedia", color: "#f28e2b" },
389
+ cosmopedia: { display: "Cosmopedia", color: "#f28e2b", shaded: true },
390
+ "mix-fw_edu_lq-tutorial_1b_lq": { display: "Mix-in: FineWeb-Edu-LQ", color: "#e15759" },
391
+ fw_edu_hq: { display: "FineWeb-Edu-HQ", color: "#59a14f", shaded: true },
392
+ fw_edu_lq: { display: "FineWeb-Edu-LQ", color: "#e15759", shaded: true }
393
  }
394
  }
395
+ }
 
396
  }}
397
  />
398
 
 
409
  config={{
410
  setups: {
411
  "Tutorial Prompt": {
412
+ datasets: {
413
  "mix-fw_edu_hq-tutorial_1b_hq": "Source: FineWeb-Edu-HQ",
414
  "mix-dclm-tutorial_1b_dclm": "Source: DCLM",
415
  "mix-cosmopedia-tutorial_1b_cosmopedia": "Source: Cosmopedia",
416
  "mix-fw_edu_lq-tutorial_1b_lq": "Source: FineWeb-Edu-LQ",
417
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
418
  }
419
  },
420
  "FAQ Prompt": {
421
+ datasets: {
422
  "mix-dclm-faq_1b_dclm": "Source: DCLM",
423
  "mix-fw_edu_hq-faq_1b_hq": "Source: FineWeb-Edu-HQ",
424
  "mix-fw_edu_lq-faq_1b_lq": "Source: FineWeb-Edu-LQ",
425
  "mix-cosmopedia-faq_1b_cosmopedia": "Source: Cosmopedia",
426
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
427
  }
428
  }
429
  }
 
437
  config={{
438
  setups: {
439
  "Tutorial Prompt": {
440
+ datasets: {
441
  "mix-fw_edu_hq-tutorial_1b_dclm": "Source: DCLM",
442
  "mix-fw_edu_hq-tutorial_1b_hq": "Source: FineWeb-Edu-HQ",
443
  "mix-fw_edu_hq-tutorial_1b_cosmopedia": "Source: Cosmopedia",
444
  "mix-fw_edu_hq-tutorial_1b_lq": "Source: FineWeb-Edu-LQ",
445
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
446
  }
447
  },
448
  "FAQ Prompt": {
449
+ datasets: {
450
  "mix-fw_edu_hq-faq_1b_dclm": "Source: DCLM",
451
  "mix-fw_edu_hq-faq_1b_hq": "Source: FineWeb-Edu-HQ",
452
  "mix-fw_edu_hq-faq_1b_lq": "Source: FineWeb-Edu-LQ",
453
  "mix-fw_edu_hq-faq_1b_cosmopedia": "Source: Cosmopedia",
454
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
455
  }
456
  }
457
  }
 
475
  config={{
476
  setups: {
477
  "Mixing Prompts": {
478
+ datasets: {
479
  "mix-fw_edu_hq-tutorial_1b_hq-fw_edu_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts + FineWeb-Edu-HQ",
480
  "mix-fw_edu_hq-math_1b_hq": "Math + FineWeb-Edu-HQ",
481
  "mix-tutorial_1b_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts (No Source)",
482
  "mix-fw_edu_hq-table_1b_hq": "Table + FineWeb-Edu-HQ",
483
  "mix-fw_edu_hq-faq_1b_hq": "FAQ + FineWeb-Edu-HQ",
484
  "mix-fw_edu_hq-tutorial_1b_hq": "Tutorial + FineWeb-Edu-HQ",
485
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
486
  }
487
  },
488
  "Mixing Models": {
489
+ datasets: {
490
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
491
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "SmolLM2 + Falcon3",
492
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_llama3.2_1b_hq": "SmolLM2 + Llama-3.2",
493
  "mix-fw_edu_hq-tutorial_llama3.2_1b_hq-tutorial_granite3_1b_hq": "Llama-3.2 + Granite3",
494
  "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
495
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
496
  }
497
  },
498
  "Mixing Both": {
499
+ datasets: {
500
  "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "FAQ (SmolLM2)",
501
  "mix-fw_edu_hq-faq_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "FAQ (SmolLM2) + Tutorial (Falcon3)",
502
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "Tutorial (SmolLM2)",
503
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "Tutorial (SmolLM2) + Tutorial (Falcon3)",
504
  "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Tutorial (Falcon3)",
505
  "mix-fw_edu_hq-faq_falcon3_1b_hq": "FAQ (Falcon3)",
506
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
 
507
  }
508
  }
509
  }
 
529
  src="d3-benchmark-comparison.html"
530
  desc="REWIRE prompt with original typos vs improved version at 1B and 12B scale."
531
  config={{
532
+ datasets: {
533
  "mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Original (12B)",
534
  "mix-fw_edu_hq-guided_rewrite_improved_12b_hq": "Improved (12B)",
535
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
536
  "mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Original (1B)",
537
+ "mix-fw_edu_hq-guided_rewrite_improved_1b_hq": "Improved (1B)"
 
538
  }
539
  }}
540
  />
 
569
  A: No. Typos have no negative effect on downstream performance.
570
 
571
  The bottom line: the details of synthetic rephrasing matter a lot, and knowing which ones matter is the key to scaling it up. Prompt design is the single biggest lever, with structured formats like Math, Table, FAQ, and Tutorial consistently beating curated baselines. But equally important is knowing where you can cut corners without losing quality. You don't need a large rephrasing model (1B is enough for simple prompts, 4B for complex ones). You don't need pristine source data (even low-quality sources work with a strong mix-in). Smaller models generate faster, directly translating into higher throughput. And tolerating lower-quality sources opens up a much bigger and more diverse data pool to draw from. The practical recipe is straightforward: pick a strong structured prompt, use the smallest model that handles it, blend with high-quality original data, and spend your remaining compute on volume.
 
app/src/content/chapters/introduction.mdx CHANGED
@@ -44,11 +44,9 @@ Here's a preview of where we end up: FinePhrase, our best configuration, clearly
44
  desc="FinePhrase compared against synthetic data baselines across evaluation metrics."
45
  config={{
46
  defaultView: "line",
47
- pinnedColors: { "FinePhrase": "#EBA937" },
48
- baselines: ["cosmopedia", "nemotron_hq_synth", "rewire", "synth_query_reasoning_answer"],
49
- datasetNames: {
50
  cosmopedia: "Cosmopedia",
51
- "mix-fw_edu_hq-table_smollm2_1.7b_hq": "FinePhrase",
52
  nemotron_hq_synth: "Nemotron-HQ-Synth",
53
  rewire: "REWIRE",
54
  synth_query_reasoning_answer: "SYNTH"
 
44
  desc="FinePhrase compared against synthetic data baselines across evaluation metrics."
45
  config={{
46
  defaultView: "line",
47
+ datasets: {
 
 
48
  cosmopedia: "Cosmopedia",
49
+ "mix-fw_edu_hq-table_smollm2_1.7b_hq": { display: "FinePhrase", color: "#EBA937" },
50
  nemotron_hq_synth: "Nemotron-HQ-Synth",
51
  rewire: "REWIRE",
52
  synth_query_reasoning_answer: "SYNTH"
app/src/content/embeds/d3-benchmark-comparison.html CHANGED
@@ -3,29 +3,38 @@
3
 
4
  Configuration via data-config attribute:
5
  {
6
- "datasetNames": { "raw_name": "Display Name", ... }, // required (unless using setups)
7
- "setups": { "Setup Label": { "datasetNames": {...} }, ... }, // optional, multi-setup mode with dropdown + average
8
- "pinnedColors": { "DCLM": "#333", "FineWeb-Edu (HQ)": "#86a1a9" }, // optional
9
- "baselines": ["dclm", "fw_edu_hq"], // optional, raw keys for baseline datasets (dashed lines, striped bars). Default: ["dclm", "fw_edu_hq"]
10
- "defaultMetric": "agg_score_macro", // optional, default: "agg_score_macro"
11
- "defaultView": "bar", // optional, "bar" | "line", default: "bar"
12
- "tokensPerStep": 2100000, // optional, default: 2.1e6
13
- "runColumn": "runname", // optional, CSV column for series, default: "runname"
14
- "stepColumn": "steps" // optional, CSV column for x-axis, default: "steps"
 
 
15
  }
16
 
 
 
 
 
 
 
 
17
  Data: uses benchmark-results.csv by default (one CSV with all runs).
18
- Only rows matching keys in datasetNames are displayed.
19
 
20
  Example usage in MDX:
21
  <HtmlEmbed
22
  src="d3-benchmark-comparison.html"
23
  title="Baseline Comparison"
24
  config={{
25
- datasetNames: {
26
  cosmopedia: "Cosmopedia",
27
- dclm: "DCLM",
28
- fw_edu_hq: "FineWeb-Edu (HQ)"
29
  }
30
  }}
31
  />
@@ -107,12 +116,9 @@
107
  .d3-benchmark-comparison .bar.ghost { opacity: .25; }
108
  .d3-benchmark-comparison .value-label.ghost { opacity: .25; }
109
  .d3-benchmark-comparison .line-path { fill: none; stroke-width: 2; opacity: 0.85; }
110
- .d3-benchmark-comparison .line-path.baseline { stroke-dasharray: 6,4; opacity: 0.5; }
111
- .d3-benchmark-comparison .line-path.baseline.ghost { opacity: .1; }
112
  .d3-benchmark-comparison .line-path.ghost { opacity: .15; }
113
- .d3-benchmark-comparison .line-dot.baseline { opacity: 0.5; }
114
- .d3-benchmark-comparison .line-dot.baseline.ghost { opacity: .1; }
115
  .d3-benchmark-comparison .line-dot.ghost { opacity: .15; }
 
116
  .d3-benchmark-comparison .axes path { display: none; }
117
  .d3-benchmark-comparison .axes line { stroke: var(--axis-color); }
118
  .d3-benchmark-comparison .axes text { fill: var(--tick-color); }
@@ -183,14 +189,24 @@
183
  if (raw && raw.trim()) cfg = raw.trim().startsWith('{') ? JSON.parse(raw) : {};
184
  } catch (_) {}
185
 
186
- // Configurable settings with defaults
 
 
 
 
 
 
 
 
 
 
187
  // ─── SETUP SUPPORT ───
188
  const SETUPS = cfg.setups || null;
189
  const setupNames = SETUPS ? Object.keys(SETUPS) : [];
190
  let currentSetup = SETUPS ? setupNames[0] : null;
191
- let DATASET_NAMES = SETUPS ? { ...SETUPS[setupNames[0]].datasetNames } : (cfg.datasetNames || {});
192
  const AVG_SETUP_KEY = 'Average (all setups)';
193
- let avgDatasetNames = {};
194
  let parsedData = [];
195
 
196
  const RUN_COL = cfg.runColumn || 'runname';
@@ -198,21 +214,15 @@
198
  const TOKENS_PER_STEP = cfg.tokensPerStep || 2.1e6;
199
  const defaultMetric = cfg.defaultMetric || 'agg_score_macro';
200
  const defaultView = cfg.defaultView || 'bar';
201
- // Stable baseline colors, merged with per-chart overrides
202
- const PINNED_COLORS = Object.assign({ 'DCLM': '#8b8b8b', 'FineWeb-Edu (HQ)': '#86a1a9' }, cfg.pinnedColors || {});
203
- // Unique ID suffix for multiple instances on same page
204
  const uid = Math.random().toString(36).slice(2, 8);
205
 
206
- // Baseline datasets: dashed lines, striped bars, reduced opacity
207
- const BASELINES = new Set(cfg.baselines || ['dclm', 'fw_edu_hq']);
208
- function isBaseline(raw) { return BASELINES.has(raw); }
 
 
209
  function stripePatternId(raw) { return 'stripe-' + uid + '-' + raw.replace(/[^a-zA-Z0-9]/g, '_'); }
210
- function barFill(d) {
211
- if (isBaseline(d.rawName)) return `url(#${stripePatternId(d.rawName)})`;
212
- return colorMap[d.rawName] || 'var(--primary-color)';
213
- }
214
 
215
- // Standard metric display names (shared across all CSVs from this benchmark suite)
216
  const METRIC_NAMES = {
217
  'agg_score_macro': 'Aggregate Score (Macro)',
218
  'agg_score_micro': 'Aggregate Score (Micro)',
@@ -251,14 +261,13 @@
251
 
252
  // State
253
  let allData = [];
254
- let metricKeys = []; // auto-detected from CSV columns
255
  let currentMetric = defaultMetric;
256
  let currentView = defaultView;
257
  let colorMap = {};
258
  let highlight = null;
259
 
260
  // ─── HELPERS ───
261
- function displayName(raw) { return DATASET_NAMES[raw] || raw; }
262
  function metricName(key) { return METRIC_NAMES[key] || key; }
263
 
264
  function stepsToTokens(step) { return step * TOKENS_PER_STEP; }
@@ -282,49 +291,46 @@
282
  function initColors() {
283
  if (Object.keys(colorMap).length) return;
284
  const allRaw = Array.from(d3.group(allData, d => d[RUN_COL]).keys()).sort();
285
- // Assign pinned colors first (keyed by display name)
286
  const unpinned = [];
287
  allRaw.forEach(raw => {
288
- const name = displayName(raw);
289
- if (PINNED_COLORS[name]) { colorMap[raw] = PINNED_COLORS[name]; }
290
  else { unpinned.push(raw); }
291
  });
292
- // Fill remaining from categorical palette
293
  const palette = getCategoricalColors(unpinned.length);
294
  unpinned.forEach((raw, i) => { colorMap[raw] = palette[i % palette.length]; });
295
  }
296
 
297
  // ─── SETUP HELPERS ───
298
  function filterData() {
299
- const knownNames = Object.keys(DATASET_NAMES);
300
  allData = knownNames.length ? parsedData.filter(r => knownNames.includes(r[RUN_COL])) : parsedData;
301
  allData.columns = parsedData.columns;
302
  }
303
 
304
  function computeAverageData(rawData) {
305
- if (!SETUPS || setupNames.length < 2) return { data: [], datasetNames: {} };
306
- // Build mapping: displayName -> [rawName1, rawName2, ...]
307
  const displayToRaws = {};
308
  for (const sName of setupNames) {
309
- const dn = SETUPS[sName].datasetNames;
310
- for (const [raw, display] of Object.entries(dn)) {
311
- if (!displayToRaws[display]) displayToRaws[display] = [];
312
- displayToRaws[display].push(raw);
313
  }
314
  }
315
- // Only average display names that appear in ALL setups
316
  const fullDisplay = Object.entries(displayToRaws)
317
  .filter(([, raws]) => raws.length >= setupNames.length);
318
- // Index raw data by runname+step for fast lookup
319
  const byRunStep = {};
320
  for (const row of rawData) byRunStep[row[RUN_COL] + '|' + row[STEP_COL]] = row;
321
  const steps = Array.from(new Set(rawData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
322
  const cols = rawData.columns || Object.keys(rawData[0] || {});
323
  const result = [];
324
- const dnMap = {};
325
  for (const [display, raws] of fullDisplay) {
326
  const avgRaw = '__avg__' + display.replace(/[^a-zA-Z0-9]/g, '_');
327
- dnMap[avgRaw] = display;
 
 
328
  for (const step of steps) {
329
  const rows = raws.map(r => byRunStep[r + '|' + step]).filter(Boolean);
330
  if (!rows.length) continue;
@@ -337,26 +343,23 @@
337
  result.push(avgRow);
338
  }
339
  }
340
- return { data: result, datasetNames: dnMap };
341
  }
342
 
343
  function switchSetup(name) {
344
  currentSetup = name;
345
  if (name === AVG_SETUP_KEY) {
346
- DATASET_NAMES = { ...avgDatasetNames };
347
  } else {
348
- DATASET_NAMES = { ...SETUPS[name].datasetNames };
349
  }
350
- // Re-add baselines that may be shared across setups
351
- const baselineNames = cfg.baselines || ['dclm', 'fw_edu_hq'];
352
- for (const bRaw of baselineNames) {
353
- if (parsedData.some(r => r[RUN_COL] === bRaw) && !DATASET_NAMES[bRaw]) {
354
- // Find display name from any setup or use raw
355
- let bDisplay = bRaw;
356
- for (const sName of setupNames) {
357
- if (SETUPS[sName].datasetNames[bRaw]) { bDisplay = SETUPS[sName].datasetNames[bRaw]; break; }
358
  }
359
- DATASET_NAMES[bRaw] = bDisplay;
360
  }
361
  }
362
  colorMap = {};
@@ -384,6 +387,10 @@
384
  gRoot.selectAll('text.value-label').classed('ghost', d => highlight && d.name !== highlight);
385
  gRoot.selectAll('.line-path').classed('ghost', d => highlight && d.name !== highlight);
386
  gRoot.selectAll('.line-dot').classed('ghost', d => highlight && d.name !== highlight);
 
 
 
 
387
  container.querySelectorAll('.legend .item').forEach(el => {
388
  el.classList.toggle('ghost', highlight && el.getAttribute('data-name') !== highlight);
389
  });
@@ -392,7 +399,6 @@
392
  // ─── AUTO-DETECT METRICS from CSV columns ───
393
  function detectMetrics(columns) {
394
  const skip = new Set([RUN_COL, STEP_COL, 'seed']);
395
- // Ordered: aggregate first, then individual
396
  const aggOrder = ['agg_score_macro', 'agg_score_micro', 'agg_score_RC', 'agg_score_GK', 'agg_score_NLU', 'agg_score_MATH', 'agg_score_TABLE', 'agg_score_RES'];
397
  const agg = aggOrder.filter(k => columns.includes(k));
398
  const ind = columns.filter(k => !skip.has(k) && !agg.includes(k) && !isNaN(+allData[0][k]));
@@ -402,7 +408,8 @@
402
  // ─── BAR CHART ───
403
  function renderBar() {
404
  const width = container.clientWidth || 800;
405
- const margin = { top: 12, right: 56, bottom: 32, left: 190 };
 
406
 
407
  const grouped = d3.group(allData, d => d[RUN_COL]);
408
  const finalData = [];
@@ -413,8 +420,11 @@
413
  }
414
  finalData.sort((a, b) => b.value - a.value);
415
 
 
 
 
416
  const barHeight = 28, barGap = 8;
417
- const height = margin.top + margin.bottom + finalData.length * (barHeight + barGap);
418
  svg.attr('width', width).attr('height', height);
419
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
420
 
@@ -422,7 +432,7 @@
422
  const innerHeight = height - margin.top - margin.bottom;
423
 
424
  const x = d3.scaleLinear().domain([0, d3.max(finalData, d => d.value) * 1.05]).range([0, innerWidth]);
425
- const y = d3.scaleBand().domain(finalData.map(d => d.name)).range([0, innerHeight]).padding(0.2);
426
 
427
  // Grid
428
  gRoot.selectAll('.grid').data([0]).join('g').attr('class', 'grid').call(g => {
@@ -447,10 +457,9 @@
447
  g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
448
  });
449
 
450
- // Bars
451
- // Stripe patterns for baseline bars
452
- finalData.forEach(d => {
453
- if (!isBaseline(d.rawName)) return;
454
  const c = colorMap[d.rawName] || '#999';
455
  const pat = defs.append('pattern').attr('id', stripePatternId(d.rawName))
456
  .attr('width', 6).attr('height', 6).attr('patternUnits', 'userSpaceOnUse').attr('patternTransform', 'rotate(45)');
@@ -458,11 +467,17 @@
458
  pat.append('line').attr('x1', 0).attr('y1', 0).attr('x2', 0).attr('y2', 6).attr('stroke', c).attr('stroke-width', 2.5);
459
  });
460
 
 
 
 
 
 
 
461
  const barTip = (ev, d) => {
462
  const [mx, my] = d3.pointer(ev, container);
463
- showTip(`<strong>${d.name}</strong><br/>${metricName(currentMetric)}: <strong>${d.value.toFixed(4)}</strong>`, mx, my);
464
  };
465
- gRoot.selectAll('rect.bar').data(finalData, d => d.name).join(
466
  enter => enter.append('rect').attr('class', 'bar')
467
  .attr('x', 0).attr('y', d => y(d.name)).attr('height', y.bandwidth()).attr('rx', 3)
468
  .attr('fill', d => barFill(d))
@@ -483,14 +498,39 @@
483
  );
484
 
485
  // Value labels
486
- gRoot.selectAll('text.value-label').data(finalData, d => d.name).join(
487
  enter => enter.append('text').attr('class', 'value-label')
488
  .attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
489
  .attr('dy', '0.35em').attr('fill', 'var(--text-color)').attr('font-size', 11)
490
- .text(d => d.value.toFixed(4)),
491
  update => update.transition().duration(300)
492
  .attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
493
- .text(d => d.value.toFixed(4)),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
  exit => exit.remove()
495
  );
496
  }
@@ -498,6 +538,7 @@
498
  // ─── LINE CHART ───
499
  function renderLine() {
500
  const width = container.clientWidth || 800;
 
501
  const margin = { top: 16, right: 50, bottom: 48, left: 60 };
502
  const height = Math.max(300, Math.round(width / 2.5));
503
  svg.attr('width', width).attr('height', height);
@@ -509,13 +550,20 @@
509
  // Build series
510
  const grouped = d3.group(allData, d => d[RUN_COL]);
511
  const series = [];
 
512
  for (const [raw, rows] of grouped) {
513
  const pts = rows.map(r => ({ step: +r[STEP_COL], value: +r[currentMetric] })).sort((a, b) => a.step - b.step);
514
- series.push({ name: displayName(raw), rawName: raw, values: pts });
 
 
 
 
 
 
515
  }
516
 
517
- const allSteps = Array.from(new Set(allData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
518
- const allValues = series.flatMap(s => s.values.map(v => v.value));
519
 
520
  const x = d3.scaleLinear().domain(d3.extent(allSteps)).range([0, innerWidth]);
521
  const yMin = d3.min(allValues), yMax = d3.max(allValues), yPad = (yMax - yMin) * 0.08;
@@ -555,28 +603,52 @@
555
  .attr('text-anchor', 'middle').attr('fill', 'var(--text-color)').attr('font-size', 12)
556
  .text(metricName(currentMetric));
557
 
558
- // Lines
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
  const line = d3.line().x(d => x(d.step)).y(d => y(d.value)).curve(d3.curveMonotoneX);
560
  gRoot.selectAll('.line-path').data(series, d => d.name).join(
561
- enter => enter.append('path').attr('class', d => 'line-path' + (isBaseline(d.rawName) ? ' baseline' : ''))
562
  .attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
563
  .attr('d', d => line(d.values)),
564
- update => update.attr('class', d => 'line-path' + (isBaseline(d.rawName) ? ' baseline' : ''))
565
- .transition().duration(300)
566
  .attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
567
  .attr('d', d => line(d.values)),
568
  exit => exit.remove()
569
  );
570
 
571
- // Dots
572
  const dotData = series.flatMap(s => s.values.map(v => ({ name: s.name, rawName: s.rawName, step: v.step, value: v.value })));
573
  gRoot.selectAll('.line-dot').data(dotData, d => d.name + '-' + d.step).join(
574
- enter => enter.append('circle').attr('class', d => 'line-dot' + (isBaseline(d.rawName) ? ' baseline' : ''))
575
  .attr('cx', d => x(d.step)).attr('cy', d => y(d.value)).attr('r', 3)
576
  .attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)')
577
  .attr('stroke', 'var(--surface-bg)').attr('stroke-width', 1),
578
- update => update.attr('class', d => 'line-dot' + (isBaseline(d.rawName) ? ' baseline' : ''))
579
- .transition().duration(300)
580
  .attr('cx', d => x(d.step)).attr('cy', d => y(d.value))
581
  .attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)'),
582
  exit => exit.remove()
@@ -597,11 +669,15 @@
597
  const entries = series.map(s => {
598
  const pt = s.values.find(v => v.step === nearest);
599
  return pt ? { name: s.name, rawName: s.rawName, value: pt.value } : null;
600
- }).filter(Boolean).sort((a, b) => b.value - a.value);
 
 
 
 
601
 
602
  let html = `<div style="font-weight:700;margin-bottom:4px;">${stepLabelLong(nearest)}</div>`;
603
  entries.forEach(e => {
604
- html += `<div><span class="tip-dot" style="background:${colorMap[e.rawName]}"></span>${e.name}: <strong>${e.value.toFixed(4)}</strong></div>`;
605
  });
606
  const [cx, cy] = d3.pointer(ev, container);
607
  showTip(html, cx, cy);
@@ -625,7 +701,6 @@
625
  function buildUI() {
626
  const controls = document.createElement('div'); controls.className = 'controls';
627
 
628
- // Setup selector (only shown when setups config is present)
629
  if (SETUPS && setupNames.length > 0) {
630
  const setupGroup = document.createElement('div'); setupGroup.className = 'control-group';
631
  const setupLabel = document.createElement('label'); setupLabel.setAttribute('for', 'setup-' + uid); setupLabel.textContent = 'Setup';
@@ -635,7 +710,6 @@
635
  if (name === currentSetup) opt.selected = true;
636
  setupSelect.appendChild(opt);
637
  });
638
- // Add Average option
639
  if (setupNames.length >= 2) {
640
  const avgOpt = document.createElement('option'); avgOpt.value = AVG_SETUP_KEY; avgOpt.textContent = AVG_SETUP_KEY;
641
  setupSelect.appendChild(avgOpt);
@@ -645,7 +719,6 @@
645
  controls.appendChild(setupGroup);
646
  }
647
 
648
- // View toggle
649
  const viewGroup = document.createElement('div'); viewGroup.className = 'control-group';
650
  const viewLabel = document.createElement('label'); viewLabel.setAttribute('for', 'view-' + uid); viewLabel.textContent = 'View';
651
  const viewSelect = document.createElement('select'); viewSelect.id = 'view-' + uid;
@@ -658,7 +731,6 @@
658
  viewGroup.appendChild(viewLabel); viewGroup.appendChild(viewSelect);
659
  controls.appendChild(viewGroup);
660
 
661
- // Metric select (populated after data load)
662
  const metricGroup = document.createElement('div'); metricGroup.className = 'control-group';
663
  const metricLabel = document.createElement('label'); metricLabel.setAttribute('for', 'metric-' + uid); metricLabel.textContent = 'Metric';
664
  const metricSelect = document.createElement('select'); metricSelect.id = 'metric-' + uid;
@@ -667,7 +739,6 @@
667
 
668
  container.appendChild(controls);
669
 
670
- // Legend
671
  const legend = document.createElement('div'); legend.className = 'legend';
672
  legend.innerHTML = '<div class="legend-title">Legend</div><div class="items"></div>';
673
  container.appendChild(legend);
@@ -693,7 +764,6 @@
693
  const items = container.querySelector('.legend .items');
694
  if (!items) return;
695
  items.innerHTML = '';
696
- // Sort by final score (max step) on current default metric, descending
697
  const grouped = d3.group(allData, d => d[RUN_COL]);
698
  const sorted = Array.from(grouped.entries())
699
  .map(([raw, rows]) => {
@@ -703,13 +773,17 @@
703
  })
704
  .sort((a, b) => b.score - a.score)
705
  .map(d => d.raw);
706
- sorted.forEach(raw => {
707
  const name = displayName(raw);
708
  const el = document.createElement('span'); el.className = 'item'; el.setAttribute('data-name', name);
709
  const sw = document.createElement('span'); sw.className = 'swatch';
710
- const swColor = colorMap[raw] || '#999';
711
- sw.style.background = swColor;
712
- if (isBaseline(raw)) sw.style.backgroundImage = 'repeating-linear-gradient(45deg, transparent, transparent 2px, rgba(255,255,255,0.4) 2px, rgba(255,255,255,0.4) 4px)';
 
 
 
 
713
  const txt = document.createElement('span'); txt.textContent = name;
714
  el.appendChild(sw); el.appendChild(txt); items.appendChild(el);
715
  el.addEventListener('mouseenter', () => { highlight = name; updateHighlight(); });
@@ -745,17 +819,14 @@
745
  const text = await fetchFirstAvailable(csvPaths);
746
  const parsed = d3.csvParse(text);
747
  parsedData = parsed;
748
- // Compute average data for setup mode
749
  if (SETUPS && setupNames.length >= 2) {
750
  const avg = computeAverageData(parsed);
751
- avgDatasetNames = avg.datasetNames;
752
  parsedData = parsed.concat(avg.data);
753
  parsedData.columns = parsed.columns;
754
  }
755
- // Filter to only datasets with configured display names
756
  filterData();
757
  metricKeys = detectMetrics(allData.columns);
758
- // Ensure defaultMetric is valid; fall back to first available
759
  if (!metricKeys.includes(currentMetric)) currentMetric = metricKeys[0];
760
  populateMetricSelect();
761
  render();
 
3
 
4
  Configuration via data-config attribute:
5
  {
6
+ "datasets": { // required (unless using setups)
7
+ "raw_name": "Display Name", // shorthand: string = display name
8
+ "raw_name": { "display": "Name", "color": "#hex", "shaded": true, "baseline": true }
9
+ // full form: display is required, rest optional
10
+ },
11
+ "setups": { "Setup Label": { "datasets": {...} }, ... }, // optional, multi-setup mode with dropdown + average
12
+ "defaultMetric": "agg_score_macro", // optional, default: "agg_score_macro"
13
+ "defaultView": "bar", // optional, "bar" | "line", default: "bar"
14
+ "tokensPerStep": 2100000, // optional, default: 2.1e6
15
+ "runColumn": "runname", // optional, CSV column for series, default: "runname"
16
+ "stepColumn": "steps" // optional, CSV column for x-axis, default: "steps"
17
  }
18
 
19
+ Per-dataset options (all optional except display):
20
+ display: Display name shown in legend, axes, and tooltips
21
+ color: Pinned hex color (otherwise auto-assigned from palette)
22
+ shaded: If true, bar gets a diagonal-stripe pattern (useful for aggregate baselines)
23
+ baseline: If true, rendered as a reference line (vertical in bar view, horizontal in line view)
24
+ instead of a regular bar/line. Not shown in the legend.
25
+
26
  Data: uses benchmark-results.csv by default (one CSV with all runs).
27
+ Only rows matching keys in datasets are displayed.
28
 
29
  Example usage in MDX:
30
  <HtmlEmbed
31
  src="d3-benchmark-comparison.html"
32
  title="Baseline Comparison"
33
  config={{
34
+ datasets: {
35
  cosmopedia: "Cosmopedia",
36
+ dclm: { display: "Baseline (DCLM)", baseline: true },
37
+ nemotron_hq_synth: { display: "Nemotron-HQ-Synth", color: "#76b900", shaded: true }
38
  }
39
  }}
40
  />
 
116
  .d3-benchmark-comparison .bar.ghost { opacity: .25; }
117
  .d3-benchmark-comparison .value-label.ghost { opacity: .25; }
118
  .d3-benchmark-comparison .line-path { fill: none; stroke-width: 2; opacity: 0.85; }
 
 
119
  .d3-benchmark-comparison .line-path.ghost { opacity: .15; }
 
 
120
  .d3-benchmark-comparison .line-dot.ghost { opacity: .15; }
121
+ .d3-benchmark-comparison .baseline.ghost { opacity: .1; }
122
  .d3-benchmark-comparison .axes path { display: none; }
123
  .d3-benchmark-comparison .axes line { stroke: var(--axis-color); }
124
  .d3-benchmark-comparison .axes text { fill: var(--tick-color); }
 
189
  if (raw && raw.trim()) cfg = raw.trim().startsWith('{') ? JSON.parse(raw) : {};
190
  } catch (_) {}
191
 
192
+ // ─── NORMALIZE DATASETS CONFIG ───
193
+ // Accepts: { "key": "Name" } or { "key": { display, color, shaded, baseline } }
194
+ // Returns: { key: { display, color, shaded, baseline } }
195
+ function normalizeDatasets(raw) {
196
+ const out = {};
197
+ for (const [k, v] of Object.entries(raw || {})) {
198
+ out[k] = typeof v === 'string' ? { display: v } : { ...v };
199
+ }
200
+ return out;
201
+ }
202
+
203
  // ─── SETUP SUPPORT ───
204
  const SETUPS = cfg.setups || null;
205
  const setupNames = SETUPS ? Object.keys(SETUPS) : [];
206
  let currentSetup = SETUPS ? setupNames[0] : null;
207
+ let DATASETS = SETUPS ? normalizeDatasets(SETUPS[setupNames[0]].datasets) : normalizeDatasets(cfg.datasets);
208
  const AVG_SETUP_KEY = 'Average (all setups)';
209
+ let avgDatasets = {};
210
  let parsedData = [];
211
 
212
  const RUN_COL = cfg.runColumn || 'runname';
 
214
  const TOKENS_PER_STEP = cfg.tokensPerStep || 2.1e6;
215
  const defaultMetric = cfg.defaultMetric || 'agg_score_macro';
216
  const defaultView = cfg.defaultView || 'bar';
 
 
 
217
  const uid = Math.random().toString(36).slice(2, 8);
218
 
219
+ // ─── DATASET ACCESSORS ───
220
+ function displayName(raw) { return DATASETS[raw] ? DATASETS[raw].display : raw; }
221
+ function isBaseline(raw) { return !!(DATASETS[raw] && DATASETS[raw].baseline); }
222
+ function isShaded(raw) { return !!(DATASETS[raw] && DATASETS[raw].shaded); }
223
+ function pinnedColor(raw) { return DATASETS[raw] && DATASETS[raw].color; }
224
  function stripePatternId(raw) { return 'stripe-' + uid + '-' + raw.replace(/[^a-zA-Z0-9]/g, '_'); }
 
 
 
 
225
 
 
226
  const METRIC_NAMES = {
227
  'agg_score_macro': 'Aggregate Score (Macro)',
228
  'agg_score_micro': 'Aggregate Score (Micro)',
 
261
 
262
  // State
263
  let allData = [];
264
+ let metricKeys = [];
265
  let currentMetric = defaultMetric;
266
  let currentView = defaultView;
267
  let colorMap = {};
268
  let highlight = null;
269
 
270
  // ─── HELPERS ───
 
271
  function metricName(key) { return METRIC_NAMES[key] || key; }
272
 
273
  function stepsToTokens(step) { return step * TOKENS_PER_STEP; }
 
291
  function initColors() {
292
  if (Object.keys(colorMap).length) return;
293
  const allRaw = Array.from(d3.group(allData, d => d[RUN_COL]).keys()).sort();
 
294
  const unpinned = [];
295
  allRaw.forEach(raw => {
296
+ const pc = pinnedColor(raw);
297
+ if (pc) { colorMap[raw] = pc; }
298
  else { unpinned.push(raw); }
299
  });
 
300
  const palette = getCategoricalColors(unpinned.length);
301
  unpinned.forEach((raw, i) => { colorMap[raw] = palette[i % palette.length]; });
302
  }
303
 
304
  // ─── SETUP HELPERS ───
305
  function filterData() {
306
+ const knownNames = Object.keys(DATASETS);
307
  allData = knownNames.length ? parsedData.filter(r => knownNames.includes(r[RUN_COL])) : parsedData;
308
  allData.columns = parsedData.columns;
309
  }
310
 
311
  function computeAverageData(rawData) {
312
+ if (!SETUPS || setupNames.length < 2) return { data: [], datasets: {} };
 
313
  const displayToRaws = {};
314
  for (const sName of setupNames) {
315
+ const ds = normalizeDatasets(SETUPS[sName].datasets);
316
+ for (const [raw, opts] of Object.entries(ds)) {
317
+ if (!displayToRaws[opts.display]) displayToRaws[opts.display] = [];
318
+ displayToRaws[opts.display].push(raw);
319
  }
320
  }
 
321
  const fullDisplay = Object.entries(displayToRaws)
322
  .filter(([, raws]) => raws.length >= setupNames.length);
 
323
  const byRunStep = {};
324
  for (const row of rawData) byRunStep[row[RUN_COL] + '|' + row[STEP_COL]] = row;
325
  const steps = Array.from(new Set(rawData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
326
  const cols = rawData.columns || Object.keys(rawData[0] || {});
327
  const result = [];
328
+ const dsMap = {};
329
  for (const [display, raws] of fullDisplay) {
330
  const avgRaw = '__avg__' + display.replace(/[^a-zA-Z0-9]/g, '_');
331
+ // Reuse options from the first setup's entry with this display name (only the first setup is consulted; falls back to {} if absent)
332
+ const firstOpts = Object.values(normalizeDatasets(SETUPS[setupNames[0]].datasets)).find(o => o.display === display) || {};
333
+ dsMap[avgRaw] = { display, ...firstOpts };
334
  for (const step of steps) {
335
  const rows = raws.map(r => byRunStep[r + '|' + step]).filter(Boolean);
336
  if (!rows.length) continue;
 
343
  result.push(avgRow);
344
  }
345
  }
346
+ return { data: result, datasets: dsMap };
347
  }
348
 
349
  function switchSetup(name) {
350
  currentSetup = name;
351
  if (name === AVG_SETUP_KEY) {
352
+ DATASETS = { ...avgDatasets };
353
  } else {
354
+ DATASETS = normalizeDatasets(SETUPS[name].datasets);
355
  }
356
+ // Re-add baselines from any setup
357
+ for (const sName of setupNames) {
358
+ const ds = normalizeDatasets(SETUPS[sName].datasets);
359
+ for (const [raw, opts] of Object.entries(ds)) {
360
+ if (opts.baseline && !DATASETS[raw] && parsedData.some(r => r[RUN_COL] === raw)) {
361
+ DATASETS[raw] = { ...opts };
 
 
362
  }
 
363
  }
364
  }
365
  colorMap = {};
 
387
  gRoot.selectAll('text.value-label').classed('ghost', d => highlight && d.name !== highlight);
388
  gRoot.selectAll('.line-path').classed('ghost', d => highlight && d.name !== highlight);
389
  gRoot.selectAll('.line-dot').classed('ghost', d => highlight && d.name !== highlight);
390
+ gRoot.selectAll('.baseline-vline').classed('ghost', d => highlight && d.name !== highlight);
391
+ gRoot.selectAll('.baseline-vlabel').classed('ghost', d => highlight && d.name !== highlight);
392
+ gRoot.selectAll('.baseline-hline').classed('ghost', d => highlight && d.name !== highlight);
393
+ gRoot.selectAll('.baseline-hlabel').classed('ghost', d => highlight && d.name !== highlight);
394
  container.querySelectorAll('.legend .item').forEach(el => {
395
  el.classList.toggle('ghost', highlight && el.getAttribute('data-name') !== highlight);
396
  });
 
399
  // ─── AUTO-DETECT METRICS from CSV columns ───
400
  function detectMetrics(columns) {
401
  const skip = new Set([RUN_COL, STEP_COL, 'seed']);
 
402
  const aggOrder = ['agg_score_macro', 'agg_score_micro', 'agg_score_RC', 'agg_score_GK', 'agg_score_NLU', 'agg_score_MATH', 'agg_score_TABLE', 'agg_score_RES'];
403
  const agg = aggOrder.filter(k => columns.includes(k));
404
  const ind = columns.filter(k => !skip.has(k) && !agg.includes(k) && !isNaN(+allData[0][k]));
 
408
  // ─── BAR CHART ───
409
  function renderBar() {
410
  const width = container.clientWidth || 800;
411
+ const hasBaselines = allData.some(r => isBaseline(r[RUN_COL]));
412
+ const margin = { top: hasBaselines ? 20 : 12, right: 56, bottom: 32, left: 190 };
413
 
414
  const grouped = d3.group(allData, d => d[RUN_COL]);
415
  const finalData = [];
 
420
  }
421
  finalData.sort((a, b) => b.value - a.value);
422
 
423
+ const barData = finalData.filter(d => !isBaseline(d.rawName));
424
+ const baselineData = finalData.filter(d => isBaseline(d.rawName));
425
+
426
  const barHeight = 28, barGap = 8;
427
+ const height = margin.top + margin.bottom + barData.length * (barHeight + barGap);
428
  svg.attr('width', width).attr('height', height);
429
  gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
430
 
 
432
  const innerHeight = height - margin.top - margin.bottom;
433
 
434
  const x = d3.scaleLinear().domain([0, d3.max(finalData, d => d.value) * 1.05]).range([0, innerWidth]);
435
+ const y = d3.scaleBand().domain(barData.map(d => d.name)).range([0, innerHeight]).padding(0.2);
436
 
437
  // Grid
438
  gRoot.selectAll('.grid').data([0]).join('g').attr('class', 'grid').call(g => {
 
457
  g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
458
  });
459
 
460
+ // Stripe patterns for shaded bars
461
+ barData.forEach(d => {
462
+ if (!isShaded(d.rawName)) return;
 
463
  const c = colorMap[d.rawName] || '#999';
464
  const pat = defs.append('pattern').attr('id', stripePatternId(d.rawName))
465
  .attr('width', 6).attr('height', 6).attr('patternUnits', 'userSpaceOnUse').attr('patternTransform', 'rotate(45)');
 
467
  pat.append('line').attr('x1', 0).attr('y1', 0).attr('x2', 0).attr('y2', 6).attr('stroke', c).attr('stroke-width', 2.5);
468
  });
469
 
470
+ function barFill(d) {
471
+ if (isShaded(d.rawName)) return `url(#${stripePatternId(d.rawName)})`;
472
+ return colorMap[d.rawName] || 'var(--primary-color)';
473
+ }
474
+
475
+ // Bars
476
  const barTip = (ev, d) => {
477
  const [mx, my] = d3.pointer(ev, container);
478
+ showTip(`<strong>${d.name}</strong><br/>${metricName(currentMetric)}: <strong>${d.value.toFixed(3)}</strong>`, mx, my);
479
  };
480
+ gRoot.selectAll('rect.bar').data(barData, d => d.name).join(
481
  enter => enter.append('rect').attr('class', 'bar')
482
  .attr('x', 0).attr('y', d => y(d.name)).attr('height', y.bandwidth()).attr('rx', 3)
483
  .attr('fill', d => barFill(d))
 
498
  );
499
 
500
  // Value labels
501
+ gRoot.selectAll('text.value-label').data(barData, d => d.name).join(
502
  enter => enter.append('text').attr('class', 'value-label')
503
  .attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
504
  .attr('dy', '0.35em').attr('fill', 'var(--text-color)').attr('font-size', 11)
505
+ .text(d => d.value.toFixed(3)),
506
  update => update.transition().duration(300)
507
  .attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
508
+ .text(d => d.value.toFixed(3)),
509
+ exit => exit.remove()
510
+ );
511
+
512
+ // Baseline vertical reference lines
513
+ gRoot.selectAll('.baseline-vline').data(baselineData, d => d.name).join(
514
+ enter => enter.append('line').attr('class', 'baseline-vline baseline')
515
+ .attr('x1', d => x(d.value)).attr('x2', d => x(d.value))
516
+ .attr('y1', 0).attr('y2', innerHeight)
517
+ .attr('stroke', d => colorMap[d.rawName] || '#999')
518
+ .attr('stroke-width', 2).attr('stroke-dasharray', '6,4').attr('opacity', 0.7),
519
+ update => update.transition().duration(300)
520
+ .attr('x1', d => x(d.value)).attr('x2', d => x(d.value))
521
+ .attr('y1', 0).attr('y2', innerHeight)
522
+ .attr('stroke', d => colorMap[d.rawName] || '#999'),
523
+ exit => exit.remove()
524
+ );
525
+ gRoot.selectAll('.baseline-vlabel').data(baselineData, d => d.name).join(
526
+ enter => enter.append('text').attr('class', 'baseline-vlabel baseline')
527
+ .attr('x', d => x(d.value)).attr('y', -4)
528
+ .attr('text-anchor', 'middle').attr('fill', d => colorMap[d.rawName] || '#999')
529
+ .attr('font-size', 11).attr('font-weight', 600)
530
+ .text(d => `${d.name} (${d.value.toFixed(3)})`),
531
+ update => update.transition().duration(300)
532
+ .attr('x', d => x(d.value))
533
+ .text(d => `${d.name} (${d.value.toFixed(3)})`),
534
  exit => exit.remove()
535
  );
536
  }
 
538
  // ─── LINE CHART ───
539
  function renderLine() {
540
  const width = container.clientWidth || 800;
541
+ const hasBaselines = allData.some(r => isBaseline(r[RUN_COL]));
542
  const margin = { top: 16, right: 50, bottom: 48, left: 60 };
543
  const height = Math.max(300, Math.round(width / 2.5));
544
  svg.attr('width', width).attr('height', height);
 
550
  // Build series
551
  const grouped = d3.group(allData, d => d[RUN_COL]);
552
  const series = [];
553
+ const baselineSeries = [];
554
  for (const [raw, rows] of grouped) {
555
  const pts = rows.map(r => ({ step: +r[STEP_COL], value: +r[currentMetric] })).sort((a, b) => a.step - b.step);
556
+ const entry = { name: displayName(raw), rawName: raw, values: pts };
557
+ if (isBaseline(raw)) {
558
+ entry.finalValue = pts[pts.length - 1].value;
559
+ baselineSeries.push(entry);
560
+ } else {
561
+ series.push(entry);
562
+ }
563
  }
564
 
565
+ const allSteps = Array.from(new Set(allData.filter(r => !isBaseline(r[RUN_COL])).map(r => +r[STEP_COL]))).sort((a, b) => a - b);
566
+ const allValues = [...series, ...baselineSeries].flatMap(s => s.finalValue != null ? [s.finalValue] : s.values.map(v => v.value));
567
 
568
  const x = d3.scaleLinear().domain(d3.extent(allSteps)).range([0, innerWidth]);
569
  const yMin = d3.min(allValues), yMax = d3.max(allValues), yPad = (yMax - yMin) * 0.08;
 
603
  .attr('text-anchor', 'middle').attr('fill', 'var(--text-color)').attr('font-size', 12)
604
  .text(metricName(currentMetric));
605
 
606
+ // Baseline horizontal reference lines
607
+ gRoot.selectAll('.baseline-hline').data(baselineSeries, d => d.name).join(
608
+ enter => enter.append('line').attr('class', 'baseline-hline baseline')
609
+ .attr('x1', 0).attr('x2', innerWidth)
610
+ .attr('y1', d => y(d.finalValue)).attr('y2', d => y(d.finalValue))
611
+ .attr('stroke', d => colorMap[d.rawName] || '#999')
612
+ .attr('stroke-width', 2).attr('stroke-dasharray', '6,4').attr('opacity', 0.7),
613
+ update => update.transition().duration(300)
614
+ .attr('x1', 0).attr('x2', innerWidth)
615
+ .attr('y1', d => y(d.finalValue)).attr('y2', d => y(d.finalValue))
616
+ .attr('stroke', d => colorMap[d.rawName] || '#999'),
617
+ exit => exit.remove()
618
+ );
619
+ gRoot.selectAll('.baseline-hlabel').data(baselineSeries, d => d.name).join(
620
+ enter => enter.append('text').attr('class', 'baseline-hlabel baseline')
621
+ .attr('x', 4).attr('y', d => y(d.finalValue) - 6)
622
+ .attr('text-anchor', 'start')
623
+ .attr('fill', d => colorMap[d.rawName] || '#999')
624
+ .attr('font-size', 10).attr('font-weight', 600)
625
+ .text(d => `${d.name} (${d.finalValue.toFixed(3)})`),
626
+ update => update.transition().duration(300)
627
+ .attr('x', 4).attr('y', d => y(d.finalValue) - 6)
628
+ .text(d => `${d.name} (${d.finalValue.toFixed(3)})`),
629
+ exit => exit.remove()
630
+ );
631
+
632
+ // Lines (non-baseline)
633
  const line = d3.line().x(d => x(d.step)).y(d => y(d.value)).curve(d3.curveMonotoneX);
634
  gRoot.selectAll('.line-path').data(series, d => d.name).join(
635
+ enter => enter.append('path').attr('class', 'line-path')
636
  .attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
637
  .attr('d', d => line(d.values)),
638
+ update => update.transition().duration(300)
 
639
  .attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
640
  .attr('d', d => line(d.values)),
641
  exit => exit.remove()
642
  );
643
 
644
+ // Dots (non-baseline)
645
  const dotData = series.flatMap(s => s.values.map(v => ({ name: s.name, rawName: s.rawName, step: v.step, value: v.value })));
646
  gRoot.selectAll('.line-dot').data(dotData, d => d.name + '-' + d.step).join(
647
+ enter => enter.append('circle').attr('class', 'line-dot')
648
  .attr('cx', d => x(d.step)).attr('cy', d => y(d.value)).attr('r', 3)
649
  .attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)')
650
  .attr('stroke', 'var(--surface-bg)').attr('stroke-width', 1),
651
+ update => update.transition().duration(300)
 
652
  .attr('cx', d => x(d.step)).attr('cy', d => y(d.value))
653
  .attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)'),
654
  exit => exit.remove()
 
669
  const entries = series.map(s => {
670
  const pt = s.values.find(v => v.step === nearest);
671
  return pt ? { name: s.name, rawName: s.rawName, value: pt.value } : null;
672
+ }).filter(Boolean);
673
+ baselineSeries.forEach(s => {
674
+ entries.push({ name: s.name, rawName: s.rawName, value: s.finalValue });
675
+ });
676
+ entries.sort((a, b) => b.value - a.value);
677
 
678
  let html = `<div style="font-weight:700;margin-bottom:4px;">${stepLabelLong(nearest)}</div>`;
679
  entries.forEach(e => {
680
+ html += `<div><span class="tip-dot" style="background:${colorMap[e.rawName]}"></span>${e.name}: <strong>${e.value.toFixed(3)}</strong></div>`;
681
  });
682
  const [cx, cy] = d3.pointer(ev, container);
683
  showTip(html, cx, cy);
 
701
  function buildUI() {
702
  const controls = document.createElement('div'); controls.className = 'controls';
703
 
 
704
  if (SETUPS && setupNames.length > 0) {
705
  const setupGroup = document.createElement('div'); setupGroup.className = 'control-group';
706
  const setupLabel = document.createElement('label'); setupLabel.setAttribute('for', 'setup-' + uid); setupLabel.textContent = 'Setup';
 
710
  if (name === currentSetup) opt.selected = true;
711
  setupSelect.appendChild(opt);
712
  });
 
713
  if (setupNames.length >= 2) {
714
  const avgOpt = document.createElement('option'); avgOpt.value = AVG_SETUP_KEY; avgOpt.textContent = AVG_SETUP_KEY;
715
  setupSelect.appendChild(avgOpt);
 
719
  controls.appendChild(setupGroup);
720
  }
721
 
 
722
  const viewGroup = document.createElement('div'); viewGroup.className = 'control-group';
723
  const viewLabel = document.createElement('label'); viewLabel.setAttribute('for', 'view-' + uid); viewLabel.textContent = 'View';
724
  const viewSelect = document.createElement('select'); viewSelect.id = 'view-' + uid;
 
731
  viewGroup.appendChild(viewLabel); viewGroup.appendChild(viewSelect);
732
  controls.appendChild(viewGroup);
733
 
 
734
  const metricGroup = document.createElement('div'); metricGroup.className = 'control-group';
735
  const metricLabel = document.createElement('label'); metricLabel.setAttribute('for', 'metric-' + uid); metricLabel.textContent = 'Metric';
736
  const metricSelect = document.createElement('select'); metricSelect.id = 'metric-' + uid;
 
739
 
740
  container.appendChild(controls);
741
 
 
742
  const legend = document.createElement('div'); legend.className = 'legend';
743
  legend.innerHTML = '<div class="legend-title">Legend</div><div class="items"></div>';
744
  container.appendChild(legend);
 
764
  const items = container.querySelector('.legend .items');
765
  if (!items) return;
766
  items.innerHTML = '';
 
767
  const grouped = d3.group(allData, d => d[RUN_COL]);
768
  const sorted = Array.from(grouped.entries())
769
  .map(([raw, rows]) => {
 
773
  })
774
  .sort((a, b) => b.score - a.score)
775
  .map(d => d.raw);
776
+ sorted.filter(raw => !isBaseline(raw)).forEach(raw => {
777
  const name = displayName(raw);
778
  const el = document.createElement('span'); el.className = 'item'; el.setAttribute('data-name', name);
779
  const sw = document.createElement('span'); sw.className = 'swatch';
780
+ const c = colorMap[raw] || '#999';
781
+ if (isShaded(raw)) {
782
+ sw.style.background = c;
783
+ sw.style.backgroundImage = 'repeating-linear-gradient(45deg, transparent, transparent 2px, rgba(255,255,255,0.4) 2px, rgba(255,255,255,0.4) 4px)';
784
+ } else {
785
+ sw.style.background = c;
786
+ }
787
  const txt = document.createElement('span'); txt.textContent = name;
788
  el.appendChild(sw); el.appendChild(txt); items.appendChild(el);
789
  el.addEventListener('mouseenter', () => { highlight = name; updateHighlight(); });
 
819
  const text = await fetchFirstAvailable(csvPaths);
820
  const parsed = d3.csvParse(text);
821
  parsedData = parsed;
 
822
  if (SETUPS && setupNames.length >= 2) {
823
  const avg = computeAverageData(parsed);
824
+ avgDatasets = avg.datasets;
825
  parsedData = parsed.concat(avg.data);
826
  parsedData.columns = parsed.columns;
827
  }
 
828
  filterData();
829
  metricKeys = detectMetrics(allData.columns);
 
830
  if (!metricKeys.includes(currentMetric)) currentMetric = metricKeys[0];
831
  populateMetricSelect();
832
  render();