joelniklaus HF Staff committed on
Commit
1834e40
·
1 Parent(s): cbf941b

removed redundant defaultView and added dclm and fw-edu-hq baselines everywhere with stable colors for less mental load

Browse files
app/src/content/chapters/experiments.mdx CHANGED
@@ -37,6 +37,7 @@ We see that FinePhrase clearly outperforms the synthetic baselines.
37
  desc="Figure: FinePhrase compared against synthetic data baselines across evaluation metrics."
38
  config={{
39
  defaultView: "line",
 
40
  datasetNames: {
41
  cosmopedia: "Cosmopedia",
42
  "mix-fw_edu_hq-table_smollm2_1.7b_hq": "FinePhrase",
@@ -80,7 +81,6 @@ Using gemma-3-1b, the prompt from REWIRE (guided_rewrite_original) is on-par wit
80
  title="Dissecting Synthetic Baselines"
81
  desc="Figure: Individual prompt performance from existing synthetic datasets compared to DCLM and FineWeb-Edu (HQ)."
82
  config={{
83
- defaultView: "line",
84
  datasetNames: {
85
  "mix-fw_edu_hq-diverse_qa_pairs_1b_hq": "Diverse QA Pairs",
86
  dclm: "DCLM",
@@ -107,7 +107,6 @@ We found four prompts that outperform both fw-edu-hq and the challenging dclm ba
107
  title="New Prompt Performance"
108
  desc="Figure: Four new prompts (math, table, faq, tutorial) compared against DCLM and FineWeb-Edu (HQ)."
109
  config={{
110
- defaultView: "line",
111
  datasetNames: {
112
  "mix-fw_edu_hq-math_1b_hq": "Math",
113
  "mix-fw_edu_hq-table_1b_hq": "Table",
@@ -135,7 +134,6 @@ We compare rephrasing with all Gemma-3 sizes (270m, 1b, 4b, 12b, 27b) using the
135
  title="Model Size: Tutorial Prompt"
136
  desc="Figure: Gemma-3 model sizes (270M to 27B) on the tutorial prompt."
137
  config={{
138
- defaultView: "line",
139
  datasetNames: {
140
  "mix-fw_edu_hq-tutorial_27b_hq": "Gemma-3 27B",
141
  "mix-fw_edu_hq-tutorial_12b_hq": "Gemma-3 12B",
@@ -156,7 +154,6 @@ Potentially, writing a tutorial is easy enough and we only need larger models fo
156
  title="Model Size: Math Prompt"
157
  desc="Figure: Gemma-3 model sizes (270M to 27B) on the math prompt."
158
  config={{
159
- defaultView: "line",
160
  datasetNames: {
161
  "mix-fw_edu_hq-math_1b_hq": "Gemma-3 1B",
162
  "mix-fw_edu_hq-math_4b_hq": "Gemma-3 4B",
@@ -181,12 +178,13 @@ Continue prompt: For the 1b model the source data does not seem to matter, but t
181
  title="Model Size vs Data Quality: Continue Prompt"
182
  desc="Figure: 1B vs 12B model on HQ vs LQ data using the continue prompt."
183
  config={{
184
- defaultView: "line",
185
  datasetNames: {
186
  "mix-fw_edu_hq-continue_12b_hq": "12B, HQ Source",
187
  "mix-fw_edu_hq-continue_1b_hq": "1B, HQ Source",
188
  "mix-fw_edu_hq-continue_1b_lq": "1B, LQ Source",
189
- "mix-fw_edu_hq-continue_12b_lq": "12B, LQ Source"
 
 
190
  }
191
  }}
192
  />
@@ -199,12 +197,13 @@ Tutorial prompt: For the hq data the model size does not seem to matter whereas
199
  title="Model Size vs Data Quality: Tutorial Prompt"
200
  desc="Figure: 1B vs 12B model on HQ vs LQ data using the tutorial prompt."
201
  config={{
202
- defaultView: "line",
203
  datasetNames: {
204
  "mix-fw_edu_hq-tutorial_1b_hq": "1B, HQ Source",
205
  "mix-fw_edu_hq-tutorial_12b_hq": "12B, HQ Source",
206
  "mix-fw_edu_hq-tutorial_12b_lq": "12B, LQ Source",
207
- "mix-fw_edu_hq-tutorial_1b_lq": "1B, LQ Source"
 
 
208
  }
209
  }}
210
  />
@@ -217,12 +216,13 @@ FAQ prompt: Surprisingly, the 1b model is better for both lq and hq data.
217
  title="Model Size vs Data Quality: FAQ Prompt"
218
  desc="Figure: 1B vs 12B model on HQ vs LQ data using the FAQ prompt."
219
  config={{
220
- defaultView: "line",
221
  datasetNames: {
222
  "mix-fw_edu_hq-faq_1b_hq": "1B, HQ Source",
223
  "mix-fw_edu_hq-faq_1b_lq": "1B, LQ Source",
224
  "mix-fw_edu_hq-faq_12b_hq": "12B, HQ Source",
225
- "mix-fw_edu_hq-faq_12b_lq": "12B, LQ Source"
 
 
226
  }
227
  }}
228
  />
@@ -238,14 +238,15 @@ Some model families may be better suited for rephrasing than others based on the
238
  title="Model Family: Tutorial Prompt"
239
  desc="Figure: Model families compared on the tutorial prompt at ~1B scale."
240
  config={{
241
- defaultView: "line",
242
  datasetNames: {
243
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
244
  "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Falcon3",
245
  "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3",
246
  "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3",
247
  "mix-fw_edu_hq-tutorial_granite3_1b_hq": "Granite3",
248
- "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2"
 
 
249
  }
250
  }}
251
  />
@@ -258,14 +259,15 @@ In the faq prompt SmolLM2 again clearly outperforms the others. Here Qwen3 under
258
  title="Model Family: FAQ Prompt"
259
  desc="Figure: Model families compared on the FAQ prompt at ~1B scale."
260
  config={{
261
- defaultView: "line",
262
  datasetNames: {
263
  "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "SmolLM2",
264
  "mix-fw_edu_hq-faq_llama3.2_1b_hq": "Llama-3.2",
265
  "mix-fw_edu_hq-faq_falcon3_1b_hq": "Falcon3",
266
  "mix-fw_edu_hq-faq_1b_hq": "Gemma-3",
267
  "mix-fw_edu_hq-faq_granite3_1b_hq": "Granite3",
268
- "mix-fw_edu_hq-faq_qwen3_1.7b_hq": "Qwen3"
 
 
269
  }
270
  }}
271
  />
@@ -278,14 +280,15 @@ For the table prompt we again see SmolLM2 and to some degree Falcon3 outperform.
278
  title="Model Family: Table Prompt"
279
  desc="Figure: Model families compared on the table prompt at ~1B scale."
280
  config={{
281
- defaultView: "line",
282
  datasetNames: {
283
  "mix-fw_edu_hq-table_smollm2_1.7b_hq": "SmolLM2",
284
  "mix-fw_edu_hq-table_falcon3_1b_hq": "Falcon3",
285
  "mix-fw_edu_hq-table_granite3_1b_hq": "Granite3",
286
  "mix-fw_edu_hq-table_qwen3_1.7b_hq": "Qwen3",
287
  "mix-fw_edu_hq-table_llama3.2_1b_hq": "Llama-3.2",
288
- "mix-fw_edu_hq-table_1b_hq": "Gemma-3"
 
 
289
  }
290
  }}
291
  />
@@ -298,14 +301,15 @@ Finally, math is again a clear win for SmolLM2 with Qwen3 underperforming.
298
  title="Model Family: Math Prompt"
299
  desc="Figure: Model families compared on the math prompt at ~1B scale."
300
  config={{
301
- defaultView: "line",
302
  datasetNames: {
303
  "mix-fw_edu_hq-math_smollm2_1.7b_hq": "SmolLM2",
304
  "mix-fw_edu_hq-math_falcon3_1b_hq": "Falcon3",
305
  "mix-fw_edu_hq-math_granite3_1b_hq": "Granite3",
306
  "mix-fw_edu_hq-math_1b_hq": "Gemma-3",
307
  "mix-fw_edu_hq-math_llama3.2_1b_hq": "Llama-3.2",
308
- "mix-fw_edu_hq-math_qwen3_1.7b_hq": "Qwen3"
 
 
309
  }
310
  }}
311
  />
@@ -322,7 +326,6 @@ We compare rephrasing with Qwen models from versions 1.5, 2, 2.5 and 3 using the
322
  title="Model Generation: Qwen Tutorial"
323
  desc="Figure: Qwen model generations (1.5 to 3) on the tutorial prompt."
324
  config={{
325
- defaultView: "line",
326
  datasetNames: {
327
  "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3 (1.7B)",
328
  "mix-fw_edu_hq-tutorial_qwen2.5_1.5b_hq": "Qwen2.5 (1.5B)",
@@ -348,7 +351,6 @@ To test the effect of the mix-in dataset we apply the tutorial prompt using Gemm
348
  title="Mix-in Dataset Effect (HQ Source)"
349
  desc="Figure: Effect of different mix-in datasets with fw_edu_hq as source for the tutorial prompt."
350
  config={{
351
- defaultView: "line",
352
  datasetNames: {
353
  "mix-dclm-tutorial_1b_hq": "Mix-in: DCLM",
354
  "mix-fw_edu_hq-tutorial_1b_hq": "Mix-in: FW-Edu (HQ)",
@@ -370,7 +372,6 @@ Does this trend hold for other source datasets? We ran the experiment for fw_edu
370
  title="Mix-in Dataset Effect (LQ Source)"
371
  desc="Figure: Effect of different mix-in datasets with fw_edu_lq as source for the tutorial prompt."
372
  config={{
373
- defaultView: "line",
374
  datasetNames: {
375
  dclm: "DCLM",
376
  "mix-fw_edu_hq-tutorial_1b_lq": "Mix-in: FW-Edu (HQ)",
@@ -396,12 +397,13 @@ To investigate to what extent the source dataset for rephrasing matters we rephr
396
  title="Source Dataset: Tutorial (Mix-in = Source)"
397
  desc="Figure: Effect of source dataset choice for the tutorial prompt when mix-in equals source."
398
  config={{
399
- defaultView: "line",
400
  datasetNames: {
401
  "mix-fw_edu_hq-tutorial_1b_hq": "Source: FW-Edu (HQ)",
402
  "mix-dclm-tutorial_1b_dclm": "Source: DCLM",
403
  "mix-cosmopedia-tutorial_1b_cosmopedia": "Source: Cosmopedia",
404
- "mix-fw_edu_lq-tutorial_1b_lq": "Source: FW-Edu (LQ)"
 
 
405
  }
406
  }}
407
  />
@@ -412,12 +414,13 @@ To investigate to what extent the source dataset for rephrasing matters we rephr
412
  title="Source Dataset: FAQ (Mix-in = Source)"
413
  desc="Figure: Effect of source dataset choice for the FAQ prompt when mix-in equals source."
414
  config={{
415
- defaultView: "line",
416
  datasetNames: {
417
  "mix-dclm-faq_1b_dclm": "Source: DCLM",
418
  "mix-fw_edu_hq-faq_1b_hq": "Source: FW-Edu (HQ)",
419
  "mix-fw_edu_lq-faq_1b_lq": "Source: FW-Edu (LQ)",
420
- "mix-cosmopedia-faq_1b_cosmopedia": "Source: Cosmopedia"
 
 
421
  }
422
  }}
423
  />
@@ -430,12 +433,13 @@ When fixing the mix-in dataset to fw_edu_hq, the difference shrinks drastically for
430
  title="Source Dataset: Tutorial (Fixed Mix-in: FW-Edu HQ)"
431
  desc="Figure: Effect of source dataset for the tutorial prompt with fw_edu_hq as fixed mix-in."
432
  config={{
433
- defaultView: "line",
434
  datasetNames: {
435
  "mix-fw_edu_hq-tutorial_1b_dclm": "Source: DCLM",
436
  "mix-fw_edu_hq-tutorial_1b_hq": "Source: FW-Edu (HQ)",
437
  "mix-fw_edu_hq-tutorial_1b_cosmopedia": "Source: Cosmopedia",
438
- "mix-fw_edu_hq-tutorial_1b_lq": "Source: FW-Edu (LQ)"
 
 
439
  }
440
  }}
441
  />
@@ -446,12 +450,13 @@ When fixing the mix-in dataset to fw_edu_hq, the difference shrinks drastically for
446
  title="Source Dataset: FAQ (Fixed Mix-in: FW-Edu HQ)"
447
  desc="Figure: Effect of source dataset for the FAQ prompt with fw_edu_hq as fixed mix-in."
448
  config={{
449
- defaultView: "line",
450
  datasetNames: {
451
  "mix-fw_edu_hq-faq_1b_dclm": "Source: DCLM",
452
  "mix-fw_edu_hq-faq_1b_hq": "Source: FW-Edu (HQ)",
453
  "mix-fw_edu_hq-faq_1b_lq": "Source: FW-Edu (LQ)",
454
- "mix-fw_edu_hq-faq_1b_cosmopedia": "Source: Cosmopedia"
 
 
455
  }
456
  }}
457
  />
@@ -466,7 +471,6 @@ We were wondering whether just training on synthetic data works. While we get in
466
  title="Is Synthetic Data Enough? (DCLM Source)"
467
  desc="Figure: Synthetic-only vs mixed training with DCLM as source."
468
  config={{
469
- defaultView: "line",
470
  datasetNames: {
471
  "mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
472
  dclm: "DCLM",
@@ -484,7 +488,6 @@ We were wondering whether just training on synthetic data works. While we get in
484
  title="Is Synthetic Data Enough? (FW-Edu HQ Source)"
485
  desc="Figure: Synthetic-only vs mixed training with FW-Edu (HQ) as source."
486
  config={{
487
- defaultView: "line",
488
  datasetNames: {
489
  "mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FW-Edu (HQ)",
490
  "mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FW-Edu (HQ)",
@@ -508,7 +511,6 @@ We were wondering whether mixing the best performing rephrasing approaches can i
508
  title="Mixing Rephrasing Approaches"
509
  desc="Figure: Mixing multiple prompts vs individual prompts."
510
  config={{
511
- defaultView: "line",
512
  datasetNames: {
513
  "mix-fw_edu_hq-tutorial_1b_hq-fw_edu_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts + FW-Edu (HQ)",
514
  "mix-fw_edu_hq-math_1b_hq": "Math",
@@ -531,7 +533,6 @@ We rephrased using different model families and saw SmolLM2 and Falcon3 clearly
531
  title="Mixing Model Families"
532
  desc="Figure: Mixing rephrased outputs from different model families."
533
  config={{
534
- defaultView: "line",
535
  datasetNames: {
536
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
537
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "SmolLM2 + Falcon3",
@@ -554,7 +555,6 @@ Maybe we need more diversity by mixing both rephrasing approaches and model fami
554
  title="Mixing Approaches and Model Families"
555
  desc="Figure: Mixing both rephrasing approaches and model families."
556
  config={{
557
- defaultView: "line",
558
  datasetNames: {
559
  "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "FAQ (SmolLM2)",
560
  "mix-fw_edu_hq-faq_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "FAQ (SmolLM2) + Tutorial (Falcon3)",
@@ -580,7 +580,6 @@ The original REWIRE prompt contains many typos and grammar errors. To what exten
580
  title="Effect of Typos in Prompt"
581
  desc="Figure: REWIRE prompt with original typos vs improved version at 1B and 12B scale."
582
  config={{
583
- defaultView: "line",
584
  datasetNames: {
585
  "mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Original (12B)",
586
  "mix-fw_edu_hq-guided_rewrite_improved_12b_hq": "Improved (12B)",
 
37
  desc="Figure: FinePhrase compared against synthetic data baselines across evaluation metrics."
38
  config={{
39
  defaultView: "line",
40
+ pinnedColors: { "FinePhrase": "#ff6d00" },
41
  datasetNames: {
42
  cosmopedia: "Cosmopedia",
43
  "mix-fw_edu_hq-table_smollm2_1.7b_hq": "FinePhrase",
 
81
  title="Dissecting Synthetic Baselines"
82
  desc="Figure: Individual prompt performance from existing synthetic datasets compared to DCLM and FineWeb-Edu (HQ)."
83
  config={{
 
84
  datasetNames: {
85
  "mix-fw_edu_hq-diverse_qa_pairs_1b_hq": "Diverse QA Pairs",
86
  dclm: "DCLM",
 
107
  title="New Prompt Performance"
108
  desc="Figure: Four new prompts (math, table, faq, tutorial) compared against DCLM and FineWeb-Edu (HQ)."
109
  config={{
 
110
  datasetNames: {
111
  "mix-fw_edu_hq-math_1b_hq": "Math",
112
  "mix-fw_edu_hq-table_1b_hq": "Table",
 
134
  title="Model Size: Tutorial Prompt"
135
  desc="Figure: Gemma-3 model sizes (270M to 27B) on the tutorial prompt."
136
  config={{
 
137
  datasetNames: {
138
  "mix-fw_edu_hq-tutorial_27b_hq": "Gemma-3 27B",
139
  "mix-fw_edu_hq-tutorial_12b_hq": "Gemma-3 12B",
 
154
  title="Model Size: Math Prompt"
155
  desc="Figure: Gemma-3 model sizes (270M to 27B) on the math prompt."
156
  config={{
 
157
  datasetNames: {
158
  "mix-fw_edu_hq-math_1b_hq": "Gemma-3 1B",
159
  "mix-fw_edu_hq-math_4b_hq": "Gemma-3 4B",
 
178
  title="Model Size vs Data Quality: Continue Prompt"
179
  desc="Figure: 1B vs 12B model on HQ vs LQ data using the continue prompt."
180
  config={{
 
181
  datasetNames: {
182
  "mix-fw_edu_hq-continue_12b_hq": "12B, HQ Source",
183
  "mix-fw_edu_hq-continue_1b_hq": "1B, HQ Source",
184
  "mix-fw_edu_hq-continue_1b_lq": "1B, LQ Source",
185
+ "mix-fw_edu_hq-continue_12b_lq": "12B, LQ Source",
186
+ dclm: "DCLM",
187
+ fw_edu_hq: "FineWeb-Edu (HQ)"
188
  }
189
  }}
190
  />
 
197
  title="Model Size vs Data Quality: Tutorial Prompt"
198
  desc="Figure: 1B vs 12B model on HQ vs LQ data using the tutorial prompt."
199
  config={{
 
200
  datasetNames: {
201
  "mix-fw_edu_hq-tutorial_1b_hq": "1B, HQ Source",
202
  "mix-fw_edu_hq-tutorial_12b_hq": "12B, HQ Source",
203
  "mix-fw_edu_hq-tutorial_12b_lq": "12B, LQ Source",
204
+ "mix-fw_edu_hq-tutorial_1b_lq": "1B, LQ Source",
205
+ dclm: "DCLM",
206
+ fw_edu_hq: "FineWeb-Edu (HQ)"
207
  }
208
  }}
209
  />
 
216
  title="Model Size vs Data Quality: FAQ Prompt"
217
  desc="Figure: 1B vs 12B model on HQ vs LQ data using the FAQ prompt."
218
  config={{
 
219
  datasetNames: {
220
  "mix-fw_edu_hq-faq_1b_hq": "1B, HQ Source",
221
  "mix-fw_edu_hq-faq_1b_lq": "1B, LQ Source",
222
  "mix-fw_edu_hq-faq_12b_hq": "12B, HQ Source",
223
+ "mix-fw_edu_hq-faq_12b_lq": "12B, LQ Source",
224
+ dclm: "DCLM",
225
+ fw_edu_hq: "FineWeb-Edu (HQ)"
226
  }
227
  }}
228
  />
 
238
  title="Model Family: Tutorial Prompt"
239
  desc="Figure: Model families compared on the tutorial prompt at ~1B scale."
240
  config={{
 
241
  datasetNames: {
242
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
243
  "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Falcon3",
244
  "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3",
245
  "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3",
246
  "mix-fw_edu_hq-tutorial_granite3_1b_hq": "Granite3",
247
+ "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
248
+ dclm: "DCLM",
249
+ fw_edu_hq: "FineWeb-Edu (HQ)"
250
  }
251
  }}
252
  />
 
259
  title="Model Family: FAQ Prompt"
260
  desc="Figure: Model families compared on the FAQ prompt at ~1B scale."
261
  config={{
 
262
  datasetNames: {
263
  "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "SmolLM2",
264
  "mix-fw_edu_hq-faq_llama3.2_1b_hq": "Llama-3.2",
265
  "mix-fw_edu_hq-faq_falcon3_1b_hq": "Falcon3",
266
  "mix-fw_edu_hq-faq_1b_hq": "Gemma-3",
267
  "mix-fw_edu_hq-faq_granite3_1b_hq": "Granite3",
268
+ "mix-fw_edu_hq-faq_qwen3_1.7b_hq": "Qwen3",
269
+ dclm: "DCLM",
270
+ fw_edu_hq: "FineWeb-Edu (HQ)"
271
  }
272
  }}
273
  />
 
280
  title="Model Family: Table Prompt"
281
  desc="Figure: Model families compared on the table prompt at ~1B scale."
282
  config={{
 
283
  datasetNames: {
284
  "mix-fw_edu_hq-table_smollm2_1.7b_hq": "SmolLM2",
285
  "mix-fw_edu_hq-table_falcon3_1b_hq": "Falcon3",
286
  "mix-fw_edu_hq-table_granite3_1b_hq": "Granite3",
287
  "mix-fw_edu_hq-table_qwen3_1.7b_hq": "Qwen3",
288
  "mix-fw_edu_hq-table_llama3.2_1b_hq": "Llama-3.2",
289
+ "mix-fw_edu_hq-table_1b_hq": "Gemma-3",
290
+ dclm: "DCLM",
291
+ fw_edu_hq: "FineWeb-Edu (HQ)"
292
  }
293
  }}
294
  />
 
301
  title="Model Family: Math Prompt"
302
  desc="Figure: Model families compared on the math prompt at ~1B scale."
303
  config={{
 
304
  datasetNames: {
305
  "mix-fw_edu_hq-math_smollm2_1.7b_hq": "SmolLM2",
306
  "mix-fw_edu_hq-math_falcon3_1b_hq": "Falcon3",
307
  "mix-fw_edu_hq-math_granite3_1b_hq": "Granite3",
308
  "mix-fw_edu_hq-math_1b_hq": "Gemma-3",
309
  "mix-fw_edu_hq-math_llama3.2_1b_hq": "Llama-3.2",
310
+ "mix-fw_edu_hq-math_qwen3_1.7b_hq": "Qwen3",
311
+ dclm: "DCLM",
312
+ fw_edu_hq: "FineWeb-Edu (HQ)"
313
  }
314
  }}
315
  />
 
326
  title="Model Generation: Qwen Tutorial"
327
  desc="Figure: Qwen model generations (1.5 to 3) on the tutorial prompt."
328
  config={{
 
329
  datasetNames: {
330
  "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3 (1.7B)",
331
  "mix-fw_edu_hq-tutorial_qwen2.5_1.5b_hq": "Qwen2.5 (1.5B)",
 
351
  title="Mix-in Dataset Effect (HQ Source)"
352
  desc="Figure: Effect of different mix-in datasets with fw_edu_hq as source for the tutorial prompt."
353
  config={{
 
354
  datasetNames: {
355
  "mix-dclm-tutorial_1b_hq": "Mix-in: DCLM",
356
  "mix-fw_edu_hq-tutorial_1b_hq": "Mix-in: FW-Edu (HQ)",
 
372
  title="Mix-in Dataset Effect (LQ Source)"
373
  desc="Figure: Effect of different mix-in datasets with fw_edu_lq as source for the tutorial prompt."
374
  config={{
 
375
  datasetNames: {
376
  dclm: "DCLM",
377
  "mix-fw_edu_hq-tutorial_1b_lq": "Mix-in: FW-Edu (HQ)",
 
397
  title="Source Dataset: Tutorial (Mix-in = Source)"
398
  desc="Figure: Effect of source dataset choice for the tutorial prompt when mix-in equals source."
399
  config={{
 
400
  datasetNames: {
401
  "mix-fw_edu_hq-tutorial_1b_hq": "Source: FW-Edu (HQ)",
402
  "mix-dclm-tutorial_1b_dclm": "Source: DCLM",
403
  "mix-cosmopedia-tutorial_1b_cosmopedia": "Source: Cosmopedia",
404
+ "mix-fw_edu_lq-tutorial_1b_lq": "Source: FW-Edu (LQ)",
405
+ dclm: "DCLM",
406
+ fw_edu_hq: "FineWeb-Edu (HQ)"
407
  }
408
  }}
409
  />
 
414
  title="Source Dataset: FAQ (Mix-in = Source)"
415
  desc="Figure: Effect of source dataset choice for the FAQ prompt when mix-in equals source."
416
  config={{
 
417
  datasetNames: {
418
  "mix-dclm-faq_1b_dclm": "Source: DCLM",
419
  "mix-fw_edu_hq-faq_1b_hq": "Source: FW-Edu (HQ)",
420
  "mix-fw_edu_lq-faq_1b_lq": "Source: FW-Edu (LQ)",
421
+ "mix-cosmopedia-faq_1b_cosmopedia": "Source: Cosmopedia",
422
+ dclm: "DCLM",
423
+ fw_edu_hq: "FineWeb-Edu (HQ)"
424
  }
425
  }}
426
  />
 
433
  title="Source Dataset: Tutorial (Fixed Mix-in: FW-Edu HQ)"
434
  desc="Figure: Effect of source dataset for the tutorial prompt with fw_edu_hq as fixed mix-in."
435
  config={{
 
436
  datasetNames: {
437
  "mix-fw_edu_hq-tutorial_1b_dclm": "Source: DCLM",
438
  "mix-fw_edu_hq-tutorial_1b_hq": "Source: FW-Edu (HQ)",
439
  "mix-fw_edu_hq-tutorial_1b_cosmopedia": "Source: Cosmopedia",
440
+ "mix-fw_edu_hq-tutorial_1b_lq": "Source: FW-Edu (LQ)",
441
+ dclm: "DCLM",
442
+ fw_edu_hq: "FineWeb-Edu (HQ)"
443
  }
444
  }}
445
  />
 
450
  title="Source Dataset: FAQ (Fixed Mix-in: FW-Edu HQ)"
451
  desc="Figure: Effect of source dataset for the FAQ prompt with fw_edu_hq as fixed mix-in."
452
  config={{
 
453
  datasetNames: {
454
  "mix-fw_edu_hq-faq_1b_dclm": "Source: DCLM",
455
  "mix-fw_edu_hq-faq_1b_hq": "Source: FW-Edu (HQ)",
456
  "mix-fw_edu_hq-faq_1b_lq": "Source: FW-Edu (LQ)",
457
+ "mix-fw_edu_hq-faq_1b_cosmopedia": "Source: Cosmopedia",
458
+ dclm: "DCLM",
459
+ fw_edu_hq: "FineWeb-Edu (HQ)"
460
  }
461
  }}
462
  />
 
471
  title="Is Synthetic Data Enough? (DCLM Source)"
472
  desc="Figure: Synthetic-only vs mixed training with DCLM as source."
473
  config={{
 
474
  datasetNames: {
475
  "mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
476
  dclm: "DCLM",
 
488
  title="Is Synthetic Data Enough? (FW-Edu HQ Source)"
489
  desc="Figure: Synthetic-only vs mixed training with FW-Edu (HQ) as source."
490
  config={{
 
491
  datasetNames: {
492
  "mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FW-Edu (HQ)",
493
  "mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FW-Edu (HQ)",
 
511
  title="Mixing Rephrasing Approaches"
512
  desc="Figure: Mixing multiple prompts vs individual prompts."
513
  config={{
 
514
  datasetNames: {
515
  "mix-fw_edu_hq-tutorial_1b_hq-fw_edu_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts + FW-Edu (HQ)",
516
  "mix-fw_edu_hq-math_1b_hq": "Math",
 
533
  title="Mixing Model Families"
534
  desc="Figure: Mixing rephrased outputs from different model families."
535
  config={{
 
536
  datasetNames: {
537
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
538
  "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "SmolLM2 + Falcon3",
 
555
  title="Mixing Approaches and Model Families"
556
  desc="Figure: Mixing both rephrasing approaches and model families."
557
  config={{
 
558
  datasetNames: {
559
  "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "FAQ (SmolLM2)",
560
  "mix-fw_edu_hq-faq_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "FAQ (SmolLM2) + Tutorial (Falcon3)",
 
580
  title="Effect of Typos in Prompt"
581
  desc="Figure: REWIRE prompt with original typos vs improved version at 1B and 12B scale."
582
  config={{
 
583
  datasetNames: {
584
  "mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Original (12B)",
585
  "mix-fw_edu_hq-guided_rewrite_improved_12b_hq": "Improved (12B)",
app/src/content/embeds/d3-benchmark-comparison.html CHANGED
@@ -3,12 +3,13 @@
3
 
4
  Configuration via data-config attribute:
5
  {
6
- "datasetNames": { "raw_name": "Display Name", ... }, // required
7
- "defaultMetric": "agg_score_macro", // optional, default: "agg_score_macro"
8
- "defaultView": "bar", // optional, "bar" | "line", default: "bar"
9
- "tokensPerStep": 2100000, // optional, default: 2.1e6
10
- "runColumn": "runname", // optional, CSV column for series, default: "runname"
11
- "stepColumn": "steps" // optional, CSV column for x-axis, default: "steps"
 
12
  }
13
 
14
  Data: uses benchmark-results.csv by default (one CSV with all runs).
@@ -184,6 +185,8 @@
184
  const TOKENS_PER_STEP = cfg.tokensPerStep || 2.1e6;
185
  const defaultMetric = cfg.defaultMetric || 'agg_score_macro';
186
  const defaultView = cfg.defaultView || 'bar';
 
 
187
 
188
  // Unique ID suffix for multiple instances on same page
189
  const uid = Math.random().toString(36).slice(2, 8);
@@ -255,11 +258,18 @@
255
  }
256
 
257
  function initColors() {
258
- const allNames = Array.from(d3.group(allData, d => d[RUN_COL]).keys()).sort();
259
- if (!Object.keys(colorMap).length) {
260
- const palette = getCategoricalColors(allNames.length);
261
- allNames.forEach((name, i) => { colorMap[name] = palette[i % palette.length]; });
262
- }
 
 
 
 
 
 
 
263
  }
264
 
265
  function showTip(html, x, y) {
 
3
 
4
  Configuration via data-config attribute:
5
  {
6
+ "datasetNames": { "raw_name": "Display Name", ... }, // required
7
+ "pinnedColors": { "DCLM": "#333", "FineWeb-Edu (HQ)": "#86a1a9" }, // optional
8
+ "defaultMetric": "agg_score_macro", // optional, default: "agg_score_macro"
9
+ "defaultView": "bar", // optional, "bar" | "line", default: "bar"
10
+ "tokensPerStep": 2100000, // optional, default: 2.1e6
11
+ "runColumn": "runname", // optional, CSV column for series, default: "runname"
12
+ "stepColumn": "steps" // optional, CSV column for x-axis, default: "steps"
13
  }
14
 
15
  Data: uses benchmark-results.csv by default (one CSV with all runs).
 
185
  const TOKENS_PER_STEP = cfg.tokensPerStep || 2.1e6;
186
  const defaultMetric = cfg.defaultMetric || 'agg_score_macro';
187
  const defaultView = cfg.defaultView || 'bar';
188
+ // Stable baseline colors, merged with per-chart overrides
189
+ const PINNED_COLORS = Object.assign({ 'DCLM': '#333', 'FineWeb-Edu (HQ)': '#86a1a9' }, cfg.pinnedColors || {});
190
 
191
  // Unique ID suffix for multiple instances on same page
192
  const uid = Math.random().toString(36).slice(2, 8);
 
258
  }
259
 
260
  function initColors() {
261
+ if (Object.keys(colorMap).length) return;
262
+ const allRaw = Array.from(d3.group(allData, d => d[RUN_COL]).keys()).sort();
263
+ // Assign pinned colors first (keyed by display name)
264
+ const unpinned = [];
265
+ allRaw.forEach(raw => {
266
+ const name = displayName(raw);
267
+ if (PINNED_COLORS[name]) { colorMap[raw] = PINNED_COLORS[name]; }
268
+ else { unpinned.push(raw); }
269
+ });
270
+ // Fill remaining from categorical palette
271
+ const palette = getCategoricalColors(unpinned.length);
272
+ unpinned.forEach((raw, i) => { colorMap[raw] = palette[i % palette.length]; });
273
  }
274
 
275
  function showTip(html, x, y) {