Commit 1834e40 · Parent(s): cbf941b
removed redundant defaultView and added dclm and fw-edu-hq baselines everywhere with stable colors for less mental load
app/src/content/chapters/experiments.mdx
CHANGED
@@ -37,6 +37,7 @@ We see that FinePhrase clearly outperforms the synthetic baselines.
   desc="Figure: FinePhrase compared against synthetic data baselines across evaluation metrics."
   config={{
     defaultView: "line",
+    pinnedColors: { "FinePhrase": "#ff6d00" },
     datasetNames: {
       cosmopedia: "Cosmopedia",
       "mix-fw_edu_hq-table_smollm2_1.7b_hq": "FinePhrase",

@@ -80,7 +81,6 @@ Using gemma-3-1b, the prompt from REWIRE (guided_rewrite_original) is on-par wit
   title="Dissecting Synthetic Baselines"
   desc="Figure: Individual prompt performance from existing synthetic datasets compared to DCLM and FineWeb-Edu (HQ)."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-diverse_qa_pairs_1b_hq": "Diverse QA Pairs",
       dclm: "DCLM",

@@ -107,7 +107,6 @@ We found four prompts that outperform both fw-edu-hq and the challenging dclm ba
   title="New Prompt Performance"
   desc="Figure: Four new prompts (math, table, faq, tutorial) compared against DCLM and FineWeb-Edu (HQ)."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-math_1b_hq": "Math",
       "mix-fw_edu_hq-table_1b_hq": "Table",

@@ -135,7 +134,6 @@ We compare rephrasing with all Gemma-3 sizes (270m, 1b, 4b, 12b, 27b) using the
   title="Model Size: Tutorial Prompt"
   desc="Figure: Gemma-3 model sizes (270M to 27B) on the tutorial prompt."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-tutorial_27b_hq": "Gemma-3 27B",
       "mix-fw_edu_hq-tutorial_12b_hq": "Gemma-3 12B",

@@ -156,7 +154,6 @@ Potentially, writing a tutorial is easy enough and we only need larger models fo
   title="Model Size: Math Prompt"
   desc="Figure: Gemma-3 model sizes (270M to 27B) on the math prompt."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-math_1b_hq": "Gemma-3 1B",
       "mix-fw_edu_hq-math_4b_hq": "Gemma-3 4B",

@@ -181,12 +178,13 @@ Continue prompt: For the 1b model the source data does not seem to matter, but t
   title="Model Size vs Data Quality: Continue Prompt"
   desc="Figure: 1B vs 12B model on HQ vs LQ data using the continue prompt."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-continue_12b_hq": "12B, HQ Source",
       "mix-fw_edu_hq-continue_1b_hq": "1B, HQ Source",
       "mix-fw_edu_hq-continue_1b_lq": "1B, LQ Source",
-      "mix-fw_edu_hq-continue_12b_lq": "12B, LQ Source"
+      "mix-fw_edu_hq-continue_12b_lq": "12B, LQ Source",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
     }
   }}
 />

@@ -199,12 +197,13 @@ Tutorial prompt: For the hq data the model size does not seem to matter whereas
   title="Model Size vs Data Quality: Tutorial Prompt"
   desc="Figure: 1B vs 12B model on HQ vs LQ data using the tutorial prompt."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-tutorial_1b_hq": "1B, HQ Source",
       "mix-fw_edu_hq-tutorial_12b_hq": "12B, HQ Source",
       "mix-fw_edu_hq-tutorial_12b_lq": "12B, LQ Source",
-      "mix-fw_edu_hq-tutorial_1b_lq": "1B, LQ Source"
+      "mix-fw_edu_hq-tutorial_1b_lq": "1B, LQ Source",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
     }
   }}
 />

@@ -217,12 +216,13 @@ FAQ prompt: Surprisingly, the 1b model is better for both lq and hq data.
   title="Model Size vs Data Quality: FAQ Prompt"
   desc="Figure: 1B vs 12B model on HQ vs LQ data using the FAQ prompt."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-faq_1b_hq": "1B, HQ Source",
       "mix-fw_edu_hq-faq_1b_lq": "1B, LQ Source",
       "mix-fw_edu_hq-faq_12b_hq": "12B, HQ Source",
-      "mix-fw_edu_hq-faq_12b_lq": "12B, LQ Source"
+      "mix-fw_edu_hq-faq_12b_lq": "12B, LQ Source",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
     }
   }}
 />

@@ -238,14 +238,15 @@ Some model families may be better suited for rephrasing than others based on the
   title="Model Family: Tutorial Prompt"
   desc="Figure: Model families compared on the tutorial prompt at ~1B scale."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
       "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Falcon3",
       "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3",
       "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3",
       "mix-fw_edu_hq-tutorial_granite3_1b_hq": "Granite3",
-      "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2"
+      "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
     }
   }}
 />

@@ -258,14 +259,15 @@ In the faq prompt SmolLM2 again clearly outperforms the others. Here Qwen3 under
   title="Model Family: FAQ Prompt"
   desc="Figure: Model families compared on the FAQ prompt at ~1B scale."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "SmolLM2",
       "mix-fw_edu_hq-faq_llama3.2_1b_hq": "Llama-3.2",
       "mix-fw_edu_hq-faq_falcon3_1b_hq": "Falcon3",
       "mix-fw_edu_hq-faq_1b_hq": "Gemma-3",
       "mix-fw_edu_hq-faq_granite3_1b_hq": "Granite3",
-      "mix-fw_edu_hq-faq_qwen3_1.7b_hq": "Qwen3"
+      "mix-fw_edu_hq-faq_qwen3_1.7b_hq": "Qwen3",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
     }
   }}
 />

@@ -278,14 +280,15 @@ For the table prompt we again see SmolLM2 and to some degree Falcon3 outperform.
   title="Model Family: Table Prompt"
   desc="Figure: Model families compared on the table prompt at ~1B scale."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-table_smollm2_1.7b_hq": "SmolLM2",
       "mix-fw_edu_hq-table_falcon3_1b_hq": "Falcon3",
       "mix-fw_edu_hq-table_granite3_1b_hq": "Granite3",
       "mix-fw_edu_hq-table_qwen3_1.7b_hq": "Qwen3",
       "mix-fw_edu_hq-table_llama3.2_1b_hq": "Llama-3.2",
-      "mix-fw_edu_hq-table_1b_hq": "Gemma-3"
+      "mix-fw_edu_hq-table_1b_hq": "Gemma-3",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
     }
   }}
 />

@@ -298,14 +301,15 @@ Finally, math is again a clear win for SmolLM2 with Qwen3 underperforming.
   title="Model Family: Math Prompt"
   desc="Figure: Model families compared on the math prompt at ~1B scale."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-math_smollm2_1.7b_hq": "SmolLM2",
       "mix-fw_edu_hq-math_falcon3_1b_hq": "Falcon3",
       "mix-fw_edu_hq-math_granite3_1b_hq": "Granite3",
       "mix-fw_edu_hq-math_1b_hq": "Gemma-3",
       "mix-fw_edu_hq-math_llama3.2_1b_hq": "Llama-3.2",
-      "mix-fw_edu_hq-math_qwen3_1.7b_hq": "Qwen3"
+      "mix-fw_edu_hq-math_qwen3_1.7b_hq": "Qwen3",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
     }
   }}
 />

@@ -322,7 +326,6 @@ We compare rephrasing with Qwen models from versions 1.5, 2, 2.5 and 3 using the
   title="Model Generation: Qwen Tutorial"
   desc="Figure: Qwen model generations (1.5 to 3) on the tutorial prompt."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3 (1.7B)",
       "mix-fw_edu_hq-tutorial_qwen2.5_1.5b_hq": "Qwen2.5 (1.5B)",

@@ -348,7 +351,6 @@ To test the effect of the mix-in dataset we apply the tutorial prompt using Gemm
   title="Mix-in Dataset Effect (HQ Source)"
   desc="Figure: Effect of different mix-in datasets with fw_edu_hq as source for the tutorial prompt."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-dclm-tutorial_1b_hq": "Mix-in: DCLM",
       "mix-fw_edu_hq-tutorial_1b_hq": "Mix-in: FW-Edu (HQ)",

@@ -370,7 +372,6 @@ Does this trend hold for other source datasets? We ran the experiment for fw_edu
   title="Mix-in Dataset Effect (LQ Source)"
   desc="Figure: Effect of different mix-in datasets with fw_edu_lq as source for the tutorial prompt."
   config={{
-    defaultView: "line",
     datasetNames: {
       dclm: "DCLM",
       "mix-fw_edu_hq-tutorial_1b_lq": "Mix-in: FW-Edu (HQ)",

@@ -396,12 +397,13 @@ To investigate to what extent the source dataset for rephrasing matters we rephr
   title="Source Dataset: Tutorial (Mix-in = Source)"
   desc="Figure: Effect of source dataset choice for the tutorial prompt when mix-in equals source."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-tutorial_1b_hq": "Source: FW-Edu (HQ)",
       "mix-dclm-tutorial_1b_dclm": "Source: DCLM",
       "mix-cosmopedia-tutorial_1b_cosmopedia": "Source: Cosmopedia",
-      "mix-fw_edu_lq-tutorial_1b_lq": "Source: FW-Edu (LQ)"
+      "mix-fw_edu_lq-tutorial_1b_lq": "Source: FW-Edu (LQ)",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
     }
   }}
 />

@@ -412,12 +414,13 @@ To investigate to what extent the source dataset for rephrasing matters we rephr
   title="Source Dataset: FAQ (Mix-in = Source)"
   desc="Figure: Effect of source dataset choice for the FAQ prompt when mix-in equals source."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-dclm-faq_1b_dclm": "Source: DCLM",
       "mix-fw_edu_hq-faq_1b_hq": "Source: FW-Edu (HQ)",
       "mix-fw_edu_lq-faq_1b_lq": "Source: FW-Edu (LQ)",
-      "mix-cosmopedia-faq_1b_cosmopedia": "Source: Cosmopedia"
+      "mix-cosmopedia-faq_1b_cosmopedia": "Source: Cosmopedia",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
     }
   }}
 />

@@ -430,12 +433,13 @@ When fix the mix-in dataset to fw_edu_hq, the difference shrinks drastically for
   title="Source Dataset: Tutorial (Fixed Mix-in: FW-Edu HQ)"
   desc="Figure: Effect of source dataset for the tutorial prompt with fw_edu_hq as fixed mix-in."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-tutorial_1b_dclm": "Source: DCLM",
       "mix-fw_edu_hq-tutorial_1b_hq": "Source: FW-Edu (HQ)",
       "mix-fw_edu_hq-tutorial_1b_cosmopedia": "Source: Cosmopedia",
-      "mix-fw_edu_hq-tutorial_1b_lq": "Source: FW-Edu (LQ)"
+      "mix-fw_edu_hq-tutorial_1b_lq": "Source: FW-Edu (LQ)",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
     }
   }}
 />

@@ -446,12 +450,13 @@ When fix the mix-in dataset to fw_edu_hq, the difference shrinks drastically for
   title="Source Dataset: FAQ (Fixed Mix-in: FW-Edu HQ)"
   desc="Figure: Effect of source dataset for the FAQ prompt with fw_edu_hq as fixed mix-in."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-faq_1b_dclm": "Source: DCLM",
       "mix-fw_edu_hq-faq_1b_hq": "Source: FW-Edu (HQ)",
       "mix-fw_edu_hq-faq_1b_lq": "Source: FW-Edu (LQ)",
-      "mix-fw_edu_hq-faq_1b_cosmopedia": "Source: Cosmopedia"
+      "mix-fw_edu_hq-faq_1b_cosmopedia": "Source: Cosmopedia",
+      dclm: "DCLM",
+      fw_edu_hq: "FineWeb-Edu (HQ)"
     }
   }}
 />

@@ -466,7 +471,6 @@ We were wondering whether just training on synthetic data works. While we get in
   title="Is Synthetic Data Enough? (DCLM Source)"
   desc="Figure: Synthetic-only vs mixed training with DCLM as source."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
       dclm: "DCLM",

@@ -484,7 +488,6 @@ We were wondering whether just training on synthetic data works. While we get in
   title="Is Synthetic Data Enough? (FW-Edu HQ Source)"
   desc="Figure: Synthetic-only vs mixed training with FW-Edu (HQ) as source."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FW-Edu (HQ)",
       "mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FW-Edu (HQ)",

@@ -508,7 +511,6 @@ We were wondering whether mixing the best performing rephrasing approaches can i
   title="Mixing Rephrasing Approaches"
   desc="Figure: Mixing multiple prompts vs individual prompts."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-tutorial_1b_hq-fw_edu_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts + FW-Edu (HQ)",
       "mix-fw_edu_hq-math_1b_hq": "Math",

@@ -531,7 +533,6 @@ We rephrased using different model families and saw SmolLM2 and Falcon3 clearly
   title="Mixing Model Families"
   desc="Figure: Mixing rephrased outputs from different model families."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
       "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "SmolLM2 + Falcon3",

@@ -554,7 +555,6 @@ Maybe we need more diversity by mixing both rephrasing approaches and model fami
   title="Mixing Approaches and Model Families"
   desc="Figure: Mixing both rephrasing approaches and model families."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "FAQ (SmolLM2)",
       "mix-fw_edu_hq-faq_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "FAQ (SmolLM2) + Tutorial (Falcon3)",

@@ -580,7 +580,6 @@ The original REWIRE prompt contains many typos and grammar errors. To what exten
   title="Effect of Typos in Prompt"
   desc="Figure: REWIRE prompt with original typos vs improved version at 1B and 12B scale."
   config={{
-    defaultView: "line",
     datasetNames: {
       "mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Original (12B)",
       "mix-fw_edu_hq-guided_rewrite_improved_12b_hq": "Improved (12B)",
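For illustration, the shape every figure config in experiments.mdx follows after this commit, as a minimal sketch; the run key is taken from the hunks above, the variable name is a hypothetical stand-in, and defaultView is omitted wherever the commit removed it as redundant:

// Hypothetical sketch (not part of the diff): each datasetNames map now
// ends with the two shared baseline runs so they appear in every chart.
const exampleConfig = {
  datasetNames: {
    "mix-fw_edu_hq-math_1b_hq": "Math",  // the run under study
    dclm: "DCLM",                        // shared baseline, pinned color
    fw_edu_hq: "FineWeb-Edu (HQ)"        // shared baseline, pinned color
  }
};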
app/src/content/embeds/d3-benchmark-comparison.html
CHANGED
@@ -3,12 +3,13 @@
 
 Configuration via data-config attribute:
 {
-  "datasetNames": { "raw_name": "Display Name", ... },
-  "
-  "
-  "
-  "
-  "
+  "datasetNames": { "raw_name": "Display Name", ... }, // required
+  "pinnedColors": { "DCLM": "#333", "FineWeb-Edu (HQ)": "#86a1a9" }, // optional
+  "defaultMetric": "agg_score_macro", // optional, default: "agg_score_macro"
+  "defaultView": "bar", // optional, "bar" | "line", default: "bar"
+  "tokensPerStep": 2100000, // optional, default: 2.1e6
+  "runColumn": "runname", // optional, CSV column for series, default: "runname"
+  "stepColumn": "steps" // optional, CSV column for x-axis, default: "steps"
 }
 
 Data: uses benchmark-results.csv by default (one CSV with all runs).

@@ -184,6 +185,8 @@
 const TOKENS_PER_STEP = cfg.tokensPerStep || 2.1e6;
 const defaultMetric = cfg.defaultMetric || 'agg_score_macro';
 const defaultView = cfg.defaultView || 'bar';
+// Stable baseline colors, merged with per-chart overrides
+const PINNED_COLORS = Object.assign({ 'DCLM': '#333', 'FineWeb-Edu (HQ)': '#86a1a9' }, cfg.pinnedColors || {});
 
 // Unique ID suffix for multiple instances on same page
 const uid = Math.random().toString(36).slice(2, 8);

@@ -255,11 +258,18 @@
 }
 
 function initColors() {
-
-
-
-
-
+  if (Object.keys(colorMap).length) return;
+  const allRaw = Array.from(d3.group(allData, d => d[RUN_COL]).keys()).sort();
+  // Assign pinned colors first (keyed by display name)
+  const unpinned = [];
+  allRaw.forEach(raw => {
+    const name = displayName(raw);
+    if (PINNED_COLORS[name]) { colorMap[raw] = PINNED_COLORS[name]; }
+    else { unpinned.push(raw); }
+  });
+  // Fill remaining from categorical palette
+  const palette = getCategoricalColors(unpinned.length);
+  unpinned.forEach((raw, i) => { colorMap[raw] = palette[i % palette.length]; });
 }
 
 function showTip(html, x, y) {
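A minimal, self-contained sketch of the color assignment added above, runnable in isolation; displayName, the palette, and the example runs here are stand-ins for the embed's own helpers and data, not part of the diff:

// Baselines get fixed colors by display name; all other runs draw from a
// categorical palette in sorted order, so colors are stable across charts.
const PINNED_COLORS = { 'DCLM': '#333', 'FineWeb-Edu (HQ)': '#86a1a9' };
const datasetNames = {                                    // invented example runs
  dclm: 'DCLM',
  fw_edu_hq: 'FineWeb-Edu (HQ)',
  'mix-fw_edu_hq-math_1b_hq': 'Math'
};
const displayName = raw => datasetNames[raw] || raw;      // stand-in helper
const palette = ['#1f77b4', '#ff7f0e', '#2ca02c'];        // stand-in palette

const colorMap = {};
const unpinned = [];
for (const raw of Object.keys(datasetNames).sort()) {
  const name = displayName(raw);
  if (PINNED_COLORS[name]) colorMap[raw] = PINNED_COLORS[name]; // stable baseline color
  else unpinned.push(raw);                                      // queue for the palette
}
unpinned.forEach((raw, i) => { colorMap[raw] = palette[i % palette.length]; });

console.log(colorMap);
// { dclm: '#333', fw_edu_hq: '#86a1a9', 'mix-fw_edu_hq-math_1b_hq': '#1f77b4' }

Because the pinned colors are keyed by display name rather than by position in the data, DCLM and FineWeb-Edu (HQ) render identically in every chart regardless of how many other runs appear alongside them.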