finephrase

Running on CPU Upgrade

App Files Community

joelniklaus HF Staff committed on Feb 20

Commit

fdfc515

1 Parent(s): 621688d

prettified experiment plots

Browse files

Files changed (3) hide show

app/src/content/chapters/experiments.mdx +93 -134
app/src/content/chapters/introduction.mdx +2 -4
app/src/content/embeds/d3-benchmark-comparison.html +171 -100

app/src/content/chapters/experiments.mdx CHANGED Viewed

@@ -8,11 +8,10 @@ import FigRef from "../../components/FigRef.astro";
 {/* TODO: read through entire blog post and make improvements */}
 {/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
 {/* TODO: Integrate decay experiment as another analysis for proxy */}
-{/* TODO: ask elie about reddit post */}
-{/* TODO: draft tweet */}
-{/* TODO: ask kashif about hackernews posting */}
-{/* TODO: ask elie and merve to share it on discord channels and ask about comms in general */}
-{/* TODO: share on a bunch of discords/slacks */}
 {/*
 Notes:
@@ -40,8 +39,7 @@ We train on eight datasets under identical conditions and compare their final ev
   src="d3-benchmark-comparison.html"
   desc="Comparison of baseline datasets across different evaluation metrics. Use the dropdown to switch metrics."
   config={{
-    baselines: [],
-    datasetNames: {
       cosmopedia: "Cosmopedia",
       dclm: "DCLM",
       fw_edu_hq: "FineWeb-Edu-HQ",
@@ -67,33 +65,20 @@ The BeyondWeb dataset was never released and the paper omits key details, yet cl
 <HtmlEmbed
   id="dissecting-baselines"
   src="d3-benchmark-comparison.html"
-  desc="Individual prompt performance from existing synthetic datasets compared to DCLM and FineWeb-Edu-HQ."
   config={{
-    baselines: ["dclm", "nemotron_hq_synth", "rewire"],
-    datasetNames: {
-      "mix-fw_edu_hq-diverse_qa_pairs_1b_hq": "Diverse QA Pairs",
-      dclm: "DCLM",
-      "mix-fw_edu_hq-extract_knowledge_1b_hq": "Extract Knowledge",
-      "mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Guided Rewrite",
-      nemotron_hq_synth: "Nemotron-HQ-Synth",
-      rewire: "REWIRE",
-      "mix-fw_edu_hq-distill_1b_hq": "Distill",
-      "mix-fw_edu_hq-wikipedia_style_rephrasing_1b_hq": "Wikipedia Rephrasing",
-      "mix-fw_edu_hq-knowledge_list_1b_hq": "Knowledge List",
-      "mix-fw_edu_hq-continue_1b_hq": "Continue",
-      "mix-fw_edu_hq-summarize_1b_hq": "Summarize"
-    },
-    pinnedColors: {
-      "Nemotron-HQ-Synth": "#76b900",
-      "Diverse QA Pairs": "#c5e384",
-      "Distill": "#a0c95c",
-      "Wikipedia Rephrasing": "#7fb034",
-      "Knowledge List": "#5e960e",
-      "Extract Knowledge": "#3d6b00",
-      "REWIRE": "#1877F2",
-      "Guided Rewrite": "#6aabff",
-      "Continue (BeyondWeb)": "#e8713a",
-      "Summarize (BeyondWeb)": "#c4451c"
     }
   }}
 />
@@ -102,14 +87,14 @@ Can we design prompts that consistently beat DCLM?
 ### Can New Prompts Beat DCLM?
-Since most existing prompts fail to beat DCLM, we designed seven novel prompt formats targeting different skills ([math](#math), [table](#table), [faq](#faq), [tutorial](#tutorial), [article](#article), [commentary](#commentary), [discussion](#discussion)), all using Gemma-3-1B on FineWeb-Edu-HQ. Four prompts ([math](#math), [table](#table), [faq](#faq), [tutorial](#tutorial)) outperform both FineWeb-Edu-HQ and DCLM, while [article](#article), [commentary](#commentary), and [discussion](#discussion) are at or below DCLM level (see <FigRef target="new-prompts" />). The best-performing prompts all restructure the source content into pedagogically rich formats.
 <HtmlEmbed
   id="new-prompts"
   src="d3-benchmark-comparison.html"
-  desc="Seven new prompts compared against DCLM and FineWeb-Edu-HQ."
   config={{
-    datasetNames: {
       "mix-fw_edu_hq-math_1b_hq": "Math",
       "mix-fw_edu_hq-table_1b_hq": "Table",
       "mix-fw_edu_hq-faq_1b_hq": "FAQ",
@@ -117,8 +102,7 @@ Since most existing prompts fail to beat DCLM, we designed seven novel prompt fo
       "mix-fw_edu_hq-article_1b_hq": "Article",
       "mix-fw_edu_hq-commentary_1b_hq": "Commentary",
       "mix-fw_edu_hq-discussion_1b_hq": "Discussion",
-      dclm: "DCLM",
-      fw_edu_hq: "FineWeb-Edu-HQ"
     }
   }}
 />
@@ -149,45 +133,41 @@ It is possible that larger models produce richer or more nuanced rephrasings tha
   config={{
     setups: {
       "Gemma-3: Tutorial": {
-        datasetNames: {
           "mix-fw_edu_hq-tutorial_27b_hq": "Gemma-3 27B",
           "mix-fw_edu_hq-tutorial_12b_hq": "Gemma-3 12B",
           "mix-fw_edu_hq-tutorial_4b_hq": "Gemma-3 4B",
           "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3 1B",
           "mix-fw_edu_hq-tutorial_270m_hq": "Gemma-3 270M",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "Gemma-3: Math": {
-        datasetNames: {
           "mix-fw_edu_hq-math_27b_hq": "Gemma-3 27B",
           "mix-fw_edu_hq-math_12b_hq": "Gemma-3 12B",
           "mix-fw_edu_hq-math_4b_hq": "Gemma-3 4B",
           "mix-fw_edu_hq-math_1b_hq": "Gemma-3 1B",
           "mix-fw_edu_hq-math_270m_hq": "Gemma-3 270M",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "Gemma-3: REWIRE": {
-        datasetNames: {
           "mix-fw_edu_hq-guided_rewrite_original_27b_hq": "Gemma-3 27B",
           "mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Gemma-3 12B",
           "mix-fw_edu_hq-guided_rewrite_original_4b_hq": "Gemma-3 4B",
           "mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Gemma-3 1B",
           "mix-fw_edu_hq-guided_rewrite_original_270m_hq": "Gemma-3 270M",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "SmolLM2: Tutorial": {
-        datasetNames: {
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2 1.7B",
           "mix-fw_edu_hq-tutorial_smollm2_360m_hq": "SmolLM2 360M",
           "mix-fw_edu_hq-tutorial_smollm2_135m_hq": "SmolLM2 135M",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       }
     }
@@ -207,43 +187,39 @@ The REWIRE [@rewire] paper claims that upcycling low-quality data requires large
   config={{
     setups: {
       "Continue Prompt": {
-        datasetNames: {
           "mix-fw_edu_hq-continue_12b_hq": "12B, HQ Source",
           "mix-fw_edu_hq-continue_1b_hq": "1B, HQ Source",
           "mix-fw_edu_hq-continue_1b_lq": "1B, LQ Source",
           "mix-fw_edu_hq-continue_12b_lq": "12B, LQ Source",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "Summarize Prompt": {
-        datasetNames: {
           "mix-fw_edu_hq-summarize_1b_hq": "1B, HQ Source",
           "mix-fw_edu_hq-summarize_12b_hq": "12B, HQ Source",
           "mix-fw_edu_hq-summarize_1b_lq": "1B, LQ Source",
           "mix-fw_edu_hq-summarize_12b_lq": "12B, LQ Source",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "Tutorial Prompt": {
-        datasetNames: {
           "mix-fw_edu_hq-tutorial_1b_hq": "1B, HQ Source",
           "mix-fw_edu_hq-tutorial_12b_hq": "12B, HQ Source",
           "mix-fw_edu_hq-tutorial_12b_lq": "12B, LQ Source",
           "mix-fw_edu_hq-tutorial_1b_lq": "1B, LQ Source",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "FAQ Prompt": {
-        datasetNames: {
           "mix-fw_edu_hq-faq_1b_hq": "1B, HQ Source",
           "mix-fw_edu_hq-faq_1b_lq": "1B, LQ Source",
           "mix-fw_edu_hq-faq_12b_hq": "12B, HQ Source",
           "mix-fw_edu_hq-faq_12b_lq": "12B, LQ Source",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       }
     }
@@ -267,51 +243,47 @@ We hypothesize that SmolLM2's consistently strong rephrasing performance origina
   config={{
     setups: {
       "Tutorial Prompt": {
-        datasetNames: {
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
           "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Falcon3",
           "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3",
           "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3",
           "mix-fw_edu_hq-tutorial_granite3_1b_hq": "Granite3",
           "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "FAQ Prompt": {
-        datasetNames: {
           "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "SmolLM2",
           "mix-fw_edu_hq-faq_llama3.2_1b_hq": "Llama-3.2",
           "mix-fw_edu_hq-faq_falcon3_1b_hq": "Falcon3",
           "mix-fw_edu_hq-faq_1b_hq": "Gemma-3",
           "mix-fw_edu_hq-faq_granite3_1b_hq": "Granite3",
           "mix-fw_edu_hq-faq_qwen3_1.7b_hq": "Qwen3",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "Table Prompt": {
-        datasetNames: {
           "mix-fw_edu_hq-table_smollm2_1.7b_hq": "SmolLM2",
           "mix-fw_edu_hq-table_falcon3_1b_hq": "Falcon3",
           "mix-fw_edu_hq-table_granite3_1b_hq": "Granite3",
           "mix-fw_edu_hq-table_qwen3_1.7b_hq": "Qwen3",
           "mix-fw_edu_hq-table_llama3.2_1b_hq": "Llama-3.2",
           "mix-fw_edu_hq-table_1b_hq": "Gemma-3",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "Math Prompt": {
-        datasetNames: {
           "mix-fw_edu_hq-math_smollm2_1.7b_hq": "SmolLM2",
           "mix-fw_edu_hq-math_falcon3_1b_hq": "Falcon3",
           "mix-fw_edu_hq-math_granite3_1b_hq": "Granite3",
           "mix-fw_edu_hq-math_1b_hq": "Gemma-3",
           "mix-fw_edu_hq-math_llama3.2_1b_hq": "Llama-3.2",
           "mix-fw_edu_hq-math_qwen3_1.7b_hq": "Qwen3",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       }
     }
@@ -329,13 +301,12 @@ We compare Qwen models from versions 1.5 [@qwen], 2 [@qwen2], 2.5 [@qwen25], and
   src="d3-benchmark-comparison.html"
   desc="Qwen model generations (1.5 to 3) on the tutorial prompt."
   config={{
-    datasetNames: {
       "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3 (1.7B)",
       "mix-fw_edu_hq-tutorial_qwen2.5_1.5b_hq": "Qwen2.5 (1.5B)",
       "mix-fw_edu_hq-tutorial_qwen2_1.5b_hq": "Qwen2 (1.5B)",
-      dclm: "DCLM",
-      "mix-fw_edu_hq-tutorial_qwen1.5_1.8b_hq": "Qwen1.5 (1.8B)",
-      fw_edu_hq: "FineWeb-Edu-HQ"
     }
   }}
 />
@@ -355,7 +326,7 @@ So far we've always mixed synthetic data with a <Glossary term="source dataset"
 #### Is synthetic data enough?
-We compare synthetic-only training vs mixed training (synthetic + source) for [tutorial](#tutorial) and [faq](#faq) prompts on DCLM and FineWeb-Edu-HQ sources. Synthetic-only training beats FineWeb-Edu-HQ but falls short of both DCLM and mixed training (see <FigRef target="synthetic-only" />). Mixed training consistently improves over both the synthetic-only and original-data-only baselines.
 <HtmlEmbed
   id="synthetic-only"
@@ -364,23 +335,21 @@ We compare synthetic-only training vs mixed training (synthetic + source) for [t
   config={{
     setups: {
       "DCLM Source": {
-        datasetNames: {
           "mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
-          dclm: "DCLM",
           "mix-dclm-tutorial_1b_dclm": "Mix: Tutorial + DCLM",
           faq_1b_dclm: "FAQ Only",
-          tutorial_1b_dclm: "Tutorial Only",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "FineWeb-Edu-HQ Source": {
-        datasetNames: {
           "mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FineWeb-Edu-HQ",
           "mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FineWeb-Edu-HQ",
-          dclm: "DCLM",
           faq_1b_hq: "FAQ Only",
-          tutorial_1b_hq: "Tutorial Only",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       }
     }
@@ -391,7 +360,7 @@ So synthetic data alone does not seem to be enough. But how much does the specif
 #### Does the mix-in dataset matter?
-We apply the [tutorial](#tutorial) prompt using Gemma-3-1B on FineWeb-Edu-HQ, then mix in one of four datasets: DCLM, Cosmopedia, FineWeb-Edu-HQ, or FineWeb-Edu-LQ. Use the Setup dropdown to also see results with LQ source data. DCLM and FineWeb-Edu-HQ outperform Cosmopedia and FineWeb-Edu-LQ as mix-in datasets. Adding synthetic data improves performance for all mix-in datasets, with the effect especially pronounced for the weaker ones (see <FigRef target="mixin-dataset" />). The mix-in dataset is a major performance driver, sometimes more important than the synthetic data itself.
 <HtmlEmbed
   id="mixin-dataset"
@@ -400,31 +369,30 @@ We apply the [tutorial](#tutorial) prompt using Gemma-3-1B on FineWeb-Edu-HQ, th
   config={{
     setups: {
       "HQ Source": {
-        datasetNames: {
-          "mix-dclm-tutorial_1b_hq": "Mix-in: DCLM",
-          "mix-fw_edu_hq-tutorial_1b_hq": "Mix-in: FineWeb-Edu-HQ",
-          dclm: "DCLM",
-          "mix-fw_edu_lq-tutorial_1b_hq": "Mix-in: FineWeb-Edu-LQ",
-          "mix-cosmopedia-tutorial_1b_hq": "Mix-in: Cosmopedia",
-          fw_edu_hq: "FineWeb-Edu-HQ",
-          cosmopedia: "Cosmopedia",
-          fw_edu_lq: "FineWeb-Edu-LQ"
         }
       },
       "LQ Source": {
-        datasetNames: {
-          dclm: "DCLM",
-          "mix-fw_edu_hq-tutorial_1b_lq": "Mix-in: FineWeb-Edu-HQ",
-          "mix-dclm-tutorial_1b_lq": "Mix-in: DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ",
-          "mix-cosmopedia-tutorial_1b_lq": "Mix-in: Cosmopedia",
-          cosmopedia: "Cosmopedia",
-          "mix-fw_edu_lq-tutorial_1b_lq": "Mix-in: FineWeb-Edu-LQ",
-          fw_edu_lq: "FineWeb-Edu-LQ"
         }
       }
-    },
-    baselines: ["dclm", "fw_edu_hq", "cosmopedia", "fw_edu_lq"]
   }}
 />
@@ -441,23 +409,21 @@ We rephrase four datasets (DCLM, Cosmopedia, FineWeb-Edu-HQ, FineWeb-Edu-LQ) wit
   config={{
     setups: {
       "Tutorial Prompt": {
-        datasetNames: {
           "mix-fw_edu_hq-tutorial_1b_hq": "Source: FineWeb-Edu-HQ",
           "mix-dclm-tutorial_1b_dclm": "Source: DCLM",
           "mix-cosmopedia-tutorial_1b_cosmopedia": "Source: Cosmopedia",
           "mix-fw_edu_lq-tutorial_1b_lq": "Source: FineWeb-Edu-LQ",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "FAQ Prompt": {
-        datasetNames: {
           "mix-dclm-faq_1b_dclm": "Source: DCLM",
           "mix-fw_edu_hq-faq_1b_hq": "Source: FineWeb-Edu-HQ",
           "mix-fw_edu_lq-faq_1b_lq": "Source: FineWeb-Edu-LQ",
           "mix-cosmopedia-faq_1b_cosmopedia": "Source: Cosmopedia",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       }
     }
@@ -471,23 +437,21 @@ We rephrase four datasets (DCLM, Cosmopedia, FineWeb-Edu-HQ, FineWeb-Edu-LQ) wit
   config={{
     setups: {
       "Tutorial Prompt": {
-        datasetNames: {
           "mix-fw_edu_hq-tutorial_1b_dclm": "Source: DCLM",
           "mix-fw_edu_hq-tutorial_1b_hq": "Source: FineWeb-Edu-HQ",
           "mix-fw_edu_hq-tutorial_1b_cosmopedia": "Source: Cosmopedia",
           "mix-fw_edu_hq-tutorial_1b_lq": "Source: FineWeb-Edu-LQ",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "FAQ Prompt": {
-        datasetNames: {
           "mix-fw_edu_hq-faq_1b_dclm": "Source: DCLM",
           "mix-fw_edu_hq-faq_1b_hq": "Source: FineWeb-Edu-HQ",
           "mix-fw_edu_hq-faq_1b_lq": "Source: FineWeb-Edu-LQ",
           "mix-fw_edu_hq-faq_1b_cosmopedia": "Source: Cosmopedia",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       }
     }
@@ -511,38 +475,35 @@ Interestingly, when mixing enough different prompts together, we don't seem to n
   config={{
     setups: {
       "Mixing Prompts": {
-        datasetNames: {
           "mix-fw_edu_hq-tutorial_1b_hq-fw_edu_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts + FineWeb-Edu-HQ",
           "mix-fw_edu_hq-math_1b_hq": "Math + FineWeb-Edu-HQ",
           "mix-tutorial_1b_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts (No Source)",
           "mix-fw_edu_hq-table_1b_hq": "Table + FineWeb-Edu-HQ",
           "mix-fw_edu_hq-faq_1b_hq": "FAQ + FineWeb-Edu-HQ",
           "mix-fw_edu_hq-tutorial_1b_hq": "Tutorial + FineWeb-Edu-HQ",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "Mixing Models": {
-        datasetNames: {
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "SmolLM2 + Falcon3",
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_llama3.2_1b_hq": "SmolLM2 + Llama-3.2",
           "mix-fw_edu_hq-tutorial_llama3.2_1b_hq-tutorial_granite3_1b_hq": "Llama-3.2 + Granite3",
           "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       },
       "Mixing Both": {
-        datasetNames: {
           "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "FAQ (SmolLM2)",
           "mix-fw_edu_hq-faq_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "FAQ (SmolLM2) + Tutorial (Falcon3)",
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "Tutorial (SmolLM2)",
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "Tutorial (SmolLM2) + Tutorial (Falcon3)",
           "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Tutorial (Falcon3)",
           "mix-fw_edu_hq-faq_falcon3_1b_hq": "FAQ (Falcon3)",
-          dclm: "DCLM",
-          fw_edu_hq: "FineWeb-Edu-HQ"
         }
       }
     }
@@ -568,13 +529,12 @@ We compare REWIRE's [original prompt](#guided_rewrite_original) (with typos) aga
   src="d3-benchmark-comparison.html"
   desc="REWIRE prompt with original typos vs improved version at 1B and 12B scale."
   config={{
-    datasetNames: {
       "mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Original (12B)",
       "mix-fw_edu_hq-guided_rewrite_improved_12b_hq": "Improved (12B)",
-      dclm: "DCLM",
       "mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Original (1B)",
-      "mix-fw_edu_hq-guided_rewrite_improved_1b_hq": "Improved (1B)",
-      fw_edu_hq: "FineWeb-Edu-HQ"
     }
   }}
 />
@@ -609,4 +569,3 @@ Here are the key takeaways from our experiments:
   A: No. Typos have no negative effect on downstream performance.
 The bottom line: the details of synthetic rephrasing matter a lot, and knowing which ones matter is the key to scaling it up. Prompt design is the single biggest lever, with structured formats like Math, Table, FAQ, and Tutorial consistently beating curated baselines. But equally important is knowing where you can cut corners without losing quality. You don't need a large rephrasing model (1B is enough for simple prompts, 4B for complex ones). You don't need pristine source data (even low-quality sources work with a strong mix-in). Smaller models generate faster, directly translating into higher throughput. And tolerating lower-quality sources opens up a much bigger and more diverse data pool to draw from. The practical recipe is straightforward: pick a strong structured prompt, use the smallest model that handles it, blend with high-quality original data, and spend your remaining compute on volume.

 {/* TODO: read through entire blog post and make improvements */}
 {/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
 {/* TODO: Integrate decay experiment as another analysis for proxy */}
+{/* TODO: share on a bunch of discords/slacks/hackernews/locallama */}
+{/* TODO: brainstorm better banner, be artsy */}
+{/* TODO: only explain datatrove additions when we need them (for generating the final finephrase) */}
+{/* TODO: move infrastructure section after analyses as precursor and explanation for finephrase */}
 {/*
 Notes:
   src="d3-benchmark-comparison.html"
   desc="Comparison of baseline datasets across different evaluation metrics. Use the dropdown to switch metrics."
   config={{
+    datasets: {
       cosmopedia: "Cosmopedia",
       dclm: "DCLM",
       fw_edu_hq: "FineWeb-Edu-HQ",
 <HtmlEmbed
   id="dissecting-baselines"
   src="d3-benchmark-comparison.html"
+  desc="Individual prompt performance from existing synthetic datasets compared to the DCLM baseline."
   config={{
+    datasets: {
+      "mix-fw_edu_hq-diverse_qa_pairs_1b_hq": { display: "Diverse QA Pairs", color: "#c5e384" },
+      dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
+      "mix-fw_edu_hq-extract_knowledge_1b_hq": { display: "Extract Knowledge", color: "#3d6b00" },
+      "mix-fw_edu_hq-guided_rewrite_original_1b_hq": { display: "Guided Rewrite", color: "#6aabff" },
+      nemotron_hq_synth: { display: "Nemotron-HQ-Synth", color: "#76b900", shaded: true },
+      rewire: { display: "REWIRE", color: "#1877F2", shaded: true },
+      "mix-fw_edu_hq-distill_1b_hq": { display: "Distill", color: "#a0c95c" },
+      "mix-fw_edu_hq-wikipedia_style_rephrasing_1b_hq": { display: "Wikipedia Rephrasing", color: "#7fb034" },
+      "mix-fw_edu_hq-knowledge_list_1b_hq": { display: "Knowledge List", color: "#5e960e" },
+      "mix-fw_edu_hq-continue_1b_hq": { display: "Continue", color: "#e8713a" },
+      "mix-fw_edu_hq-summarize_1b_hq": { display: "Summarize", color: "#c4451c" }
     }
   }}
 />
 ### Can New Prompts Beat DCLM?
+Since most existing prompts fail to beat DCLM, we designed seven novel prompt formats targeting different skills ([math](#math), [table](#table), [faq](#faq), [tutorial](#tutorial), [article](#article), [commentary](#commentary), [discussion](#discussion)), all using Gemma-3-1B on FineWeb-Edu-HQ. Four prompts ([math](#math), [table](#table), [faq](#faq), [tutorial](#tutorial)) outperform DCLM, while [article](#article), [commentary](#commentary), and [discussion](#discussion) are at or below DCLM level (see <FigRef target="new-prompts" />). The best-performing prompts all restructure the source content into pedagogically rich formats.
 <HtmlEmbed
   id="new-prompts"
   src="d3-benchmark-comparison.html"
+  desc="Seven new prompts compared against the DCLM baseline."
   config={{
+    datasets: {
       "mix-fw_edu_hq-math_1b_hq": "Math",
       "mix-fw_edu_hq-table_1b_hq": "Table",
       "mix-fw_edu_hq-faq_1b_hq": "FAQ",
       "mix-fw_edu_hq-article_1b_hq": "Article",
       "mix-fw_edu_hq-commentary_1b_hq": "Commentary",
       "mix-fw_edu_hq-discussion_1b_hq": "Discussion",
+      dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
     }
   }}
 />
   config={{
     setups: {
       "Gemma-3: Tutorial": {
+        datasets: {
           "mix-fw_edu_hq-tutorial_27b_hq": "Gemma-3 27B",
           "mix-fw_edu_hq-tutorial_12b_hq": "Gemma-3 12B",
           "mix-fw_edu_hq-tutorial_4b_hq": "Gemma-3 4B",
           "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3 1B",
           "mix-fw_edu_hq-tutorial_270m_hq": "Gemma-3 270M",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "Gemma-3: Math": {
+        datasets: {
           "mix-fw_edu_hq-math_27b_hq": "Gemma-3 27B",
           "mix-fw_edu_hq-math_12b_hq": "Gemma-3 12B",
           "mix-fw_edu_hq-math_4b_hq": "Gemma-3 4B",
           "mix-fw_edu_hq-math_1b_hq": "Gemma-3 1B",
           "mix-fw_edu_hq-math_270m_hq": "Gemma-3 270M",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "Gemma-3: REWIRE": {
+        datasets: {
           "mix-fw_edu_hq-guided_rewrite_original_27b_hq": "Gemma-3 27B",
           "mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Gemma-3 12B",
           "mix-fw_edu_hq-guided_rewrite_original_4b_hq": "Gemma-3 4B",
           "mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Gemma-3 1B",
           "mix-fw_edu_hq-guided_rewrite_original_270m_hq": "Gemma-3 270M",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "SmolLM2: Tutorial": {
+        datasets: {
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2 1.7B",
           "mix-fw_edu_hq-tutorial_smollm2_360m_hq": "SmolLM2 360M",
           "mix-fw_edu_hq-tutorial_smollm2_135m_hq": "SmolLM2 135M",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       }
     }
   config={{
     setups: {
       "Continue Prompt": {
+        datasets: {
           "mix-fw_edu_hq-continue_12b_hq": "12B, HQ Source",
           "mix-fw_edu_hq-continue_1b_hq": "1B, HQ Source",
           "mix-fw_edu_hq-continue_1b_lq": "1B, LQ Source",
           "mix-fw_edu_hq-continue_12b_lq": "12B, LQ Source",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "Summarize Prompt": {
+        datasets: {
           "mix-fw_edu_hq-summarize_1b_hq": "1B, HQ Source",
           "mix-fw_edu_hq-summarize_12b_hq": "12B, HQ Source",
           "mix-fw_edu_hq-summarize_1b_lq": "1B, LQ Source",
           "mix-fw_edu_hq-summarize_12b_lq": "12B, LQ Source",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "Tutorial Prompt": {
+        datasets: {
           "mix-fw_edu_hq-tutorial_1b_hq": "1B, HQ Source",
           "mix-fw_edu_hq-tutorial_12b_hq": "12B, HQ Source",
           "mix-fw_edu_hq-tutorial_12b_lq": "12B, LQ Source",
           "mix-fw_edu_hq-tutorial_1b_lq": "1B, LQ Source",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "FAQ Prompt": {
+        datasets: {
           "mix-fw_edu_hq-faq_1b_hq": "1B, HQ Source",
           "mix-fw_edu_hq-faq_1b_lq": "1B, LQ Source",
           "mix-fw_edu_hq-faq_12b_hq": "12B, HQ Source",
           "mix-fw_edu_hq-faq_12b_lq": "12B, LQ Source",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       }
     }
   config={{
     setups: {
       "Tutorial Prompt": {
+        datasets: {
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
           "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Falcon3",
           "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3",
           "mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3",
           "mix-fw_edu_hq-tutorial_granite3_1b_hq": "Granite3",
           "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "FAQ Prompt": {
+        datasets: {
           "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "SmolLM2",
           "mix-fw_edu_hq-faq_llama3.2_1b_hq": "Llama-3.2",
           "mix-fw_edu_hq-faq_falcon3_1b_hq": "Falcon3",
           "mix-fw_edu_hq-faq_1b_hq": "Gemma-3",
           "mix-fw_edu_hq-faq_granite3_1b_hq": "Granite3",
           "mix-fw_edu_hq-faq_qwen3_1.7b_hq": "Qwen3",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "Table Prompt": {
+        datasets: {
           "mix-fw_edu_hq-table_smollm2_1.7b_hq": "SmolLM2",
           "mix-fw_edu_hq-table_falcon3_1b_hq": "Falcon3",
           "mix-fw_edu_hq-table_granite3_1b_hq": "Granite3",
           "mix-fw_edu_hq-table_qwen3_1.7b_hq": "Qwen3",
           "mix-fw_edu_hq-table_llama3.2_1b_hq": "Llama-3.2",
           "mix-fw_edu_hq-table_1b_hq": "Gemma-3",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "Math Prompt": {
+        datasets: {
           "mix-fw_edu_hq-math_smollm2_1.7b_hq": "SmolLM2",
           "mix-fw_edu_hq-math_falcon3_1b_hq": "Falcon3",
           "mix-fw_edu_hq-math_granite3_1b_hq": "Granite3",
           "mix-fw_edu_hq-math_1b_hq": "Gemma-3",
           "mix-fw_edu_hq-math_llama3.2_1b_hq": "Llama-3.2",
           "mix-fw_edu_hq-math_qwen3_1.7b_hq": "Qwen3",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       }
     }
   src="d3-benchmark-comparison.html"
   desc="Qwen model generations (1.5 to 3) on the tutorial prompt."
   config={{
+    datasets: {
       "mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3 (1.7B)",
       "mix-fw_edu_hq-tutorial_qwen2.5_1.5b_hq": "Qwen2.5 (1.5B)",
       "mix-fw_edu_hq-tutorial_qwen2_1.5b_hq": "Qwen2 (1.5B)",
+      dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
+      "mix-fw_edu_hq-tutorial_qwen1.5_1.8b_hq": "Qwen1.5 (1.8B)"
     }
   }}
 />
 #### Is synthetic data enough?
+We compare synthetic-only training vs mixed training (synthetic + source) for [tutorial](#tutorial) and [faq](#faq) prompts on DCLM and FineWeb-Edu-HQ sources. Synthetic-only training falls short of both DCLM and mixed training (see <FigRef target="synthetic-only" />). Mixed training consistently improves over both the synthetic-only and original-data-only baselines.
 <HtmlEmbed
   id="synthetic-only"
   config={{
     setups: {
       "DCLM Source": {
+        datasets: {
           "mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
           "mix-dclm-tutorial_1b_dclm": "Mix: Tutorial + DCLM",
           faq_1b_dclm: "FAQ Only",
+          tutorial_1b_dclm: "Tutorial Only"
         }
       },
       "FineWeb-Edu-HQ Source": {
+        datasets: {
           "mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FineWeb-Edu-HQ",
           "mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FineWeb-Edu-HQ",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
           faq_1b_hq: "FAQ Only",
+          tutorial_1b_hq: "Tutorial Only"
         }
       }
     }
 #### Does the mix-in dataset matter?
+We apply the [tutorial](#tutorial) prompt using Gemma-3-1B on FineWeb-Edu-HQ, then mix in one of four datasets: DCLM, Cosmopedia, FineWeb-Edu-HQ, or FineWeb-Edu-LQ. Use the Setup dropdown to also see results with LQ source data. DCLM outperforms other mix-in datasets. Adding synthetic data improves performance for all mix-in datasets, with the effect especially pronounced for the weaker ones (see <FigRef target="mixin-dataset" />). The mix-in dataset is a major performance driver, sometimes more important than the synthetic data itself.
 <HtmlEmbed
   id="mixin-dataset"
   config={{
     setups: {
       "HQ Source": {
+        datasets: {
+          "mix-dclm-tutorial_1b_hq": { display: "Mix-in: DCLM", color: "#4e79a7" },
+          "mix-fw_edu_hq-tutorial_1b_hq": { display: "Mix-in: FineWeb-Edu-HQ", color: "#59a14f" },
+          dclm: { display: "DCLM", color: "#4e79a7", shaded: true },
+          "mix-fw_edu_lq-tutorial_1b_hq": { display: "Mix-in: FineWeb-Edu-LQ", color: "#e15759" },
+          "mix-cosmopedia-tutorial_1b_hq": { display: "Mix-in: Cosmopedia", color: "#f28e2b" },
+          cosmopedia: { display: "Cosmopedia", color: "#f28e2b", shaded: true },
+          fw_edu_hq: { display: "FineWeb-Edu-HQ", color: "#59a14f", shaded: true },
+          fw_edu_lq: { display: "FineWeb-Edu-LQ", color: "#e15759", shaded: true }
         }
       },
       "LQ Source": {
+        datasets: {
+          dclm: { display: "DCLM", color: "#4e79a7", shaded: true },
+          "mix-fw_edu_hq-tutorial_1b_lq": { display: "Mix-in: FineWeb-Edu-HQ", color: "#59a14f" },
+          "mix-dclm-tutorial_1b_lq": { display: "Mix-in: DCLM", color: "#4e79a7" },
+          "mix-cosmopedia-tutorial_1b_lq": { display: "Mix-in: Cosmopedia", color: "#f28e2b" },
+          cosmopedia: { display: "Cosmopedia", color: "#f28e2b", shaded: true },
+          "mix-fw_edu_lq-tutorial_1b_lq": { display: "Mix-in: FineWeb-Edu-LQ", color: "#e15759" },
+          fw_edu_hq: { display: "FineWeb-Edu-HQ", color: "#59a14f", shaded: true },
+          fw_edu_lq: { display: "FineWeb-Edu-LQ", color: "#e15759", shaded: true }
         }
       }
+    }
   }}
 />
   config={{
     setups: {
       "Tutorial Prompt": {
+        datasets: {
           "mix-fw_edu_hq-tutorial_1b_hq": "Source: FineWeb-Edu-HQ",
           "mix-dclm-tutorial_1b_dclm": "Source: DCLM",
           "mix-cosmopedia-tutorial_1b_cosmopedia": "Source: Cosmopedia",
           "mix-fw_edu_lq-tutorial_1b_lq": "Source: FineWeb-Edu-LQ",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "FAQ Prompt": {
+        datasets: {
           "mix-dclm-faq_1b_dclm": "Source: DCLM",
           "mix-fw_edu_hq-faq_1b_hq": "Source: FineWeb-Edu-HQ",
           "mix-fw_edu_lq-faq_1b_lq": "Source: FineWeb-Edu-LQ",
           "mix-cosmopedia-faq_1b_cosmopedia": "Source: Cosmopedia",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       }
     }
   config={{
     setups: {
       "Tutorial Prompt": {
+        datasets: {
           "mix-fw_edu_hq-tutorial_1b_dclm": "Source: DCLM",
           "mix-fw_edu_hq-tutorial_1b_hq": "Source: FineWeb-Edu-HQ",
           "mix-fw_edu_hq-tutorial_1b_cosmopedia": "Source: Cosmopedia",
           "mix-fw_edu_hq-tutorial_1b_lq": "Source: FineWeb-Edu-LQ",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "FAQ Prompt": {
+        datasets: {
           "mix-fw_edu_hq-faq_1b_dclm": "Source: DCLM",
           "mix-fw_edu_hq-faq_1b_hq": "Source: FineWeb-Edu-HQ",
           "mix-fw_edu_hq-faq_1b_lq": "Source: FineWeb-Edu-LQ",
           "mix-fw_edu_hq-faq_1b_cosmopedia": "Source: Cosmopedia",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       }
     }
   config={{
     setups: {
       "Mixing Prompts": {
+        datasets: {
           "mix-fw_edu_hq-tutorial_1b_hq-fw_edu_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts + FineWeb-Edu-HQ",
           "mix-fw_edu_hq-math_1b_hq": "Math + FineWeb-Edu-HQ",
           "mix-tutorial_1b_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts (No Source)",
           "mix-fw_edu_hq-table_1b_hq": "Table + FineWeb-Edu-HQ",
           "mix-fw_edu_hq-faq_1b_hq": "FAQ + FineWeb-Edu-HQ",
           "mix-fw_edu_hq-tutorial_1b_hq": "Tutorial + FineWeb-Edu-HQ",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "Mixing Models": {
+        datasets: {
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "SmolLM2 + Falcon3",
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_llama3.2_1b_hq": "SmolLM2 + Llama-3.2",
           "mix-fw_edu_hq-tutorial_llama3.2_1b_hq-tutorial_granite3_1b_hq": "Llama-3.2 + Granite3",
           "mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       },
       "Mixing Both": {
+        datasets: {
           "mix-fw_edu_hq-faq_smollm2_1.7b_hq": "FAQ (SmolLM2)",
           "mix-fw_edu_hq-faq_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "FAQ (SmolLM2) + Tutorial (Falcon3)",
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "Tutorial (SmolLM2)",
           "mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "Tutorial (SmolLM2) + Tutorial (Falcon3)",
           "mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Tutorial (Falcon3)",
           "mix-fw_edu_hq-faq_falcon3_1b_hq": "FAQ (Falcon3)",
+          dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
         }
       }
     }
   src="d3-benchmark-comparison.html"
   desc="REWIRE prompt with original typos vs improved version at 1B and 12B scale."
   config={{
+    datasets: {
       "mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Original (12B)",
       "mix-fw_edu_hq-guided_rewrite_improved_12b_hq": "Improved (12B)",
+      dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
       "mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Original (1B)",
+      "mix-fw_edu_hq-guided_rewrite_improved_1b_hq": "Improved (1B)"
     }
   }}
 />
   A: No. Typos have no negative effect on downstream performance.
 The bottom line: the details of synthetic rephrasing matter a lot, and knowing which ones matter is the key to scaling it up. Prompt design is the single biggest lever, with structured formats like Math, Table, FAQ, and Tutorial consistently beating curated baselines. Nearly as important is knowing where you can cut corners without losing quality. You don't need a large rephrasing model (1B is enough for simple prompts, 4B for complex ones). You don't need pristine source data (even low-quality sources work with a strong mix-in). Smaller models generate faster, directly translating into higher throughput. And tolerating lower-quality sources opens up a much bigger and more diverse data pool to draw from. The practical recipe is straightforward: pick a strong structured prompt, use the smallest model that handles it, blend with high-quality original data, and spend your remaining compute on volume.

app/src/content/chapters/introduction.mdx CHANGED Viewed

@@ -44,11 +44,9 @@ Here's a preview of where we end up: FinePhrase, our best configuration, clearly
   desc="FinePhrase compared against synthetic data baselines across evaluation metrics."
   config={{
     defaultView: "line",
-    pinnedColors: { "FinePhrase": "#EBA937" },
-    baselines: ["cosmopedia", "nemotron_hq_synth", "rewire", "synth_query_reasoning_answer"],
-    datasetNames: {
       cosmopedia: "Cosmopedia",
-      "mix-fw_edu_hq-table_smollm2_1.7b_hq": "FinePhrase",
       nemotron_hq_synth: "Nemotron-HQ-Synth",
       rewire: "REWIRE",
       synth_query_reasoning_answer: "SYNTH"

   desc="FinePhrase compared against synthetic data baselines across evaluation metrics."
   config={{
     defaultView: "line",
+    datasets: {
       cosmopedia: "Cosmopedia",
+      "mix-fw_edu_hq-table_smollm2_1.7b_hq": { display: "FinePhrase", color: "#EBA937" },
       nemotron_hq_synth: "Nemotron-HQ-Synth",
       rewire: "REWIRE",
       synth_query_reasoning_answer: "SYNTH"

app/src/content/embeds/d3-benchmark-comparison.html CHANGED Viewed

@@ -3,29 +3,38 @@
   Configuration via data-config attribute:
   {
-    "datasetNames":   { "raw_name": "Display Name", ... },                // required (unless using setups)
-    "setups":         { "Setup Label": { "datasetNames": {...} }, ... },   // optional, multi-setup mode with dropdown + average
-    "pinnedColors":   { "DCLM": "#333", "FineWeb-Edu (HQ)": "#86a1a9" },  // optional
-    "baselines":      ["dclm", "fw_edu_hq"],                              // optional, raw keys for baseline datasets (dashed lines, striped bars). Default: ["dclm", "fw_edu_hq"]
-    "defaultMetric":  "agg_score_macro",                                  // optional, default: "agg_score_macro"
-    "defaultView":    "bar",                                              // optional, "bar" | "line", default: "bar"
-    "tokensPerStep":  2100000,                                            // optional, default: 2.1e6
-    "runColumn":      "runname",                                          // optional, CSV column for series, default: "runname"
-    "stepColumn":     "steps"                                             // optional, CSV column for x-axis, default: "steps"
   }
   Data: uses benchmark-results.csv by default (one CSV with all runs).
-  Only rows matching keys in datasetNames are displayed.
   Example usage in MDX:
   <HtmlEmbed
     src="d3-benchmark-comparison.html"
     title="Baseline Comparison"
     config={{
-      datasetNames: {
         cosmopedia: "Cosmopedia",
-        dclm: "DCLM",
-        fw_edu_hq: "FineWeb-Edu (HQ)"
       }
     }}
   />
@@ -107,12 +116,9 @@
   .d3-benchmark-comparison .bar.ghost { opacity: .25; }
   .d3-benchmark-comparison .value-label.ghost { opacity: .25; }
   .d3-benchmark-comparison .line-path { fill: none; stroke-width: 2; opacity: 0.85; }
-  .d3-benchmark-comparison .line-path.baseline { stroke-dasharray: 6,4; opacity: 0.5; }
-  .d3-benchmark-comparison .line-path.baseline.ghost { opacity: .1; }
   .d3-benchmark-comparison .line-path.ghost { opacity: .15; }
-  .d3-benchmark-comparison .line-dot.baseline { opacity: 0.5; }
-  .d3-benchmark-comparison .line-dot.baseline.ghost { opacity: .1; }
   .d3-benchmark-comparison .line-dot.ghost { opacity: .15; }
   .d3-benchmark-comparison .axes path { display: none; }
   .d3-benchmark-comparison .axes line { stroke: var(--axis-color); }
   .d3-benchmark-comparison .axes text { fill: var(--tick-color); }
@@ -183,14 +189,24 @@
         if (raw && raw.trim()) cfg = raw.trim().startsWith('{') ? JSON.parse(raw) : {};
       } catch (_) {}
-      // Configurable settings with defaults
       // ─── SETUP SUPPORT ───
       const SETUPS = cfg.setups || null;
       const setupNames = SETUPS ? Object.keys(SETUPS) : [];
       let currentSetup = SETUPS ? setupNames[0] : null;
-      let DATASET_NAMES = SETUPS ? { ...SETUPS[setupNames[0]].datasetNames } : (cfg.datasetNames || {});
       const AVG_SETUP_KEY = 'Average (all setups)';
-      let avgDatasetNames = {};
       let parsedData = [];
       const RUN_COL       = cfg.runColumn    || 'runname';
@@ -198,21 +214,15 @@
       const TOKENS_PER_STEP = cfg.tokensPerStep || 2.1e6;
       const defaultMetric = cfg.defaultMetric || 'agg_score_macro';
       const defaultView   = cfg.defaultView   || 'bar';
-      // Stable baseline colors, merged with per-chart overrides
-      const PINNED_COLORS = Object.assign({ 'DCLM': '#8b8b8b', 'FineWeb-Edu (HQ)': '#86a1a9' }, cfg.pinnedColors || {});
-      // Unique ID suffix for multiple instances on same page
       const uid = Math.random().toString(36).slice(2, 8);
-      // Baseline datasets: dashed lines, striped bars, reduced opacity
-      const BASELINES = new Set(cfg.baselines || ['dclm', 'fw_edu_hq']);
-      function isBaseline(raw) { return BASELINES.has(raw); }
       /**
        * Builds the SVG pattern id used for a run's striped fill.
        * The id combines this chart instance's `uid` with the raw run name,
        * where every character outside [a-zA-Z0-9] is replaced by "_".
        */
       function stripePatternId(raw) {
         const safeName = raw.replace(/[^a-zA-Z0-9]/g, '_');
         return `stripe-${uid}-${safeName}`;
       }
-      function barFill(d) {
-        if (isBaseline(d.rawName)) return `url(#${stripePatternId(d.rawName)})`;
-        return colorMap[d.rawName] || 'var(--primary-color)';
-      }
-      // Standard metric display names (shared across all CSVs from this benchmark suite)
       const METRIC_NAMES = {
         'agg_score_macro': 'Aggregate Score (Macro)',
         'agg_score_micro': 'Aggregate Score (Micro)',
@@ -251,14 +261,13 @@
       // State
       let allData = [];
-      let metricKeys = []; // auto-detected from CSV columns
       let currentMetric = defaultMetric;
       let currentView = defaultView;
       let colorMap = {};
       let highlight = null;
       // ─── HELPERS ───
-      function displayName(raw) { return DATASET_NAMES[raw] || raw; }
       /**
        * Returns the display label for a metric key.
        * Falls back to the key itself when METRIC_NAMES has no (truthy) entry for it.
        */
       function metricName(key) {
         const label = METRIC_NAMES[key];
         return label ? label : key;
       }
       /**
        * Converts a training step count into a token count
        * by multiplying it with TOKENS_PER_STEP.
        */
       function stepsToTokens(step) {
         const tokens = step * TOKENS_PER_STEP;
         return tokens;
       }
@@ -282,49 +291,46 @@
       function initColors() {
         if (Object.keys(colorMap).length) return;
         const allRaw = Array.from(d3.group(allData, d => d[RUN_COL]).keys()).sort();
-        // Assign pinned colors first (keyed by display name)
         const unpinned = [];
         allRaw.forEach(raw => {
-          const name = displayName(raw);
-          if (PINNED_COLORS[name]) { colorMap[raw] = PINNED_COLORS[name]; }
           else { unpinned.push(raw); }
         });
-        // Fill remaining from categorical palette
         const palette = getCategoricalColors(unpinned.length);
         unpinned.forEach((raw, i) => { colorMap[raw] = palette[i % palette.length]; });
       }
       // ─── SETUP HELPERS ───
       function filterData() {
-        const knownNames = Object.keys(DATASET_NAMES);
         allData = knownNames.length ? parsedData.filter(r => knownNames.includes(r[RUN_COL])) : parsedData;
         allData.columns = parsedData.columns;
       }
       function computeAverageData(rawData) {
-        if (!SETUPS || setupNames.length < 2) return { data: [], datasetNames: {} };
-        // Build mapping: displayName -> [rawName1, rawName2, ...]
         const displayToRaws = {};
         for (const sName of setupNames) {
-          const dn = SETUPS[sName].datasetNames;
-          for (const [raw, display] of Object.entries(dn)) {
-            if (!displayToRaws[display]) displayToRaws[display] = [];
-            displayToRaws[display].push(raw);
           }
         }
-        // Only average display names that appear in ALL setups
         const fullDisplay = Object.entries(displayToRaws)
           .filter(([, raws]) => raws.length >= setupNames.length);
-        // Index raw data by runname+step for fast lookup
         const byRunStep = {};
         for (const row of rawData) byRunStep[row[RUN_COL] + '|' + row[STEP_COL]] = row;
         const steps = Array.from(new Set(rawData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
         const cols = rawData.columns || Object.keys(rawData[0] || {});
         const result = [];
-        const dnMap = {};
         for (const [display, raws] of fullDisplay) {
           const avgRaw = '__avg__' + display.replace(/[^a-zA-Z0-9]/g, '_');
-          dnMap[avgRaw] = display;
           for (const step of steps) {
             const rows = raws.map(r => byRunStep[r + '|' + step]).filter(Boolean);
             if (!rows.length) continue;
@@ -337,26 +343,23 @@
             result.push(avgRow);
           }
         }
-        return { data: result, datasetNames: dnMap };
       }
       function switchSetup(name) {
         currentSetup = name;
         if (name === AVG_SETUP_KEY) {
-          DATASET_NAMES = { ...avgDatasetNames };
         } else {
-          DATASET_NAMES = { ...SETUPS[name].datasetNames };
         }
-        // Re-add baselines that may be shared across setups
-        const baselineNames = cfg.baselines || ['dclm', 'fw_edu_hq'];
-        for (const bRaw of baselineNames) {
-          if (parsedData.some(r => r[RUN_COL] === bRaw) && !DATASET_NAMES[bRaw]) {
-            // Find display name from any setup or use raw
-            let bDisplay = bRaw;
-            for (const sName of setupNames) {
-              if (SETUPS[sName].datasetNames[bRaw]) { bDisplay = SETUPS[sName].datasetNames[bRaw]; break; }
             }
-            DATASET_NAMES[bRaw] = bDisplay;
           }
         }
         colorMap = {};
@@ -384,6 +387,10 @@
         gRoot.selectAll('text.value-label').classed('ghost', d => highlight && d.name !== highlight);
         gRoot.selectAll('.line-path').classed('ghost', d => highlight && d.name !== highlight);
         gRoot.selectAll('.line-dot').classed('ghost', d => highlight && d.name !== highlight);
         container.querySelectorAll('.legend .item').forEach(el => {
           el.classList.toggle('ghost', highlight && el.getAttribute('data-name') !== highlight);
         });
@@ -392,7 +399,6 @@
       // ─── AUTO-DETECT METRICS from CSV columns ───
       function detectMetrics(columns) {
         const skip = new Set([RUN_COL, STEP_COL, 'seed']);
-        // Ordered: aggregate first, then individual
         const aggOrder = ['agg_score_macro', 'agg_score_micro', 'agg_score_RC', 'agg_score_GK', 'agg_score_NLU', 'agg_score_MATH', 'agg_score_TABLE', 'agg_score_RES'];
         const agg = aggOrder.filter(k => columns.includes(k));
         const ind = columns.filter(k => !skip.has(k) && !agg.includes(k) && !isNaN(+allData[0][k]));
@@ -402,7 +408,8 @@
       // ─── BAR CHART ───
       function renderBar() {
         const width = container.clientWidth || 800;
-        const margin = { top: 12, right: 56, bottom: 32, left: 190 };
         const grouped = d3.group(allData, d => d[RUN_COL]);
         const finalData = [];
@@ -413,8 +420,11 @@
         }
         finalData.sort((a, b) => b.value - a.value);
         const barHeight = 28, barGap = 8;
-        const height = margin.top + margin.bottom + finalData.length * (barHeight + barGap);
         svg.attr('width', width).attr('height', height);
         gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
@@ -422,7 +432,7 @@
         const innerHeight = height - margin.top - margin.bottom;
         const x = d3.scaleLinear().domain([0, d3.max(finalData, d => d.value) * 1.05]).range([0, innerWidth]);
-        const y = d3.scaleBand().domain(finalData.map(d => d.name)).range([0, innerHeight]).padding(0.2);
         // Grid
         gRoot.selectAll('.grid').data([0]).join('g').attr('class', 'grid').call(g => {
@@ -447,10 +457,9 @@
             g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
           });
-        // Bars
-        // Stripe patterns for baseline bars
-        finalData.forEach(d => {
-          if (!isBaseline(d.rawName)) return;
           const c = colorMap[d.rawName] || '#999';
           const pat = defs.append('pattern').attr('id', stripePatternId(d.rawName))
             .attr('width', 6).attr('height', 6).attr('patternUnits', 'userSpaceOnUse').attr('patternTransform', 'rotate(45)');
@@ -458,11 +467,17 @@
           pat.append('line').attr('x1', 0).attr('y1', 0).attr('x2', 0).attr('y2', 6).attr('stroke', c).attr('stroke-width', 2.5);
         });
         const barTip = (ev, d) => {
           const [mx, my] = d3.pointer(ev, container);
-          showTip(`<strong>${d.name}</strong><br/>${metricName(currentMetric)}: <strong>${d.value.toFixed(4)}</strong>`, mx, my);
         };
-        gRoot.selectAll('rect.bar').data(finalData, d => d.name).join(
           enter => enter.append('rect').attr('class', 'bar')
             .attr('x', 0).attr('y', d => y(d.name)).attr('height', y.bandwidth()).attr('rx', 3)
             .attr('fill', d => barFill(d))
@@ -483,14 +498,39 @@
         );
         // Value labels
-        gRoot.selectAll('text.value-label').data(finalData, d => d.name).join(
           enter => enter.append('text').attr('class', 'value-label')
             .attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
             .attr('dy', '0.35em').attr('fill', 'var(--text-color)').attr('font-size', 11)
-            .text(d => d.value.toFixed(4)),
           update => update.transition().duration(300)
             .attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
-            .text(d => d.value.toFixed(4)),
           exit => exit.remove()
         );
       }
@@ -498,6 +538,7 @@
       // ─── LINE CHART ───
       function renderLine() {
         const width = container.clientWidth || 800;
         const margin = { top: 16, right: 50, bottom: 48, left: 60 };
         const height = Math.max(300, Math.round(width / 2.5));
         svg.attr('width', width).attr('height', height);
@@ -509,13 +550,20 @@
         // Build series
         const grouped = d3.group(allData, d => d[RUN_COL]);
         const series = [];
         for (const [raw, rows] of grouped) {
           const pts = rows.map(r => ({ step: +r[STEP_COL], value: +r[currentMetric] })).sort((a, b) => a.step - b.step);
-          series.push({ name: displayName(raw), rawName: raw, values: pts });
         }
-        const allSteps = Array.from(new Set(allData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
-        const allValues = series.flatMap(s => s.values.map(v => v.value));
         const x = d3.scaleLinear().domain(d3.extent(allSteps)).range([0, innerWidth]);
         const yMin = d3.min(allValues), yMax = d3.max(allValues), yPad = (yMax - yMin) * 0.08;
@@ -555,28 +603,52 @@
           .attr('text-anchor', 'middle').attr('fill', 'var(--text-color)').attr('font-size', 12)
           .text(metricName(currentMetric));
-        // Lines
         const line = d3.line().x(d => x(d.step)).y(d => y(d.value)).curve(d3.curveMonotoneX);
         gRoot.selectAll('.line-path').data(series, d => d.name).join(
-          enter => enter.append('path').attr('class', d => 'line-path' + (isBaseline(d.rawName) ? ' baseline' : ''))
             .attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
             .attr('d', d => line(d.values)),
-          update => update.attr('class', d => 'line-path' + (isBaseline(d.rawName) ? ' baseline' : ''))
-            .transition().duration(300)
             .attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
             .attr('d', d => line(d.values)),
           exit => exit.remove()
         );
-        // Dots
         const dotData = series.flatMap(s => s.values.map(v => ({ name: s.name, rawName: s.rawName, step: v.step, value: v.value })));
         gRoot.selectAll('.line-dot').data(dotData, d => d.name + '-' + d.step).join(
-          enter => enter.append('circle').attr('class', d => 'line-dot' + (isBaseline(d.rawName) ? ' baseline' : ''))
             .attr('cx', d => x(d.step)).attr('cy', d => y(d.value)).attr('r', 3)
             .attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)')
             .attr('stroke', 'var(--surface-bg)').attr('stroke-width', 1),
-          update => update.attr('class', d => 'line-dot' + (isBaseline(d.rawName) ? ' baseline' : ''))
-            .transition().duration(300)
             .attr('cx', d => x(d.step)).attr('cy', d => y(d.value))
             .attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)'),
           exit => exit.remove()
@@ -597,11 +669,15 @@
             const entries = series.map(s => {
               const pt = s.values.find(v => v.step === nearest);
               return pt ? { name: s.name, rawName: s.rawName, value: pt.value } : null;
-            }).filter(Boolean).sort((a, b) => b.value - a.value);
             let html = `<div style="font-weight:700;margin-bottom:4px;">${stepLabelLong(nearest)}</div>`;
             entries.forEach(e => {
-              html += `<div><span class="tip-dot" style="background:${colorMap[e.rawName]}"></span>${e.name}: <strong>${e.value.toFixed(4)}</strong></div>`;
             });
             const [cx, cy] = d3.pointer(ev, container);
             showTip(html, cx, cy);
@@ -625,7 +701,6 @@
       function buildUI() {
         const controls = document.createElement('div'); controls.className = 'controls';
-        // Setup selector (only shown when setups config is present)
         if (SETUPS && setupNames.length > 0) {
           const setupGroup = document.createElement('div'); setupGroup.className = 'control-group';
           const setupLabel = document.createElement('label'); setupLabel.setAttribute('for', 'setup-' + uid); setupLabel.textContent = 'Setup';
@@ -635,7 +710,6 @@
             if (name === currentSetup) opt.selected = true;
             setupSelect.appendChild(opt);
           });
-          // Add Average option
           if (setupNames.length >= 2) {
             const avgOpt = document.createElement('option'); avgOpt.value = AVG_SETUP_KEY; avgOpt.textContent = AVG_SETUP_KEY;
             setupSelect.appendChild(avgOpt);
@@ -645,7 +719,6 @@
           controls.appendChild(setupGroup);
         }
-        // View toggle
         const viewGroup = document.createElement('div'); viewGroup.className = 'control-group';
         const viewLabel = document.createElement('label'); viewLabel.setAttribute('for', 'view-' + uid); viewLabel.textContent = 'View';
         const viewSelect = document.createElement('select'); viewSelect.id = 'view-' + uid;
@@ -658,7 +731,6 @@
         viewGroup.appendChild(viewLabel); viewGroup.appendChild(viewSelect);
         controls.appendChild(viewGroup);
-        // Metric select (populated after data load)
         const metricGroup = document.createElement('div'); metricGroup.className = 'control-group';
         const metricLabel = document.createElement('label'); metricLabel.setAttribute('for', 'metric-' + uid); metricLabel.textContent = 'Metric';
         const metricSelect = document.createElement('select'); metricSelect.id = 'metric-' + uid;
@@ -667,7 +739,6 @@
         container.appendChild(controls);
-        // Legend
         const legend = document.createElement('div'); legend.className = 'legend';
         legend.innerHTML = '<div class="legend-title">Legend</div><div class="items"></div>';
         container.appendChild(legend);
@@ -693,7 +764,6 @@
         const items = container.querySelector('.legend .items');
         if (!items) return;
         items.innerHTML = '';
-        // Sort by final score (max step) on current default metric, descending
         const grouped = d3.group(allData, d => d[RUN_COL]);
         const sorted = Array.from(grouped.entries())
           .map(([raw, rows]) => {
@@ -703,13 +773,17 @@
           })
           .sort((a, b) => b.score - a.score)
           .map(d => d.raw);
-        sorted.forEach(raw => {
           const name = displayName(raw);
           const el = document.createElement('span'); el.className = 'item'; el.setAttribute('data-name', name);
           const sw = document.createElement('span'); sw.className = 'swatch';
-          const swColor = colorMap[raw] || '#999';
-          sw.style.background = swColor;
-          if (isBaseline(raw)) sw.style.backgroundImage = 'repeating-linear-gradient(45deg, transparent, transparent 2px, rgba(255,255,255,0.4) 2px, rgba(255,255,255,0.4) 4px)';
           const txt = document.createElement('span'); txt.textContent = name;
           el.appendChild(sw); el.appendChild(txt); items.appendChild(el);
           el.addEventListener('mouseenter', () => { highlight = name; updateHighlight(); });
@@ -745,17 +819,14 @@
           const text = await fetchFirstAvailable(csvPaths);
           const parsed = d3.csvParse(text);
           parsedData = parsed;
-          // Compute average data for setup mode
           if (SETUPS && setupNames.length >= 2) {
             const avg = computeAverageData(parsed);
-            avgDatasetNames = avg.datasetNames;
             parsedData = parsed.concat(avg.data);
             parsedData.columns = parsed.columns;
           }
-          // Filter to only datasets with configured display names
           filterData();
           metricKeys = detectMetrics(allData.columns);
-          // Ensure defaultMetric is valid; fall back to first available
           if (!metricKeys.includes(currentMetric)) currentMetric = metricKeys[0];
           populateMetricSelect();
           render();

   Configuration via data-config attribute:
   {
+    "datasets": {                                                           // required (unless using setups)
+      "raw_name": "Display Name",                                           //   shorthand: string = display name
+      "raw_name": { "display": "Name", "color": "#hex", "shaded": true, "baseline": true }
+                                                                            //   full form: display is required, rest optional
+    },
+    "setups":         { "Setup Label": { "datasets": {...} }, ... },        // optional, multi-setup mode with a setup dropdown; an "Average (all setups)" entry is added when 2+ setups exist
+    "defaultMetric":  "agg_score_macro",                                    // optional, default: "agg_score_macro"
+    "defaultView":    "bar",                                                // optional, "bar" | "line", default: "bar"
+    "tokensPerStep":  2100000,                                              // optional, default: 2.1e6
+    "runColumn":      "runname",                                            // optional, CSV column for series, default: "runname"
+    "stepColumn":     "steps"                                               // optional, CSV column for x-axis, default: "steps"
   }
+  Per-dataset options (all optional except display):
+    display:   Display name shown in legend, axes, and tooltips
+    color:     Pinned hex color (otherwise auto-assigned from palette)
+    shaded:    If true, bar gets a diagonal-stripe pattern (useful for aggregate baselines)
+    baseline:  If true, rendered as a dashed reference line (vertical in bar view; horizontal in line view,
+               drawn at the run's final-step value) instead of a regular bar/line. Not shown in the legend.
   Data: uses benchmark-results.csv by default (one CSV with all runs).
+  Only rows whose run-column value matches a key in datasets are displayed; if no datasets are configured, all rows are shown.
   Example usage in MDX:
   <HtmlEmbed
     src="d3-benchmark-comparison.html"
     title="Baseline Comparison"
     config={{
+      datasets: {
         cosmopedia: "Cosmopedia",
+        dclm: { display: "Baseline (DCLM)", baseline: true },
+        nemotron_hq_synth: { display: "Nemotron-HQ-Synth", color: "#76b900", shaded: true }
       }
     }}
   />
   .d3-benchmark-comparison .bar.ghost { opacity: .25; }
   .d3-benchmark-comparison .value-label.ghost { opacity: .25; }
   .d3-benchmark-comparison .line-path { fill: none; stroke-width: 2; opacity: 0.85; }
   .d3-benchmark-comparison .line-path.ghost { opacity: .15; }
   .d3-benchmark-comparison .line-dot.ghost { opacity: .15; }
+  .d3-benchmark-comparison .baseline.ghost { opacity: .1; }
   .d3-benchmark-comparison .axes path { display: none; }
   .d3-benchmark-comparison .axes line { stroke: var(--axis-color); }
   .d3-benchmark-comparison .axes text { fill: var(--tick-color); }
         if (raw && raw.trim()) cfg = raw.trim().startsWith('{') ? JSON.parse(raw) : {};
       } catch (_) {}
+      // ─── NORMALIZE DATASETS CONFIG ───
+      // Accepts: { "key": "Name" } (shorthand) or { "key": { display, color, shaded, baseline } } (full form); a missing/null config is treated as {}.
+      // Returns: a new object { key: options } where options is { display } for the shorthand, or a shallow copy of the given object for the full form.
+      function normalizeDatasets(raw) {
+        const out = {};
+        for (const [k, v] of Object.entries(raw || {})) {
+          out[k] = typeof v === 'string' ? { display: v } : { ...v }; // NOTE(review): a full-form entry without `display` is copied as-is, so its display stays undefined
+        }
+        return out;
+      }
       // ─── SETUP SUPPORT ───
       const SETUPS = cfg.setups || null; // optional multi-setup config; null when not provided
       const setupNames = SETUPS ? Object.keys(SETUPS) : [];
       let currentSetup = SETUPS ? setupNames[0] : null; // starts on the first configured setup
+      let DATASETS = SETUPS ? normalizeDatasets(SETUPS[setupNames[0]].datasets) : normalizeDatasets(cfg.datasets); // active raw-name → options map; NOTE(review): an empty `setups` object leaves setupNames[0] undefined and this lookup throws
       const AVG_SETUP_KEY = 'Average (all setups)'; // value and label of the synthetic "average" setup option
+      let avgDatasets = {}; // dataset options for the averaged runs; assigned from computeAverageData() after the CSV loads
       let parsedData = []; // parsed CSV rows (plus averaged rows once they are computed)
       const RUN_COL       = cfg.runColumn    || 'runname'; // CSV column that identifies a run/series
       const TOKENS_PER_STEP = cfg.tokensPerStep || 2.1e6; // multiplier used by stepsToTokens()
       const defaultMetric = cfg.defaultMetric || 'agg_score_macro';
       const defaultView   = cfg.defaultView   || 'bar';
       const uid = Math.random().toString(36).slice(2, 8); // short random suffix used when building element and pattern ids
+      // ─── DATASET ACCESSORS (all read the currently active DATASETS map, keyed by raw run name) ───
+      function displayName(raw) { return DATASETS[raw] ? DATASETS[raw].display : raw; } // falls back to the raw name for runs not in DATASETS
+      function isBaseline(raw) { return !!(DATASETS[raw] && DATASETS[raw].baseline); } // always a strict boolean
+      function isShaded(raw) { return !!(DATASETS[raw] && DATASETS[raw].shaded); } // always a strict boolean
+      function pinnedColor(raw) { return DATASETS[raw] && DATASETS[raw].color; } // configured color, or a falsy value when none is pinned
       // Builds the id of the SVG stripe pattern for a dataset: "stripe-<uid>-<raw>", where every
       // character of `raw` outside [a-zA-Z0-9] is replaced by "_".
       function stripePatternId(raw) {
         const safeRaw = raw.replace(/[^a-zA-Z0-9]/g, '_');
         return `stripe-${uid}-${safeRaw}`;
       }
       const METRIC_NAMES = {
         'agg_score_macro': 'Aggregate Score (Macro)',
         'agg_score_micro': 'Aggregate Score (Micro)',
       // State
       let allData = []; // rows currently displayed; rebuilt from parsedData by filterData()
+      let metricKeys = []; // metric columns available for selection; set from detectMetrics() after the CSV loads
       let currentMetric = defaultMetric;
       let currentView = defaultView;
       let colorMap = {}; // raw run name → color; filled lazily by initColors()
       let highlight = null; // display name of the hovered legend item, or null when nothing is highlighted
       // ─── HELPERS ───
       // Human-readable label for a metric column; falls back to the raw key when no label is registered.
       function metricName(key) {
         const label = METRIC_NAMES[key];
         return label ? label : key;
       }
       // Converts a training step count into a token count using the configured tokens-per-step.
       function stepsToTokens(step) {
         const tokens = step * TOKENS_PER_STEP;
         return tokens;
       }
       // Fills colorMap (raw run name → color) for every run present in allData.
       // No-op while colorMap already has entries; resetting colorMap to {} makes the next call rebuild it.
       function initColors() {
         if (Object.keys(colorMap).length) return;
         const allRaw = Array.from(d3.group(allData, d => d[RUN_COL]).keys()).sort(); // sorted raw names, so palette slots do not depend on CSV row order
         const unpinned = [];
         allRaw.forEach(raw => {
+          const pc = pinnedColor(raw);
+          if (pc) { colorMap[raw] = pc; } // a color pinned in the dataset config wins
           else { unpinned.push(raw); }
         });
         const palette = getCategoricalColors(unpinned.length); // one palette color requested per unpinned run
         unpinned.forEach((raw, i) => { colorMap[raw] = palette[i % palette.length]; }); // wraps around if the palette is shorter than requested
       }
       // ─── SETUP HELPERS ───
       // Rebuilds allData from parsedData, keeping only rows whose run name is a key of the active DATASETS.
       function filterData() {
+        const knownNames = Object.keys(DATASETS);
         allData = knownNames.length ? parsedData.filter(r => knownNames.includes(r[RUN_COL])) : parsedData; // with no configured datasets, allData is parsedData itself (unfiltered)
         allData.columns = parsedData.columns; // Array.prototype.filter returns a plain array, so carry the column list over explicitly
       }
       function computeAverageData(rawData) {
+        if (!SETUPS || setupNames.length < 2) return { data: [], datasets: {} };
         const displayToRaws = {};
         for (const sName of setupNames) {
+          const ds = normalizeDatasets(SETUPS[sName].datasets);
+          for (const [raw, opts] of Object.entries(ds)) {
+            if (!displayToRaws[opts.display]) displayToRaws[opts.display] = [];
+            displayToRaws[opts.display].push(raw);
           }
         }
         const fullDisplay = Object.entries(displayToRaws)
           .filter(([, raws]) => raws.length >= setupNames.length);
         const byRunStep = {};
         for (const row of rawData) byRunStep[row[RUN_COL] + '|' + row[STEP_COL]] = row;
         const steps = Array.from(new Set(rawData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
         const cols = rawData.columns || Object.keys(rawData[0] || {});
         const result = [];
+        const dsMap = {};
         for (const [display, raws] of fullDisplay) {
           const avgRaw = '__avg__' + display.replace(/[^a-zA-Z0-9]/g, '_');
+          // Merge options from first setup that has this display name
+          const firstOpts = Object.values(normalizeDatasets(SETUPS[setupNames[0]].datasets)).find(o => o.display === display) || {};
+          dsMap[avgRaw] = { display, ...firstOpts };
           for (const step of steps) {
             const rows = raws.map(r => byRunStep[r + '|' + step]).filter(Boolean);
             if (!rows.length) continue;
             result.push(avgRow);
           }
         }
+        return { data: result, datasets: dsMap };
       }
       function switchSetup(name) {
         currentSetup = name;
         if (name === AVG_SETUP_KEY) {
+          DATASETS = { ...avgDatasets };
         } else {
+          DATASETS = normalizeDatasets(SETUPS[name].datasets);
         }
+        // Re-add baselines from any setup
+        for (const sName of setupNames) {
+          const ds = normalizeDatasets(SETUPS[sName].datasets);
+          for (const [raw, opts] of Object.entries(ds)) {
+            if (opts.baseline && !DATASETS[raw] && parsedData.some(r => r[RUN_COL] === raw)) {
+              DATASETS[raw] = { ...opts };
             }
           }
         }
         colorMap = {};
         gRoot.selectAll('text.value-label').classed('ghost', d => highlight && d.name !== highlight);
         gRoot.selectAll('.line-path').classed('ghost', d => highlight && d.name !== highlight);
         gRoot.selectAll('.line-dot').classed('ghost', d => highlight && d.name !== highlight);
+        gRoot.selectAll('.baseline-vline').classed('ghost', d => highlight && d.name !== highlight);
+        gRoot.selectAll('.baseline-vlabel').classed('ghost', d => highlight && d.name !== highlight);
+        gRoot.selectAll('.baseline-hline').classed('ghost', d => highlight && d.name !== highlight);
+        gRoot.selectAll('.baseline-hlabel').classed('ghost', d => highlight && d.name !== highlight);
         container.querySelectorAll('.legend .item').forEach(el => {
           el.classList.toggle('ghost', highlight && el.getAttribute('data-name') !== highlight);
         });
       // ─── AUTO-DETECT METRICS from CSV columns ───
       function detectMetrics(columns) {
         const skip = new Set([RUN_COL, STEP_COL, 'seed']);
         const aggOrder = ['agg_score_macro', 'agg_score_micro', 'agg_score_RC', 'agg_score_GK', 'agg_score_NLU', 'agg_score_MATH', 'agg_score_TABLE', 'agg_score_RES'];
         const agg = aggOrder.filter(k => columns.includes(k));
         const ind = columns.filter(k => !skip.has(k) && !agg.includes(k) && !isNaN(+allData[0][k]));
       // ─── BAR CHART ───
       function renderBar() {
         const width = container.clientWidth || 800;
+        const hasBaselines = allData.some(r => isBaseline(r[RUN_COL]));
+        const margin = { top: hasBaselines ? 20 : 12, right: 56, bottom: 32, left: 190 };
         const grouped = d3.group(allData, d => d[RUN_COL]);
         const finalData = [];
         }
         finalData.sort((a, b) => b.value - a.value);
+        const barData = finalData.filter(d => !isBaseline(d.rawName));
+        const baselineData = finalData.filter(d => isBaseline(d.rawName));
         const barHeight = 28, barGap = 8;
+        const height = margin.top + margin.bottom + barData.length * (barHeight + barGap);
         svg.attr('width', width).attr('height', height);
         gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
         const innerHeight = height - margin.top - margin.bottom;
         const x = d3.scaleLinear().domain([0, d3.max(finalData, d => d.value) * 1.05]).range([0, innerWidth]);
+        const y = d3.scaleBand().domain(barData.map(d => d.name)).range([0, innerHeight]).padding(0.2);
         // Grid
         gRoot.selectAll('.grid').data([0]).join('g').attr('class', 'grid').call(g => {
             g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
           });
+        // Stripe patterns for shaded bars
+        barData.forEach(d => {
+          if (!isShaded(d.rawName)) return;
           const c = colorMap[d.rawName] || '#999';
           const pat = defs.append('pattern').attr('id', stripePatternId(d.rawName))
             .attr('width', 6).attr('height', 6).attr('patternUnits', 'userSpaceOnUse').attr('patternTransform', 'rotate(45)');
           pat.append('line').attr('x1', 0).attr('y1', 0).attr('x2', 0).attr('y2', 6).attr('stroke', c).attr('stroke-width', 2.5);
         });
+        function barFill(d) {
+          if (isShaded(d.rawName)) return `url(#${stripePatternId(d.rawName)})`;
+          return colorMap[d.rawName] || 'var(--primary-color)';
+        }
+        // Bars
         const barTip = (ev, d) => {
           const [mx, my] = d3.pointer(ev, container);
+          showTip(`<strong>${d.name}</strong><br/>${metricName(currentMetric)}: <strong>${d.value.toFixed(3)}</strong>`, mx, my);
         };
+        gRoot.selectAll('rect.bar').data(barData, d => d.name).join(
           enter => enter.append('rect').attr('class', 'bar')
             .attr('x', 0).attr('y', d => y(d.name)).attr('height', y.bandwidth()).attr('rx', 3)
             .attr('fill', d => barFill(d))
         );
         // Value labels
+        gRoot.selectAll('text.value-label').data(barData, d => d.name).join(
           enter => enter.append('text').attr('class', 'value-label')
             .attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
             .attr('dy', '0.35em').attr('fill', 'var(--text-color)').attr('font-size', 11)
+            .text(d => d.value.toFixed(3)),
           update => update.transition().duration(300)
             .attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
+            .text(d => d.value.toFixed(3)),
+          exit => exit.remove()
+        );
+        // Baseline vertical reference lines
+        gRoot.selectAll('.baseline-vline').data(baselineData, d => d.name).join(
+          enter => enter.append('line').attr('class', 'baseline-vline baseline')
+            .attr('x1', d => x(d.value)).attr('x2', d => x(d.value))
+            .attr('y1', 0).attr('y2', innerHeight)
+            .attr('stroke', d => colorMap[d.rawName] || '#999')
+            .attr('stroke-width', 2).attr('stroke-dasharray', '6,4').attr('opacity', 0.7),
+          update => update.transition().duration(300)
+            .attr('x1', d => x(d.value)).attr('x2', d => x(d.value))
+            .attr('y1', 0).attr('y2', innerHeight)
+            .attr('stroke', d => colorMap[d.rawName] || '#999'),
+          exit => exit.remove()
+        );
+        gRoot.selectAll('.baseline-vlabel').data(baselineData, d => d.name).join(
+          enter => enter.append('text').attr('class', 'baseline-vlabel baseline')
+            .attr('x', d => x(d.value)).attr('y', -4)
+            .attr('text-anchor', 'middle').attr('fill', d => colorMap[d.rawName] || '#999')
+            .attr('font-size', 11).attr('font-weight', 600)
+            .text(d => `${d.name} (${d.value.toFixed(3)})`),
+          update => update.transition().duration(300)
+            .attr('x', d => x(d.value))
+            .text(d => `${d.name} (${d.value.toFixed(3)})`),
           exit => exit.remove()
         );
       }
       // ─── LINE CHART ───
       function renderLine() {
         const width = container.clientWidth || 800;
+        const hasBaselines = allData.some(r => isBaseline(r[RUN_COL]));
         const margin = { top: 16, right: 50, bottom: 48, left: 60 };
         const height = Math.max(300, Math.round(width / 2.5));
         svg.attr('width', width).attr('height', height);
         // Build series
         const grouped = d3.group(allData, d => d[RUN_COL]);
         const series = [];
+        const baselineSeries = [];
         for (const [raw, rows] of grouped) {
           const pts = rows.map(r => ({ step: +r[STEP_COL], value: +r[currentMetric] })).sort((a, b) => a.step - b.step);
+          const entry = { name: displayName(raw), rawName: raw, values: pts };
+          if (isBaseline(raw)) {
+            entry.finalValue = pts[pts.length - 1].value;
+            baselineSeries.push(entry);
+          } else {
+            series.push(entry);
+          }
         }
+        const allSteps = Array.from(new Set(allData.filter(r => !isBaseline(r[RUN_COL])).map(r => +r[STEP_COL]))).sort((a, b) => a - b);
+        const allValues = [...series, ...baselineSeries].flatMap(s => s.finalValue != null ? [s.finalValue] : s.values.map(v => v.value));
         const x = d3.scaleLinear().domain(d3.extent(allSteps)).range([0, innerWidth]);
         const yMin = d3.min(allValues), yMax = d3.max(allValues), yPad = (yMax - yMin) * 0.08;
           .attr('text-anchor', 'middle').attr('fill', 'var(--text-color)').attr('font-size', 12)
           .text(metricName(currentMetric));
+        // Baseline horizontal reference lines
+        gRoot.selectAll('.baseline-hline').data(baselineSeries, d => d.name).join(
+          enter => enter.append('line').attr('class', 'baseline-hline baseline')
+            .attr('x1', 0).attr('x2', innerWidth)
+            .attr('y1', d => y(d.finalValue)).attr('y2', d => y(d.finalValue))
+            .attr('stroke', d => colorMap[d.rawName] || '#999')
+            .attr('stroke-width', 2).attr('stroke-dasharray', '6,4').attr('opacity', 0.7),
+          update => update.transition().duration(300)
+            .attr('x1', 0).attr('x2', innerWidth)
+            .attr('y1', d => y(d.finalValue)).attr('y2', d => y(d.finalValue))
+            .attr('stroke', d => colorMap[d.rawName] || '#999'),
+          exit => exit.remove()
+        );
+        gRoot.selectAll('.baseline-hlabel').data(baselineSeries, d => d.name).join(
+          enter => enter.append('text').attr('class', 'baseline-hlabel baseline')
+            .attr('x', 4).attr('y', d => y(d.finalValue) - 6)
+            .attr('text-anchor', 'start')
+            .attr('fill', d => colorMap[d.rawName] || '#999')
+            .attr('font-size', 10).attr('font-weight', 600)
+            .text(d => `${d.name} (${d.finalValue.toFixed(3)})`),
+          update => update.transition().duration(300)
+            .attr('x', 4).attr('y', d => y(d.finalValue) - 6)
+            .text(d => `${d.name} (${d.finalValue.toFixed(3)})`),
+          exit => exit.remove()
+        );
+        // Lines (non-baseline)
         const line = d3.line().x(d => x(d.step)).y(d => y(d.value)).curve(d3.curveMonotoneX);
         gRoot.selectAll('.line-path').data(series, d => d.name).join(
+          enter => enter.append('path').attr('class', 'line-path')
             .attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
             .attr('d', d => line(d.values)),
+          update => update.transition().duration(300)
             .attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
             .attr('d', d => line(d.values)),
           exit => exit.remove()
         );
+        // Dots (non-baseline)
         const dotData = series.flatMap(s => s.values.map(v => ({ name: s.name, rawName: s.rawName, step: v.step, value: v.value })));
         gRoot.selectAll('.line-dot').data(dotData, d => d.name + '-' + d.step).join(
+          enter => enter.append('circle').attr('class', 'line-dot')
             .attr('cx', d => x(d.step)).attr('cy', d => y(d.value)).attr('r', 3)
             .attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)')
             .attr('stroke', 'var(--surface-bg)').attr('stroke-width', 1),
+          update => update.transition().duration(300)
             .attr('cx', d => x(d.step)).attr('cy', d => y(d.value))
             .attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)'),
           exit => exit.remove()
             const entries = series.map(s => {
               const pt = s.values.find(v => v.step === nearest);
               return pt ? { name: s.name, rawName: s.rawName, value: pt.value } : null;
+            }).filter(Boolean);
+            baselineSeries.forEach(s => {
+              entries.push({ name: s.name, rawName: s.rawName, value: s.finalValue });
+            });
+            entries.sort((a, b) => b.value - a.value);
             let html = `<div style="font-weight:700;margin-bottom:4px;">${stepLabelLong(nearest)}</div>`;
             entries.forEach(e => {
+              html += `<div><span class="tip-dot" style="background:${colorMap[e.rawName]}"></span>${e.name}: <strong>${e.value.toFixed(3)}</strong></div>`;
             });
             const [cx, cy] = d3.pointer(ev, container);
             showTip(html, cx, cy);
       function buildUI() {
         const controls = document.createElement('div'); controls.className = 'controls';
         if (SETUPS && setupNames.length > 0) {
           const setupGroup = document.createElement('div'); setupGroup.className = 'control-group';
           const setupLabel = document.createElement('label'); setupLabel.setAttribute('for', 'setup-' + uid); setupLabel.textContent = 'Setup';
             if (name === currentSetup) opt.selected = true;
             setupSelect.appendChild(opt);
           });
           if (setupNames.length >= 2) {
             const avgOpt = document.createElement('option'); avgOpt.value = AVG_SETUP_KEY; avgOpt.textContent = AVG_SETUP_KEY;
             setupSelect.appendChild(avgOpt);
           controls.appendChild(setupGroup);
         }
         const viewGroup = document.createElement('div'); viewGroup.className = 'control-group';
         const viewLabel = document.createElement('label'); viewLabel.setAttribute('for', 'view-' + uid); viewLabel.textContent = 'View';
         const viewSelect = document.createElement('select'); viewSelect.id = 'view-' + uid;
         viewGroup.appendChild(viewLabel); viewGroup.appendChild(viewSelect);
         controls.appendChild(viewGroup);
         const metricGroup = document.createElement('div'); metricGroup.className = 'control-group';
         const metricLabel = document.createElement('label'); metricLabel.setAttribute('for', 'metric-' + uid); metricLabel.textContent = 'Metric';
         const metricSelect = document.createElement('select'); metricSelect.id = 'metric-' + uid;
         container.appendChild(controls);
         const legend = document.createElement('div'); legend.className = 'legend';
         legend.innerHTML = '<div class="legend-title">Legend</div><div class="items"></div>';
         container.appendChild(legend);
         const items = container.querySelector('.legend .items');
         if (!items) return;
         items.innerHTML = '';
         const grouped = d3.group(allData, d => d[RUN_COL]);
         const sorted = Array.from(grouped.entries())
           .map(([raw, rows]) => {
           })
           .sort((a, b) => b.score - a.score)
           .map(d => d.raw);
+        sorted.filter(raw => !isBaseline(raw)).forEach(raw => {
           const name = displayName(raw);
           const el = document.createElement('span'); el.className = 'item'; el.setAttribute('data-name', name);
           const sw = document.createElement('span'); sw.className = 'swatch';
+          const c = colorMap[raw] || '#999';
+          if (isShaded(raw)) {
+            sw.style.background = c;
+            sw.style.backgroundImage = 'repeating-linear-gradient(45deg, transparent, transparent 2px, rgba(255,255,255,0.4) 2px, rgba(255,255,255,0.4) 4px)';
+          } else {
+            sw.style.background = c;
+          }
           const txt = document.createElement('span'); txt.textContent = name;
           el.appendChild(sw); el.appendChild(txt); items.appendChild(el);
           el.addEventListener('mouseenter', () => { highlight = name; updateHighlight(); });
           const text = await fetchFirstAvailable(csvPaths);
           const parsed = d3.csvParse(text);
           parsedData = parsed;
           if (SETUPS && setupNames.length >= 2) {
             const avg = computeAverageData(parsed);
+            avgDatasets = avg.datasets;
             parsedData = parsed.concat(avg.data);
             parsedData.columns = parsed.columns;
           }
           filterData();
           metricKeys = detectMetrics(allData.columns);
           if (!metricKeys.includes(currentMetric)) currentMetric = metricKeys[0];
           populateMetricSelect();
           render();