Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Commit ·
fdfc515
1
Parent(s): 621688d
prettified experiment plots
Browse files
app/src/content/chapters/experiments.mdx
CHANGED
|
@@ -8,11 +8,10 @@ import FigRef from "../../components/FigRef.astro";
|
|
| 8 |
{/* TODO: read through entire blog post and make improvements */}
|
| 9 |
{/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
|
| 10 |
{/* TODO: Integrate decay experiment as another analysis for proxy */}
|
| 11 |
-
{/* TODO:
|
| 12 |
-
{/* TODO:
|
| 13 |
-
{/* TODO:
|
| 14 |
-
{/* TODO:
|
| 15 |
-
{/* TODO: share on a bunch of discords/slacks */}
|
| 16 |
|
| 17 |
{/*
|
| 18 |
Notes:
|
|
@@ -40,8 +39,7 @@ We train on eight datasets under identical conditions and compare their final ev
|
|
| 40 |
src="d3-benchmark-comparison.html"
|
| 41 |
desc="Comparison of baseline datasets across different evaluation metrics. Use the dropdown to switch metrics."
|
| 42 |
config={{
|
| 43 |
-
|
| 44 |
-
datasetNames: {
|
| 45 |
cosmopedia: "Cosmopedia",
|
| 46 |
dclm: "DCLM",
|
| 47 |
fw_edu_hq: "FineWeb-Edu-HQ",
|
|
@@ -67,33 +65,20 @@ The BeyondWeb dataset was never released and the paper omits key details, yet cl
|
|
| 67 |
<HtmlEmbed
|
| 68 |
id="dissecting-baselines"
|
| 69 |
src="d3-benchmark-comparison.html"
|
| 70 |
-
desc="Individual prompt performance from existing synthetic datasets compared to DCLM
|
| 71 |
config={{
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
"
|
| 75 |
-
|
| 76 |
-
"mix-fw_edu_hq-
|
| 77 |
-
"
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
"mix-fw_edu_hq-
|
| 81 |
-
"mix-fw_edu_hq-
|
| 82 |
-
"mix-fw_edu_hq-
|
| 83 |
-
"mix-fw_edu_hq-
|
| 84 |
-
"mix-fw_edu_hq-summarize_1b_hq": "Summarize"
|
| 85 |
-
},
|
| 86 |
-
pinnedColors: {
|
| 87 |
-
"Nemotron-HQ-Synth": "#76b900",
|
| 88 |
-
"Diverse QA Pairs": "#c5e384",
|
| 89 |
-
"Distill": "#a0c95c",
|
| 90 |
-
"Wikipedia Rephrasing": "#7fb034",
|
| 91 |
-
"Knowledge List": "#5e960e",
|
| 92 |
-
"Extract Knowledge": "#3d6b00",
|
| 93 |
-
"REWIRE": "#1877F2",
|
| 94 |
-
"Guided Rewrite": "#6aabff",
|
| 95 |
-
"Continue (BeyondWeb)": "#e8713a",
|
| 96 |
-
"Summarize (BeyondWeb)": "#c4451c"
|
| 97 |
}
|
| 98 |
}}
|
| 99 |
/>
|
|
@@ -102,14 +87,14 @@ Can we design prompts that consistently beat DCLM?
|
|
| 102 |
|
| 103 |
### Can New Prompts Beat DCLM?
|
| 104 |
|
| 105 |
-
Since most existing prompts fail to beat DCLM, we designed seven novel prompt formats targeting different skills ([math](#math), [table](#table), [faq](#faq), [tutorial](#tutorial), [article](#article), [commentary](#commentary), [discussion](#discussion)), all using Gemma-3-1B on FineWeb-Edu-HQ. Four prompts ([math](#math), [table](#table), [faq](#faq), [tutorial](#tutorial)) outperform
|
| 106 |
|
| 107 |
<HtmlEmbed
|
| 108 |
id="new-prompts"
|
| 109 |
src="d3-benchmark-comparison.html"
|
| 110 |
-
desc="Seven new prompts compared against DCLM
|
| 111 |
config={{
|
| 112 |
-
|
| 113 |
"mix-fw_edu_hq-math_1b_hq": "Math",
|
| 114 |
"mix-fw_edu_hq-table_1b_hq": "Table",
|
| 115 |
"mix-fw_edu_hq-faq_1b_hq": "FAQ",
|
|
@@ -117,8 +102,7 @@ Since most existing prompts fail to beat DCLM, we designed seven novel prompt fo
|
|
| 117 |
"mix-fw_edu_hq-article_1b_hq": "Article",
|
| 118 |
"mix-fw_edu_hq-commentary_1b_hq": "Commentary",
|
| 119 |
"mix-fw_edu_hq-discussion_1b_hq": "Discussion",
|
| 120 |
-
dclm: "DCLM",
|
| 121 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 122 |
}
|
| 123 |
}}
|
| 124 |
/>
|
|
@@ -149,45 +133,41 @@ It is possible that larger models produce richer or more nuanced rephrasings tha
|
|
| 149 |
config={{
|
| 150 |
setups: {
|
| 151 |
"Gemma-3: Tutorial": {
|
| 152 |
-
|
| 153 |
"mix-fw_edu_hq-tutorial_27b_hq": "Gemma-3 27B",
|
| 154 |
"mix-fw_edu_hq-tutorial_12b_hq": "Gemma-3 12B",
|
| 155 |
"mix-fw_edu_hq-tutorial_4b_hq": "Gemma-3 4B",
|
| 156 |
"mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3 1B",
|
| 157 |
"mix-fw_edu_hq-tutorial_270m_hq": "Gemma-3 270M",
|
| 158 |
-
dclm: "DCLM",
|
| 159 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 160 |
}
|
| 161 |
},
|
| 162 |
"Gemma-3: Math": {
|
| 163 |
-
|
| 164 |
"mix-fw_edu_hq-math_27b_hq": "Gemma-3 27B",
|
| 165 |
"mix-fw_edu_hq-math_12b_hq": "Gemma-3 12B",
|
| 166 |
"mix-fw_edu_hq-math_4b_hq": "Gemma-3 4B",
|
| 167 |
"mix-fw_edu_hq-math_1b_hq": "Gemma-3 1B",
|
| 168 |
"mix-fw_edu_hq-math_270m_hq": "Gemma-3 270M",
|
| 169 |
-
dclm: "DCLM",
|
| 170 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 171 |
}
|
| 172 |
},
|
| 173 |
"Gemma-3: REWIRE": {
|
| 174 |
-
|
| 175 |
"mix-fw_edu_hq-guided_rewrite_original_27b_hq": "Gemma-3 27B",
|
| 176 |
"mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Gemma-3 12B",
|
| 177 |
"mix-fw_edu_hq-guided_rewrite_original_4b_hq": "Gemma-3 4B",
|
| 178 |
"mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Gemma-3 1B",
|
| 179 |
"mix-fw_edu_hq-guided_rewrite_original_270m_hq": "Gemma-3 270M",
|
| 180 |
-
dclm: "DCLM",
|
| 181 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 182 |
}
|
| 183 |
},
|
| 184 |
"SmolLM2: Tutorial": {
|
| 185 |
-
|
| 186 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2 1.7B",
|
| 187 |
"mix-fw_edu_hq-tutorial_smollm2_360m_hq": "SmolLM2 360M",
|
| 188 |
"mix-fw_edu_hq-tutorial_smollm2_135m_hq": "SmolLM2 135M",
|
| 189 |
-
dclm: "DCLM",
|
| 190 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 191 |
}
|
| 192 |
}
|
| 193 |
}
|
|
@@ -207,43 +187,39 @@ The REWIRE [@rewire] paper claims that upcycling low-quality data requires large
|
|
| 207 |
config={{
|
| 208 |
setups: {
|
| 209 |
"Continue Prompt": {
|
| 210 |
-
|
| 211 |
"mix-fw_edu_hq-continue_12b_hq": "12B, HQ Source",
|
| 212 |
"mix-fw_edu_hq-continue_1b_hq": "1B, HQ Source",
|
| 213 |
"mix-fw_edu_hq-continue_1b_lq": "1B, LQ Source",
|
| 214 |
"mix-fw_edu_hq-continue_12b_lq": "12B, LQ Source",
|
| 215 |
-
dclm: "DCLM",
|
| 216 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 217 |
}
|
| 218 |
},
|
| 219 |
"Summarize Prompt": {
|
| 220 |
-
|
| 221 |
"mix-fw_edu_hq-summarize_1b_hq": "1B, HQ Source",
|
| 222 |
"mix-fw_edu_hq-summarize_12b_hq": "12B, HQ Source",
|
| 223 |
"mix-fw_edu_hq-summarize_1b_lq": "1B, LQ Source",
|
| 224 |
"mix-fw_edu_hq-summarize_12b_lq": "12B, LQ Source",
|
| 225 |
-
dclm: "DCLM",
|
| 226 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 227 |
}
|
| 228 |
},
|
| 229 |
"Tutorial Prompt": {
|
| 230 |
-
|
| 231 |
"mix-fw_edu_hq-tutorial_1b_hq": "1B, HQ Source",
|
| 232 |
"mix-fw_edu_hq-tutorial_12b_hq": "12B, HQ Source",
|
| 233 |
"mix-fw_edu_hq-tutorial_12b_lq": "12B, LQ Source",
|
| 234 |
"mix-fw_edu_hq-tutorial_1b_lq": "1B, LQ Source",
|
| 235 |
-
dclm: "DCLM",
|
| 236 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 237 |
}
|
| 238 |
},
|
| 239 |
"FAQ Prompt": {
|
| 240 |
-
|
| 241 |
"mix-fw_edu_hq-faq_1b_hq": "1B, HQ Source",
|
| 242 |
"mix-fw_edu_hq-faq_1b_lq": "1B, LQ Source",
|
| 243 |
"mix-fw_edu_hq-faq_12b_hq": "12B, HQ Source",
|
| 244 |
"mix-fw_edu_hq-faq_12b_lq": "12B, LQ Source",
|
| 245 |
-
dclm: "DCLM",
|
| 246 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 247 |
}
|
| 248 |
}
|
| 249 |
}
|
|
@@ -267,51 +243,47 @@ We hypothesize that SmolLM2's consistently strong rephrasing performance origina
|
|
| 267 |
config={{
|
| 268 |
setups: {
|
| 269 |
"Tutorial Prompt": {
|
| 270 |
-
|
| 271 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
|
| 272 |
"mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Falcon3",
|
| 273 |
"mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3",
|
| 274 |
"mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3",
|
| 275 |
"mix-fw_edu_hq-tutorial_granite3_1b_hq": "Granite3",
|
| 276 |
"mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
|
| 277 |
-
dclm: "DCLM",
|
| 278 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 279 |
}
|
| 280 |
},
|
| 281 |
"FAQ Prompt": {
|
| 282 |
-
|
| 283 |
"mix-fw_edu_hq-faq_smollm2_1.7b_hq": "SmolLM2",
|
| 284 |
"mix-fw_edu_hq-faq_llama3.2_1b_hq": "Llama-3.2",
|
| 285 |
"mix-fw_edu_hq-faq_falcon3_1b_hq": "Falcon3",
|
| 286 |
"mix-fw_edu_hq-faq_1b_hq": "Gemma-3",
|
| 287 |
"mix-fw_edu_hq-faq_granite3_1b_hq": "Granite3",
|
| 288 |
"mix-fw_edu_hq-faq_qwen3_1.7b_hq": "Qwen3",
|
| 289 |
-
dclm: "DCLM",
|
| 290 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 291 |
}
|
| 292 |
},
|
| 293 |
"Table Prompt": {
|
| 294 |
-
|
| 295 |
"mix-fw_edu_hq-table_smollm2_1.7b_hq": "SmolLM2",
|
| 296 |
"mix-fw_edu_hq-table_falcon3_1b_hq": "Falcon3",
|
| 297 |
"mix-fw_edu_hq-table_granite3_1b_hq": "Granite3",
|
| 298 |
"mix-fw_edu_hq-table_qwen3_1.7b_hq": "Qwen3",
|
| 299 |
"mix-fw_edu_hq-table_llama3.2_1b_hq": "Llama-3.2",
|
| 300 |
"mix-fw_edu_hq-table_1b_hq": "Gemma-3",
|
| 301 |
-
dclm: "DCLM",
|
| 302 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 303 |
}
|
| 304 |
},
|
| 305 |
"Math Prompt": {
|
| 306 |
-
|
| 307 |
"mix-fw_edu_hq-math_smollm2_1.7b_hq": "SmolLM2",
|
| 308 |
"mix-fw_edu_hq-math_falcon3_1b_hq": "Falcon3",
|
| 309 |
"mix-fw_edu_hq-math_granite3_1b_hq": "Granite3",
|
| 310 |
"mix-fw_edu_hq-math_1b_hq": "Gemma-3",
|
| 311 |
"mix-fw_edu_hq-math_llama3.2_1b_hq": "Llama-3.2",
|
| 312 |
"mix-fw_edu_hq-math_qwen3_1.7b_hq": "Qwen3",
|
| 313 |
-
dclm: "DCLM",
|
| 314 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 315 |
}
|
| 316 |
}
|
| 317 |
}
|
|
@@ -329,13 +301,12 @@ We compare Qwen models from versions 1.5 [@qwen], 2 [@qwen2], 2.5 [@qwen25], and
|
|
| 329 |
src="d3-benchmark-comparison.html"
|
| 330 |
desc="Qwen model generations (1.5 to 3) on the tutorial prompt."
|
| 331 |
config={{
|
| 332 |
-
|
| 333 |
"mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3 (1.7B)",
|
| 334 |
"mix-fw_edu_hq-tutorial_qwen2.5_1.5b_hq": "Qwen2.5 (1.5B)",
|
| 335 |
"mix-fw_edu_hq-tutorial_qwen2_1.5b_hq": "Qwen2 (1.5B)",
|
| 336 |
-
dclm: "DCLM",
|
| 337 |
-
"mix-fw_edu_hq-tutorial_qwen1.5_1.8b_hq": "Qwen1.5 (1.8B)"
|
| 338 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 339 |
}
|
| 340 |
}}
|
| 341 |
/>
|
|
@@ -355,7 +326,7 @@ So far we've always mixed synthetic data with a <Glossary term="source dataset"
|
|
| 355 |
|
| 356 |
#### Is synthetic data enough?
|
| 357 |
|
| 358 |
-
We compare synthetic-only training vs mixed training (synthetic + source) for [tutorial](#tutorial) and [faq](#faq) prompts on DCLM and FineWeb-Edu-HQ sources. Synthetic-only training
|
| 359 |
|
| 360 |
<HtmlEmbed
|
| 361 |
id="synthetic-only"
|
|
@@ -364,23 +335,21 @@ We compare synthetic-only training vs mixed training (synthetic + source) for [t
|
|
| 364 |
config={{
|
| 365 |
setups: {
|
| 366 |
"DCLM Source": {
|
| 367 |
-
|
| 368 |
"mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
|
| 369 |
-
dclm: "DCLM",
|
| 370 |
"mix-dclm-tutorial_1b_dclm": "Mix: Tutorial + DCLM",
|
| 371 |
faq_1b_dclm: "FAQ Only",
|
| 372 |
-
tutorial_1b_dclm: "Tutorial Only"
|
| 373 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 374 |
}
|
| 375 |
},
|
| 376 |
"FineWeb-Edu-HQ Source": {
|
| 377 |
-
|
| 378 |
"mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FineWeb-Edu-HQ",
|
| 379 |
"mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FineWeb-Edu-HQ",
|
| 380 |
-
dclm: "DCLM",
|
| 381 |
faq_1b_hq: "FAQ Only",
|
| 382 |
-
tutorial_1b_hq: "Tutorial Only"
|
| 383 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 384 |
}
|
| 385 |
}
|
| 386 |
}
|
|
@@ -391,7 +360,7 @@ So synthetic data alone does not seem to be enough. But how much does the specif
|
|
| 391 |
|
| 392 |
#### Does the mix-in dataset matter?
|
| 393 |
|
| 394 |
-
We apply the [tutorial](#tutorial) prompt using Gemma-3-1B on FineWeb-Edu-HQ, then mix in one of four datasets: DCLM, Cosmopedia, FineWeb-Edu-HQ, or FineWeb-Edu-LQ. Use the Setup dropdown to also see results with LQ source data. DCLM
|
| 395 |
|
| 396 |
<HtmlEmbed
|
| 397 |
id="mixin-dataset"
|
|
@@ -400,31 +369,30 @@ We apply the [tutorial](#tutorial) prompt using Gemma-3-1B on FineWeb-Edu-HQ, th
|
|
| 400 |
config={{
|
| 401 |
setups: {
|
| 402 |
"HQ Source": {
|
| 403 |
-
|
| 404 |
-
"mix-dclm-tutorial_1b_hq": "Mix-in: DCLM",
|
| 405 |
-
"mix-fw_edu_hq-tutorial_1b_hq": "Mix-in: FineWeb-Edu-HQ",
|
| 406 |
-
dclm: "DCLM",
|
| 407 |
-
"mix-fw_edu_lq-tutorial_1b_hq": "Mix-in: FineWeb-Edu-LQ",
|
| 408 |
-
"mix-cosmopedia-tutorial_1b_hq": "Mix-in: Cosmopedia",
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
fw_edu_lq: "FineWeb-Edu-LQ"
|
| 412 |
}
|
| 413 |
},
|
| 414 |
"LQ Source": {
|
| 415 |
-
|
| 416 |
-
dclm: "DCLM",
|
| 417 |
-
"mix-fw_edu_hq-tutorial_1b_lq": "Mix-in: FineWeb-Edu-HQ",
|
| 418 |
-
"mix-dclm-tutorial_1b_lq": "Mix-in: DCLM",
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
fw_edu_lq: "FineWeb-Edu-LQ"
|
| 424 |
}
|
| 425 |
}
|
| 426 |
-
}
|
| 427 |
-
baselines: ["dclm", "fw_edu_hq", "cosmopedia", "fw_edu_lq"]
|
| 428 |
}}
|
| 429 |
/>
|
| 430 |
|
|
@@ -441,23 +409,21 @@ We rephrase four datasets (DCLM, Cosmopedia, FineWeb-Edu-HQ, FineWeb-Edu-LQ) wit
|
|
| 441 |
config={{
|
| 442 |
setups: {
|
| 443 |
"Tutorial Prompt": {
|
| 444 |
-
|
| 445 |
"mix-fw_edu_hq-tutorial_1b_hq": "Source: FineWeb-Edu-HQ",
|
| 446 |
"mix-dclm-tutorial_1b_dclm": "Source: DCLM",
|
| 447 |
"mix-cosmopedia-tutorial_1b_cosmopedia": "Source: Cosmopedia",
|
| 448 |
"mix-fw_edu_lq-tutorial_1b_lq": "Source: FineWeb-Edu-LQ",
|
| 449 |
-
dclm: "DCLM",
|
| 450 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 451 |
}
|
| 452 |
},
|
| 453 |
"FAQ Prompt": {
|
| 454 |
-
|
| 455 |
"mix-dclm-faq_1b_dclm": "Source: DCLM",
|
| 456 |
"mix-fw_edu_hq-faq_1b_hq": "Source: FineWeb-Edu-HQ",
|
| 457 |
"mix-fw_edu_lq-faq_1b_lq": "Source: FineWeb-Edu-LQ",
|
| 458 |
"mix-cosmopedia-faq_1b_cosmopedia": "Source: Cosmopedia",
|
| 459 |
-
dclm: "DCLM",
|
| 460 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 461 |
}
|
| 462 |
}
|
| 463 |
}
|
|
@@ -471,23 +437,21 @@ We rephrase four datasets (DCLM, Cosmopedia, FineWeb-Edu-HQ, FineWeb-Edu-LQ) wit
|
|
| 471 |
config={{
|
| 472 |
setups: {
|
| 473 |
"Tutorial Prompt": {
|
| 474 |
-
|
| 475 |
"mix-fw_edu_hq-tutorial_1b_dclm": "Source: DCLM",
|
| 476 |
"mix-fw_edu_hq-tutorial_1b_hq": "Source: FineWeb-Edu-HQ",
|
| 477 |
"mix-fw_edu_hq-tutorial_1b_cosmopedia": "Source: Cosmopedia",
|
| 478 |
"mix-fw_edu_hq-tutorial_1b_lq": "Source: FineWeb-Edu-LQ",
|
| 479 |
-
dclm: "DCLM",
|
| 480 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 481 |
}
|
| 482 |
},
|
| 483 |
"FAQ Prompt": {
|
| 484 |
-
|
| 485 |
"mix-fw_edu_hq-faq_1b_dclm": "Source: DCLM",
|
| 486 |
"mix-fw_edu_hq-faq_1b_hq": "Source: FineWeb-Edu-HQ",
|
| 487 |
"mix-fw_edu_hq-faq_1b_lq": "Source: FineWeb-Edu-LQ",
|
| 488 |
"mix-fw_edu_hq-faq_1b_cosmopedia": "Source: Cosmopedia",
|
| 489 |
-
dclm: "DCLM",
|
| 490 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 491 |
}
|
| 492 |
}
|
| 493 |
}
|
|
@@ -511,38 +475,35 @@ Interestingly, when mixing enough different prompts together, we don't seem to n
|
|
| 511 |
config={{
|
| 512 |
setups: {
|
| 513 |
"Mixing Prompts": {
|
| 514 |
-
|
| 515 |
"mix-fw_edu_hq-tutorial_1b_hq-fw_edu_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts + FineWeb-Edu-HQ",
|
| 516 |
"mix-fw_edu_hq-math_1b_hq": "Math + FineWeb-Edu-HQ",
|
| 517 |
"mix-tutorial_1b_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts (No Source)",
|
| 518 |
"mix-fw_edu_hq-table_1b_hq": "Table + FineWeb-Edu-HQ",
|
| 519 |
"mix-fw_edu_hq-faq_1b_hq": "FAQ + FineWeb-Edu-HQ",
|
| 520 |
"mix-fw_edu_hq-tutorial_1b_hq": "Tutorial + FineWeb-Edu-HQ",
|
| 521 |
-
dclm: "DCLM",
|
| 522 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 523 |
}
|
| 524 |
},
|
| 525 |
"Mixing Models": {
|
| 526 |
-
|
| 527 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
|
| 528 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "SmolLM2 + Falcon3",
|
| 529 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_llama3.2_1b_hq": "SmolLM2 + Llama-3.2",
|
| 530 |
"mix-fw_edu_hq-tutorial_llama3.2_1b_hq-tutorial_granite3_1b_hq": "Llama-3.2 + Granite3",
|
| 531 |
"mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
|
| 532 |
-
dclm: "DCLM",
|
| 533 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 534 |
}
|
| 535 |
},
|
| 536 |
"Mixing Both": {
|
| 537 |
-
|
| 538 |
"mix-fw_edu_hq-faq_smollm2_1.7b_hq": "FAQ (SmolLM2)",
|
| 539 |
"mix-fw_edu_hq-faq_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "FAQ (SmolLM2) + Tutorial (Falcon3)",
|
| 540 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "Tutorial (SmolLM2)",
|
| 541 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "Tutorial (SmolLM2) + Tutorial (Falcon3)",
|
| 542 |
"mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Tutorial (Falcon3)",
|
| 543 |
"mix-fw_edu_hq-faq_falcon3_1b_hq": "FAQ (Falcon3)",
|
| 544 |
-
dclm: "DCLM",
|
| 545 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 546 |
}
|
| 547 |
}
|
| 548 |
}
|
|
@@ -568,13 +529,12 @@ We compare REWIRE's [original prompt](#guided_rewrite_original) (with typos) aga
|
|
| 568 |
src="d3-benchmark-comparison.html"
|
| 569 |
desc="REWIRE prompt with original typos vs improved version at 1B and 12B scale."
|
| 570 |
config={{
|
| 571 |
-
|
| 572 |
"mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Original (12B)",
|
| 573 |
"mix-fw_edu_hq-guided_rewrite_improved_12b_hq": "Improved (12B)",
|
| 574 |
-
dclm: "DCLM",
|
| 575 |
"mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Original (1B)",
|
| 576 |
-
"mix-fw_edu_hq-guided_rewrite_improved_1b_hq": "Improved (1B)"
|
| 577 |
-
fw_edu_hq: "FineWeb-Edu-HQ"
|
| 578 |
}
|
| 579 |
}}
|
| 580 |
/>
|
|
@@ -609,4 +569,3 @@ Here are the key takeaways from our experiments:
|
|
| 609 |
A: No. Typos have no negative effect on downstream performance.
|
| 610 |
|
| 611 |
The bottom line: the details of synthetic rephrasing matter a lot, and knowing which ones matter is the key to scaling it up. Prompt design is the single biggest lever, with structured formats like Math, Table, FAQ, and Tutorial consistently beating curated baselines. But equally important is knowing where you can cut corners without losing quality. You don't need a large rephrasing model (1B is enough for simple prompts, 4B for complex ones). You don't need pristine source data (even low-quality sources work with a strong mix-in). Smaller models generate faster, directly translating into higher throughput. And tolerating lower-quality sources opens up a much bigger and more diverse data pool to draw from. The practical recipe is straightforward: pick a strong structured prompt, use the smallest model that handles it, blend with high-quality original data, and spend your remaining compute on volume.
|
| 612 |
-
|
|
|
|
| 8 |
{/* TODO: read through entire blog post and make improvements */}
|
| 9 |
{/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
|
| 10 |
{/* TODO: Integrate decay experiment as another analysis for proxy */}
|
| 11 |
+
{/* TODO: share on a bunch of discords/slacks/hackernews/locallama */}
|
| 12 |
+
{/* TODO: brainstorm better banner, be artsy */}
|
| 13 |
+
{/* TODO: only explain datatrove additions when we need them (for generating the final finephrase) */}
|
| 14 |
+
{/* TODO: move infrastructure section after analyses as precursor and explanation for finephrase */}
|
|
|
|
| 15 |
|
| 16 |
{/*
|
| 17 |
Notes:
|
|
|
|
| 39 |
src="d3-benchmark-comparison.html"
|
| 40 |
desc="Comparison of baseline datasets across different evaluation metrics. Use the dropdown to switch metrics."
|
| 41 |
config={{
|
| 42 |
+
datasets: {
|
|
|
|
| 43 |
cosmopedia: "Cosmopedia",
|
| 44 |
dclm: "DCLM",
|
| 45 |
fw_edu_hq: "FineWeb-Edu-HQ",
|
|
|
|
| 65 |
<HtmlEmbed
|
| 66 |
id="dissecting-baselines"
|
| 67 |
src="d3-benchmark-comparison.html"
|
| 68 |
+
desc="Individual prompt performance from existing synthetic datasets compared to the DCLM baseline."
|
| 69 |
config={{
|
| 70 |
+
datasets: {
|
| 71 |
+
"mix-fw_edu_hq-diverse_qa_pairs_1b_hq": { display: "Diverse QA Pairs", color: "#c5e384" },
|
| 72 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
|
| 73 |
+
"mix-fw_edu_hq-extract_knowledge_1b_hq": { display: "Extract Knowledge", color: "#3d6b00" },
|
| 74 |
+
"mix-fw_edu_hq-guided_rewrite_original_1b_hq": { display: "Guided Rewrite", color: "#6aabff" },
|
| 75 |
+
nemotron_hq_synth: { display: "Nemotron-HQ-Synth", color: "#76b900", shaded: true },
|
| 76 |
+
rewire: { display: "REWIRE", color: "#1877F2", shaded: true },
|
| 77 |
+
"mix-fw_edu_hq-distill_1b_hq": { display: "Distill", color: "#a0c95c" },
|
| 78 |
+
"mix-fw_edu_hq-wikipedia_style_rephrasing_1b_hq": { display: "Wikipedia Rephrasing", color: "#7fb034" },
|
| 79 |
+
"mix-fw_edu_hq-knowledge_list_1b_hq": { display: "Knowledge List", color: "#5e960e" },
|
| 80 |
+
"mix-fw_edu_hq-continue_1b_hq": { display: "Continue", color: "#e8713a" },
|
| 81 |
+
"mix-fw_edu_hq-summarize_1b_hq": { display: "Summarize", color: "#c4451c" }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
}
|
| 83 |
}}
|
| 84 |
/>
|
|
|
|
| 87 |
|
| 88 |
### Can New Prompts Beat DCLM?
|
| 89 |
|
| 90 |
+
Since most existing prompts fail to beat DCLM, we designed seven novel prompt formats targeting different skills ([math](#math), [table](#table), [faq](#faq), [tutorial](#tutorial), [article](#article), [commentary](#commentary), [discussion](#discussion)), all using Gemma-3-1B on FineWeb-Edu-HQ. Four prompts ([math](#math), [table](#table), [faq](#faq), [tutorial](#tutorial)) outperform DCLM, while [article](#article), [commentary](#commentary), and [discussion](#discussion) are at or below DCLM level (see <FigRef target="new-prompts" />). The best-performing prompts all restructure the source content into pedagogically rich formats.
|
| 91 |
|
| 92 |
<HtmlEmbed
|
| 93 |
id="new-prompts"
|
| 94 |
src="d3-benchmark-comparison.html"
|
| 95 |
+
desc="Seven new prompts compared against the DCLM baseline."
|
| 96 |
config={{
|
| 97 |
+
datasets: {
|
| 98 |
"mix-fw_edu_hq-math_1b_hq": "Math",
|
| 99 |
"mix-fw_edu_hq-table_1b_hq": "Table",
|
| 100 |
"mix-fw_edu_hq-faq_1b_hq": "FAQ",
|
|
|
|
| 102 |
"mix-fw_edu_hq-article_1b_hq": "Article",
|
| 103 |
"mix-fw_edu_hq-commentary_1b_hq": "Commentary",
|
| 104 |
"mix-fw_edu_hq-discussion_1b_hq": "Discussion",
|
| 105 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 106 |
}
|
| 107 |
}}
|
| 108 |
/>
|
|
|
|
| 133 |
config={{
|
| 134 |
setups: {
|
| 135 |
"Gemma-3: Tutorial": {
|
| 136 |
+
datasets: {
|
| 137 |
"mix-fw_edu_hq-tutorial_27b_hq": "Gemma-3 27B",
|
| 138 |
"mix-fw_edu_hq-tutorial_12b_hq": "Gemma-3 12B",
|
| 139 |
"mix-fw_edu_hq-tutorial_4b_hq": "Gemma-3 4B",
|
| 140 |
"mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3 1B",
|
| 141 |
"mix-fw_edu_hq-tutorial_270m_hq": "Gemma-3 270M",
|
| 142 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 143 |
}
|
| 144 |
},
|
| 145 |
"Gemma-3: Math": {
|
| 146 |
+
datasets: {
|
| 147 |
"mix-fw_edu_hq-math_27b_hq": "Gemma-3 27B",
|
| 148 |
"mix-fw_edu_hq-math_12b_hq": "Gemma-3 12B",
|
| 149 |
"mix-fw_edu_hq-math_4b_hq": "Gemma-3 4B",
|
| 150 |
"mix-fw_edu_hq-math_1b_hq": "Gemma-3 1B",
|
| 151 |
"mix-fw_edu_hq-math_270m_hq": "Gemma-3 270M",
|
| 152 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 153 |
}
|
| 154 |
},
|
| 155 |
"Gemma-3: REWIRE": {
|
| 156 |
+
datasets: {
|
| 157 |
"mix-fw_edu_hq-guided_rewrite_original_27b_hq": "Gemma-3 27B",
|
| 158 |
"mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Gemma-3 12B",
|
| 159 |
"mix-fw_edu_hq-guided_rewrite_original_4b_hq": "Gemma-3 4B",
|
| 160 |
"mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Gemma-3 1B",
|
| 161 |
"mix-fw_edu_hq-guided_rewrite_original_270m_hq": "Gemma-3 270M",
|
| 162 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 163 |
}
|
| 164 |
},
|
| 165 |
"SmolLM2: Tutorial": {
|
| 166 |
+
datasets: {
|
| 167 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2 1.7B",
|
| 168 |
"mix-fw_edu_hq-tutorial_smollm2_360m_hq": "SmolLM2 360M",
|
| 169 |
"mix-fw_edu_hq-tutorial_smollm2_135m_hq": "SmolLM2 135M",
|
| 170 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 171 |
}
|
| 172 |
}
|
| 173 |
}
|
|
|
|
| 187 |
config={{
|
| 188 |
setups: {
|
| 189 |
"Continue Prompt": {
|
| 190 |
+
datasets: {
|
| 191 |
"mix-fw_edu_hq-continue_12b_hq": "12B, HQ Source",
|
| 192 |
"mix-fw_edu_hq-continue_1b_hq": "1B, HQ Source",
|
| 193 |
"mix-fw_edu_hq-continue_1b_lq": "1B, LQ Source",
|
| 194 |
"mix-fw_edu_hq-continue_12b_lq": "12B, LQ Source",
|
| 195 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 196 |
}
|
| 197 |
},
|
| 198 |
"Summarize Prompt": {
|
| 199 |
+
datasets: {
|
| 200 |
"mix-fw_edu_hq-summarize_1b_hq": "1B, HQ Source",
|
| 201 |
"mix-fw_edu_hq-summarize_12b_hq": "12B, HQ Source",
|
| 202 |
"mix-fw_edu_hq-summarize_1b_lq": "1B, LQ Source",
|
| 203 |
"mix-fw_edu_hq-summarize_12b_lq": "12B, LQ Source",
|
| 204 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 205 |
}
|
| 206 |
},
|
| 207 |
"Tutorial Prompt": {
|
| 208 |
+
datasets: {
|
| 209 |
"mix-fw_edu_hq-tutorial_1b_hq": "1B, HQ Source",
|
| 210 |
"mix-fw_edu_hq-tutorial_12b_hq": "12B, HQ Source",
|
| 211 |
"mix-fw_edu_hq-tutorial_12b_lq": "12B, LQ Source",
|
| 212 |
"mix-fw_edu_hq-tutorial_1b_lq": "1B, LQ Source",
|
| 213 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 214 |
}
|
| 215 |
},
|
| 216 |
"FAQ Prompt": {
|
| 217 |
+
datasets: {
|
| 218 |
"mix-fw_edu_hq-faq_1b_hq": "1B, HQ Source",
|
| 219 |
"mix-fw_edu_hq-faq_1b_lq": "1B, LQ Source",
|
| 220 |
"mix-fw_edu_hq-faq_12b_hq": "12B, HQ Source",
|
| 221 |
"mix-fw_edu_hq-faq_12b_lq": "12B, LQ Source",
|
| 222 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 223 |
}
|
| 224 |
}
|
| 225 |
}
|
|
|
|
| 243 |
config={{
|
| 244 |
setups: {
|
| 245 |
"Tutorial Prompt": {
|
| 246 |
+
datasets: {
|
| 247 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
|
| 248 |
"mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Falcon3",
|
| 249 |
"mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3",
|
| 250 |
"mix-fw_edu_hq-tutorial_1b_hq": "Gemma-3",
|
| 251 |
"mix-fw_edu_hq-tutorial_granite3_1b_hq": "Granite3",
|
| 252 |
"mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
|
| 253 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 254 |
}
|
| 255 |
},
|
| 256 |
"FAQ Prompt": {
|
| 257 |
+
datasets: {
|
| 258 |
"mix-fw_edu_hq-faq_smollm2_1.7b_hq": "SmolLM2",
|
| 259 |
"mix-fw_edu_hq-faq_llama3.2_1b_hq": "Llama-3.2",
|
| 260 |
"mix-fw_edu_hq-faq_falcon3_1b_hq": "Falcon3",
|
| 261 |
"mix-fw_edu_hq-faq_1b_hq": "Gemma-3",
|
| 262 |
"mix-fw_edu_hq-faq_granite3_1b_hq": "Granite3",
|
| 263 |
"mix-fw_edu_hq-faq_qwen3_1.7b_hq": "Qwen3",
|
| 264 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 265 |
}
|
| 266 |
},
|
| 267 |
"Table Prompt": {
|
| 268 |
+
datasets: {
|
| 269 |
"mix-fw_edu_hq-table_smollm2_1.7b_hq": "SmolLM2",
|
| 270 |
"mix-fw_edu_hq-table_falcon3_1b_hq": "Falcon3",
|
| 271 |
"mix-fw_edu_hq-table_granite3_1b_hq": "Granite3",
|
| 272 |
"mix-fw_edu_hq-table_qwen3_1.7b_hq": "Qwen3",
|
| 273 |
"mix-fw_edu_hq-table_llama3.2_1b_hq": "Llama-3.2",
|
| 274 |
"mix-fw_edu_hq-table_1b_hq": "Gemma-3",
|
| 275 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 276 |
}
|
| 277 |
},
|
| 278 |
"Math Prompt": {
|
| 279 |
+
datasets: {
|
| 280 |
"mix-fw_edu_hq-math_smollm2_1.7b_hq": "SmolLM2",
|
| 281 |
"mix-fw_edu_hq-math_falcon3_1b_hq": "Falcon3",
|
| 282 |
"mix-fw_edu_hq-math_granite3_1b_hq": "Granite3",
|
| 283 |
"mix-fw_edu_hq-math_1b_hq": "Gemma-3",
|
| 284 |
"mix-fw_edu_hq-math_llama3.2_1b_hq": "Llama-3.2",
|
| 285 |
"mix-fw_edu_hq-math_qwen3_1.7b_hq": "Qwen3",
|
| 286 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 287 |
}
|
| 288 |
}
|
| 289 |
}
|
|
|
|
| 301 |
src="d3-benchmark-comparison.html"
|
| 302 |
desc="Qwen model generations (1.5 to 3) on the tutorial prompt."
|
| 303 |
config={{
|
| 304 |
+
datasets: {
|
| 305 |
"mix-fw_edu_hq-tutorial_qwen3_1.7b_hq": "Qwen3 (1.7B)",
|
| 306 |
"mix-fw_edu_hq-tutorial_qwen2.5_1.5b_hq": "Qwen2.5 (1.5B)",
|
| 307 |
"mix-fw_edu_hq-tutorial_qwen2_1.5b_hq": "Qwen2 (1.5B)",
|
| 308 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
|
| 309 |
+
"mix-fw_edu_hq-tutorial_qwen1.5_1.8b_hq": "Qwen1.5 (1.8B)"
|
|
|
|
| 310 |
}
|
| 311 |
}}
|
| 312 |
/>
|
|
|
|
| 326 |
|
| 327 |
#### Is synthetic data enough?
|
| 328 |
|
| 329 |
+
We compare synthetic-only training vs mixed training (synthetic + source) for [tutorial](#tutorial) and [faq](#faq) prompts on DCLM and FineWeb-Edu-HQ sources. Synthetic-only training falls short of both DCLM and mixed training (see <FigRef target="synthetic-only" />). Mixed training consistently improves over both the synthetic-only and original-data-only baselines.
|
| 330 |
|
| 331 |
<HtmlEmbed
|
| 332 |
id="synthetic-only"
|
|
|
|
| 335 |
config={{
|
| 336 |
setups: {
|
| 337 |
"DCLM Source": {
|
| 338 |
+
datasets: {
|
| 339 |
"mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
|
| 340 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
|
| 341 |
"mix-dclm-tutorial_1b_dclm": "Mix: Tutorial + DCLM",
|
| 342 |
faq_1b_dclm: "FAQ Only",
|
| 343 |
+
tutorial_1b_dclm: "Tutorial Only"
|
|
|
|
| 344 |
}
|
| 345 |
},
|
| 346 |
"FineWeb-Edu-HQ Source": {
|
| 347 |
+
datasets: {
|
| 348 |
"mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FineWeb-Edu-HQ",
|
| 349 |
"mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FineWeb-Edu-HQ",
|
| 350 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
|
| 351 |
faq_1b_hq: "FAQ Only",
|
| 352 |
+
tutorial_1b_hq: "Tutorial Only"
|
|
|
|
| 353 |
}
|
| 354 |
}
|
| 355 |
}
|
|
|
|
| 360 |
|
| 361 |
#### Does the mix-in dataset matter?
|
| 362 |
|
| 363 |
+
We apply the [tutorial](#tutorial) prompt using Gemma-3-1B on FineWeb-Edu-HQ, then mix in one of four datasets: DCLM, Cosmopedia, FineWeb-Edu-HQ, or FineWeb-Edu-LQ. Use the Setup dropdown to also see results with LQ source data. DCLM outperforms other mix-in datasets. Adding synthetic data improves performance for all mix-in datasets, with the effect especially pronounced for the weaker ones (see <FigRef target="mixin-dataset" />). The mix-in dataset is a major performance driver, sometimes more important than the synthetic data itself.
|
| 364 |
|
| 365 |
<HtmlEmbed
|
| 366 |
id="mixin-dataset"
|
|
|
|
| 369 |
config={{
|
| 370 |
setups: {
|
| 371 |
"HQ Source": {
|
| 372 |
+
datasets: {
|
| 373 |
+
"mix-dclm-tutorial_1b_hq": { display: "Mix-in: DCLM", color: "#4e79a7" },
|
| 374 |
+
"mix-fw_edu_hq-tutorial_1b_hq": { display: "Mix-in: FineWeb-Edu-HQ", color: "#59a14f" },
|
| 375 |
+
dclm: { display: "DCLM", color: "#4e79a7", shaded: true },
|
| 376 |
+
"mix-fw_edu_lq-tutorial_1b_hq": { display: "Mix-in: FineWeb-Edu-LQ", color: "#e15759" },
|
| 377 |
+
"mix-cosmopedia-tutorial_1b_hq": { display: "Mix-in: Cosmopedia", color: "#f28e2b" },
|
| 378 |
+
cosmopedia: { display: "Cosmopedia", color: "#f28e2b", shaded: true },
|
| 379 |
+
fw_edu_hq: { display: "FineWeb-Edu-HQ", color: "#59a14f", shaded: true },
|
| 380 |
+
fw_edu_lq: { display: "FineWeb-Edu-LQ", color: "#e15759", shaded: true }
|
| 381 |
}
|
| 382 |
},
|
| 383 |
"LQ Source": {
|
| 384 |
+
datasets: {
|
| 385 |
+
dclm: { display: "DCLM", color: "#4e79a7", shaded: true },
|
| 386 |
+
"mix-fw_edu_hq-tutorial_1b_lq": { display: "Mix-in: FineWeb-Edu-HQ", color: "#59a14f" },
|
| 387 |
+
"mix-dclm-tutorial_1b_lq": { display: "Mix-in: DCLM", color: "#4e79a7" },
|
| 388 |
+
"mix-cosmopedia-tutorial_1b_lq": { display: "Mix-in: Cosmopedia", color: "#f28e2b" },
|
| 389 |
+
cosmopedia: { display: "Cosmopedia", color: "#f28e2b", shaded: true },
|
| 390 |
+
"mix-fw_edu_lq-tutorial_1b_lq": { display: "Mix-in: FineWeb-Edu-LQ", color: "#e15759" },
|
| 391 |
+
fw_edu_hq: { display: "FineWeb-Edu-HQ", color: "#59a14f", shaded: true },
|
| 392 |
+
fw_edu_lq: { display: "FineWeb-Edu-LQ", color: "#e15759", shaded: true }
|
| 393 |
}
|
| 394 |
}
|
| 395 |
+
}
|
|
|
|
| 396 |
}}
|
| 397 |
/>
|
| 398 |
|
|
|
|
| 409 |
config={{
|
| 410 |
setups: {
|
| 411 |
"Tutorial Prompt": {
|
| 412 |
+
datasets: {
|
| 413 |
"mix-fw_edu_hq-tutorial_1b_hq": "Source: FineWeb-Edu-HQ",
|
| 414 |
"mix-dclm-tutorial_1b_dclm": "Source: DCLM",
|
| 415 |
"mix-cosmopedia-tutorial_1b_cosmopedia": "Source: Cosmopedia",
|
| 416 |
"mix-fw_edu_lq-tutorial_1b_lq": "Source: FineWeb-Edu-LQ",
|
| 417 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 418 |
}
|
| 419 |
},
|
| 420 |
"FAQ Prompt": {
|
| 421 |
+
datasets: {
|
| 422 |
"mix-dclm-faq_1b_dclm": "Source: DCLM",
|
| 423 |
"mix-fw_edu_hq-faq_1b_hq": "Source: FineWeb-Edu-HQ",
|
| 424 |
"mix-fw_edu_lq-faq_1b_lq": "Source: FineWeb-Edu-LQ",
|
| 425 |
"mix-cosmopedia-faq_1b_cosmopedia": "Source: Cosmopedia",
|
| 426 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 427 |
}
|
| 428 |
}
|
| 429 |
}
|
|
|
|
| 437 |
config={{
|
| 438 |
setups: {
|
| 439 |
"Tutorial Prompt": {
|
| 440 |
+
datasets: {
|
| 441 |
"mix-fw_edu_hq-tutorial_1b_dclm": "Source: DCLM",
|
| 442 |
"mix-fw_edu_hq-tutorial_1b_hq": "Source: FineWeb-Edu-HQ",
|
| 443 |
"mix-fw_edu_hq-tutorial_1b_cosmopedia": "Source: Cosmopedia",
|
| 444 |
"mix-fw_edu_hq-tutorial_1b_lq": "Source: FineWeb-Edu-LQ",
|
| 445 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 446 |
}
|
| 447 |
},
|
| 448 |
"FAQ Prompt": {
|
| 449 |
+
datasets: {
|
| 450 |
"mix-fw_edu_hq-faq_1b_dclm": "Source: DCLM",
|
| 451 |
"mix-fw_edu_hq-faq_1b_hq": "Source: FineWeb-Edu-HQ",
|
| 452 |
"mix-fw_edu_hq-faq_1b_lq": "Source: FineWeb-Edu-LQ",
|
| 453 |
"mix-fw_edu_hq-faq_1b_cosmopedia": "Source: Cosmopedia",
|
| 454 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 455 |
}
|
| 456 |
}
|
| 457 |
}
|
|
|
|
| 475 |
config={{
|
| 476 |
setups: {
|
| 477 |
"Mixing Prompts": {
|
| 478 |
+
datasets: {
|
| 479 |
"mix-fw_edu_hq-tutorial_1b_hq-fw_edu_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts + FineWeb-Edu-HQ",
|
| 480 |
"mix-fw_edu_hq-math_1b_hq": "Math + FineWeb-Edu-HQ",
|
| 481 |
"mix-tutorial_1b_hq-faq_1b_hq-table_1b_hq-math_1b_hq": "All Prompts (No Source)",
|
| 482 |
"mix-fw_edu_hq-table_1b_hq": "Table + FineWeb-Edu-HQ",
|
| 483 |
"mix-fw_edu_hq-faq_1b_hq": "FAQ + FineWeb-Edu-HQ",
|
| 484 |
"mix-fw_edu_hq-tutorial_1b_hq": "Tutorial + FineWeb-Edu-HQ",
|
| 485 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 486 |
}
|
| 487 |
},
|
| 488 |
"Mixing Models": {
|
| 489 |
+
datasets: {
|
| 490 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "SmolLM2",
|
| 491 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "SmolLM2 + Falcon3",
|
| 492 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_llama3.2_1b_hq": "SmolLM2 + Llama-3.2",
|
| 493 |
"mix-fw_edu_hq-tutorial_llama3.2_1b_hq-tutorial_granite3_1b_hq": "Llama-3.2 + Granite3",
|
| 494 |
"mix-fw_edu_hq-tutorial_llama3.2_1b_hq": "Llama-3.2",
|
| 495 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 496 |
}
|
| 497 |
},
|
| 498 |
"Mixing Both": {
|
| 499 |
+
datasets: {
|
| 500 |
"mix-fw_edu_hq-faq_smollm2_1.7b_hq": "FAQ (SmolLM2)",
|
| 501 |
"mix-fw_edu_hq-faq_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "FAQ (SmolLM2) + Tutorial (Falcon3)",
|
| 502 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq": "Tutorial (SmolLM2)",
|
| 503 |
"mix-fw_edu_hq-tutorial_smollm2_1.7b_hq-tutorial_falcon3_1b_hq": "Tutorial (SmolLM2) + Tutorial (Falcon3)",
|
| 504 |
"mix-fw_edu_hq-tutorial_falcon3_1b_hq": "Tutorial (Falcon3)",
|
| 505 |
"mix-fw_edu_hq-faq_falcon3_1b_hq": "FAQ (Falcon3)",
|
| 506 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true }
|
|
|
|
| 507 |
}
|
| 508 |
}
|
| 509 |
}
|
|
|
|
| 529 |
src="d3-benchmark-comparison.html"
|
| 530 |
desc="REWIRE prompt with original typos vs improved version at 1B and 12B scale."
|
| 531 |
config={{
|
| 532 |
+
datasets: {
|
| 533 |
"mix-fw_edu_hq-guided_rewrite_original_12b_hq": "Original (12B)",
|
| 534 |
"mix-fw_edu_hq-guided_rewrite_improved_12b_hq": "Improved (12B)",
|
| 535 |
+
dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
|
| 536 |
"mix-fw_edu_hq-guided_rewrite_original_1b_hq": "Original (1B)",
|
| 537 |
+
"mix-fw_edu_hq-guided_rewrite_improved_1b_hq": "Improved (1B)"
|
|
|
|
| 538 |
}
|
| 539 |
}}
|
| 540 |
/>
|
|
|
|
| 569 |
A: No. Typos have no negative effect on downstream performance.
|
| 570 |
|
| 571 |
The bottom line: the details of synthetic rephrasing matter a lot, and knowing which ones matter is the key to scaling it up. Prompt design is the single biggest lever, with structured formats like Math, Table, FAQ, and Tutorial consistently beating curated baselines. But equally important is knowing where you can cut corners without losing quality. You don't need a large rephrasing model (1B is enough for simple prompts, 4B for complex ones). You don't need pristine source data (even low-quality sources work with a strong mix-in). Smaller models generate faster, directly translating into higher throughput. And tolerating lower-quality sources opens up a much bigger and more diverse data pool to draw from. The practical recipe is straightforward: pick a strong structured prompt, use the smallest model that handles it, blend with high-quality original data, and spend your remaining compute on volume.
|
|
|
app/src/content/chapters/introduction.mdx
CHANGED
|
@@ -44,11 +44,9 @@ Here's a preview of where we end up: FinePhrase, our best configuration, clearly
|
|
| 44 |
desc="FinePhrase compared against synthetic data baselines across evaluation metrics."
|
| 45 |
config={{
|
| 46 |
defaultView: "line",
|
| 47 |
-
|
| 48 |
-
baselines: ["cosmopedia", "nemotron_hq_synth", "rewire", "synth_query_reasoning_answer"],
|
| 49 |
-
datasetNames: {
|
| 50 |
cosmopedia: "Cosmopedia",
|
| 51 |
-
"mix-fw_edu_hq-table_smollm2_1.7b_hq": "FinePhrase",
|
| 52 |
nemotron_hq_synth: "Nemotron-HQ-Synth",
|
| 53 |
rewire: "REWIRE",
|
| 54 |
synth_query_reasoning_answer: "SYNTH"
|
|
|
|
| 44 |
desc="FinePhrase compared against synthetic data baselines across evaluation metrics."
|
| 45 |
config={{
|
| 46 |
defaultView: "line",
|
| 47 |
+
datasets: {
|
|
|
|
|
|
|
| 48 |
cosmopedia: "Cosmopedia",
|
| 49 |
+
"mix-fw_edu_hq-table_smollm2_1.7b_hq": { display: "FinePhrase", color: "#EBA937" },
|
| 50 |
nemotron_hq_synth: "Nemotron-HQ-Synth",
|
| 51 |
rewire: "REWIRE",
|
| 52 |
synth_query_reasoning_answer: "SYNTH"
|
app/src/content/embeds/d3-benchmark-comparison.html
CHANGED
|
@@ -3,29 +3,38 @@
|
|
| 3 |
|
| 4 |
Configuration via data-config attribute:
|
| 5 |
{
|
| 6 |
-
"
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
"
|
| 12 |
-
"
|
| 13 |
-
"
|
| 14 |
-
"
|
|
|
|
|
|
|
| 15 |
}
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
Data: uses benchmark-results.csv by default (one CSV with all runs).
|
| 18 |
-
Only rows matching keys in
|
| 19 |
|
| 20 |
Example usage in MDX:
|
| 21 |
<HtmlEmbed
|
| 22 |
src="d3-benchmark-comparison.html"
|
| 23 |
title="Baseline Comparison"
|
| 24 |
config={{
|
| 25 |
-
|
| 26 |
cosmopedia: "Cosmopedia",
|
| 27 |
-
dclm: "DCLM",
|
| 28 |
-
|
| 29 |
}
|
| 30 |
}}
|
| 31 |
/>
|
|
@@ -107,12 +116,9 @@
|
|
| 107 |
.d3-benchmark-comparison .bar.ghost { opacity: .25; }
|
| 108 |
.d3-benchmark-comparison .value-label.ghost { opacity: .25; }
|
| 109 |
.d3-benchmark-comparison .line-path { fill: none; stroke-width: 2; opacity: 0.85; }
|
| 110 |
-
.d3-benchmark-comparison .line-path.baseline { stroke-dasharray: 6,4; opacity: 0.5; }
|
| 111 |
-
.d3-benchmark-comparison .line-path.baseline.ghost { opacity: .1; }
|
| 112 |
.d3-benchmark-comparison .line-path.ghost { opacity: .15; }
|
| 113 |
-
.d3-benchmark-comparison .line-dot.baseline { opacity: 0.5; }
|
| 114 |
-
.d3-benchmark-comparison .line-dot.baseline.ghost { opacity: .1; }
|
| 115 |
.d3-benchmark-comparison .line-dot.ghost { opacity: .15; }
|
|
|
|
| 116 |
.d3-benchmark-comparison .axes path { display: none; }
|
| 117 |
.d3-benchmark-comparison .axes line { stroke: var(--axis-color); }
|
| 118 |
.d3-benchmark-comparison .axes text { fill: var(--tick-color); }
|
|
@@ -183,14 +189,24 @@
|
|
| 183 |
if (raw && raw.trim()) cfg = raw.trim().startsWith('{') ? JSON.parse(raw) : {};
|
| 184 |
} catch (_) {}
|
| 185 |
|
| 186 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
// ─── SETUP SUPPORT ───
|
| 188 |
const SETUPS = cfg.setups || null;
|
| 189 |
const setupNames = SETUPS ? Object.keys(SETUPS) : [];
|
| 190 |
let currentSetup = SETUPS ? setupNames[0] : null;
|
| 191 |
-
let
|
| 192 |
const AVG_SETUP_KEY = 'Average (all setups)';
|
| 193 |
-
let
|
| 194 |
let parsedData = [];
|
| 195 |
|
| 196 |
const RUN_COL = cfg.runColumn || 'runname';
|
|
@@ -198,21 +214,15 @@
|
|
| 198 |
const TOKENS_PER_STEP = cfg.tokensPerStep || 2.1e6;
|
| 199 |
const defaultMetric = cfg.defaultMetric || 'agg_score_macro';
|
| 200 |
const defaultView = cfg.defaultView || 'bar';
|
| 201 |
-
// Stable baseline colors, merged with per-chart overrides
|
| 202 |
-
const PINNED_COLORS = Object.assign({ 'DCLM': '#8b8b8b', 'FineWeb-Edu (HQ)': '#86a1a9' }, cfg.pinnedColors || {});
|
| 203 |
-
// Unique ID suffix for multiple instances on same page
|
| 204 |
const uid = Math.random().toString(36).slice(2, 8);
|
| 205 |
|
| 206 |
-
//
|
| 207 |
-
|
| 208 |
-
function isBaseline(raw) { return
|
|
|
|
|
|
|
| 209 |
// Build the SVG pattern id for a run's stripe fill: per-instance uid plus the raw
// run name with every non-alphanumeric character replaced by "_".
function stripePatternId(raw) {
  const safeName = raw.replace(/[^a-zA-Z0-9]/g, '_');
  return `stripe-${uid}-${safeName}`;
}
|
| 210 |
-
function barFill(d) {
|
| 211 |
-
if (isBaseline(d.rawName)) return `url(#${stripePatternId(d.rawName)})`;
|
| 212 |
-
return colorMap[d.rawName] || 'var(--primary-color)';
|
| 213 |
-
}
|
| 214 |
|
| 215 |
-
// Standard metric display names (shared across all CSVs from this benchmark suite)
|
| 216 |
const METRIC_NAMES = {
|
| 217 |
'agg_score_macro': 'Aggregate Score (Macro)',
|
| 218 |
'agg_score_micro': 'Aggregate Score (Micro)',
|
|
@@ -251,14 +261,13 @@
|
|
| 251 |
|
| 252 |
// State
|
| 253 |
let allData = [];
|
| 254 |
-
let metricKeys = [];
|
| 255 |
let currentMetric = defaultMetric;
|
| 256 |
let currentView = defaultView;
|
| 257 |
let colorMap = {};
|
| 258 |
let highlight = null;
|
| 259 |
|
| 260 |
// ─── HELPERS ───
|
| 261 |
-
// Map a raw run name to its configured display name, falling back to the raw name
// when no (truthy) display name is configured.
function displayName(raw) {
  const label = DATASET_NAMES[raw];
  return label ? label : raw;
}
|
| 262 |
// Human-readable label for a metric column; unknown keys are returned unchanged.
function metricName(key) {
  const label = METRIC_NAMES[key];
  return label ? label : key;
}
|
| 263 |
|
| 264 |
// Convert a training step count into a token count using TOKENS_PER_STEP.
function stepsToTokens(step) {
  const tokens = step * TOKENS_PER_STEP;
  return tokens;
}
|
|
@@ -282,49 +291,46 @@
|
|
| 282 |
function initColors() {
|
| 283 |
if (Object.keys(colorMap).length) return;
|
| 284 |
const allRaw = Array.from(d3.group(allData, d => d[RUN_COL]).keys()).sort();
|
| 285 |
-
// Assign pinned colors first (keyed by display name)
|
| 286 |
const unpinned = [];
|
| 287 |
allRaw.forEach(raw => {
|
| 288 |
-
const
|
| 289 |
-
if (
|
| 290 |
else { unpinned.push(raw); }
|
| 291 |
});
|
| 292 |
-
// Fill remaining from categorical palette
|
| 293 |
const palette = getCategoricalColors(unpinned.length);
|
| 294 |
unpinned.forEach((raw, i) => { colorMap[raw] = palette[i % palette.length]; });
|
| 295 |
}
|
| 296 |
|
| 297 |
// ─── SETUP HELPERS ───
|
| 298 |
function filterData() {
|
| 299 |
-
const knownNames = Object.keys(
|
| 300 |
allData = knownNames.length ? parsedData.filter(r => knownNames.includes(r[RUN_COL])) : parsedData;
|
| 301 |
allData.columns = parsedData.columns;
|
| 302 |
}
|
| 303 |
|
| 304 |
function computeAverageData(rawData) {
|
| 305 |
-
if (!SETUPS || setupNames.length < 2) return { data: [],
|
| 306 |
-
// Build mapping: displayName -> [rawName1, rawName2, ...]
|
| 307 |
const displayToRaws = {};
|
| 308 |
for (const sName of setupNames) {
|
| 309 |
-
const
|
| 310 |
-
for (const [raw,
|
| 311 |
-
if (!displayToRaws[display]) displayToRaws[display] = [];
|
| 312 |
-
displayToRaws[display].push(raw);
|
| 313 |
}
|
| 314 |
}
|
| 315 |
-
// Only average display names that appear in ALL setups
|
| 316 |
const fullDisplay = Object.entries(displayToRaws)
|
| 317 |
.filter(([, raws]) => raws.length >= setupNames.length);
|
| 318 |
-
// Index raw data by runname+step for fast lookup
|
| 319 |
const byRunStep = {};
|
| 320 |
for (const row of rawData) byRunStep[row[RUN_COL] + '|' + row[STEP_COL]] = row;
|
| 321 |
const steps = Array.from(new Set(rawData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
|
| 322 |
const cols = rawData.columns || Object.keys(rawData[0] || {});
|
| 323 |
const result = [];
|
| 324 |
-
const
|
| 325 |
for (const [display, raws] of fullDisplay) {
|
| 326 |
const avgRaw = '__avg__' + display.replace(/[^a-zA-Z0-9]/g, '_');
|
| 327 |
-
|
|
|
|
|
|
|
| 328 |
for (const step of steps) {
|
| 329 |
const rows = raws.map(r => byRunStep[r + '|' + step]).filter(Boolean);
|
| 330 |
if (!rows.length) continue;
|
|
@@ -337,26 +343,23 @@
|
|
| 337 |
result.push(avgRow);
|
| 338 |
}
|
| 339 |
}
|
| 340 |
-
return { data: result,
|
| 341 |
}
|
| 342 |
|
| 343 |
function switchSetup(name) {
|
| 344 |
currentSetup = name;
|
| 345 |
if (name === AVG_SETUP_KEY) {
|
| 346 |
-
|
| 347 |
} else {
|
| 348 |
-
|
| 349 |
}
|
| 350 |
-
// Re-add baselines
|
| 351 |
-
const
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
for (const sName of setupNames) {
|
| 357 |
-
if (SETUPS[sName].datasetNames[bRaw]) { bDisplay = SETUPS[sName].datasetNames[bRaw]; break; }
|
| 358 |
}
|
| 359 |
-
DATASET_NAMES[bRaw] = bDisplay;
|
| 360 |
}
|
| 361 |
}
|
| 362 |
colorMap = {};
|
|
@@ -384,6 +387,10 @@
|
|
| 384 |
gRoot.selectAll('text.value-label').classed('ghost', d => highlight && d.name !== highlight);
|
| 385 |
gRoot.selectAll('.line-path').classed('ghost', d => highlight && d.name !== highlight);
|
| 386 |
gRoot.selectAll('.line-dot').classed('ghost', d => highlight && d.name !== highlight);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
container.querySelectorAll('.legend .item').forEach(el => {
|
| 388 |
el.classList.toggle('ghost', highlight && el.getAttribute('data-name') !== highlight);
|
| 389 |
});
|
|
@@ -392,7 +399,6 @@
|
|
| 392 |
// ─── AUTO-DETECT METRICS from CSV columns ───
|
| 393 |
function detectMetrics(columns) {
|
| 394 |
const skip = new Set([RUN_COL, STEP_COL, 'seed']);
|
| 395 |
-
// Ordered: aggregate first, then individual
|
| 396 |
const aggOrder = ['agg_score_macro', 'agg_score_micro', 'agg_score_RC', 'agg_score_GK', 'agg_score_NLU', 'agg_score_MATH', 'agg_score_TABLE', 'agg_score_RES'];
|
| 397 |
const agg = aggOrder.filter(k => columns.includes(k));
|
| 398 |
const ind = columns.filter(k => !skip.has(k) && !agg.includes(k) && !isNaN(+allData[0][k]));
|
|
@@ -402,7 +408,8 @@
|
|
| 402 |
// ─── BAR CHART ───
|
| 403 |
function renderBar() {
|
| 404 |
const width = container.clientWidth || 800;
|
| 405 |
-
const
|
|
|
|
| 406 |
|
| 407 |
const grouped = d3.group(allData, d => d[RUN_COL]);
|
| 408 |
const finalData = [];
|
|
@@ -413,8 +420,11 @@
|
|
| 413 |
}
|
| 414 |
finalData.sort((a, b) => b.value - a.value);
|
| 415 |
|
|
|
|
|
|
|
|
|
|
| 416 |
const barHeight = 28, barGap = 8;
|
| 417 |
-
const height = margin.top + margin.bottom +
|
| 418 |
svg.attr('width', width).attr('height', height);
|
| 419 |
gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
|
| 420 |
|
|
@@ -422,7 +432,7 @@
|
|
| 422 |
const innerHeight = height - margin.top - margin.bottom;
|
| 423 |
|
| 424 |
const x = d3.scaleLinear().domain([0, d3.max(finalData, d => d.value) * 1.05]).range([0, innerWidth]);
|
| 425 |
-
const y = d3.scaleBand().domain(
|
| 426 |
|
| 427 |
// Grid
|
| 428 |
gRoot.selectAll('.grid').data([0]).join('g').attr('class', 'grid').call(g => {
|
|
@@ -447,10 +457,9 @@
|
|
| 447 |
g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
|
| 448 |
});
|
| 449 |
|
| 450 |
-
//
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
if (!isBaseline(d.rawName)) return;
|
| 454 |
const c = colorMap[d.rawName] || '#999';
|
| 455 |
const pat = defs.append('pattern').attr('id', stripePatternId(d.rawName))
|
| 456 |
.attr('width', 6).attr('height', 6).attr('patternUnits', 'userSpaceOnUse').attr('patternTransform', 'rotate(45)');
|
|
@@ -458,11 +467,17 @@
|
|
| 458 |
pat.append('line').attr('x1', 0).attr('y1', 0).attr('x2', 0).attr('y2', 6).attr('stroke', c).attr('stroke-width', 2.5);
|
| 459 |
});
|
| 460 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
const barTip = (ev, d) => {
|
| 462 |
const [mx, my] = d3.pointer(ev, container);
|
| 463 |
-
showTip(`<strong>${d.name}</strong><br/>${metricName(currentMetric)}: <strong>${d.value.toFixed(
|
| 464 |
};
|
| 465 |
-
gRoot.selectAll('rect.bar').data(
|
| 466 |
enter => enter.append('rect').attr('class', 'bar')
|
| 467 |
.attr('x', 0).attr('y', d => y(d.name)).attr('height', y.bandwidth()).attr('rx', 3)
|
| 468 |
.attr('fill', d => barFill(d))
|
|
@@ -483,14 +498,39 @@
|
|
| 483 |
);
|
| 484 |
|
| 485 |
// Value labels
|
| 486 |
-
gRoot.selectAll('text.value-label').data(
|
| 487 |
enter => enter.append('text').attr('class', 'value-label')
|
| 488 |
.attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
|
| 489 |
.attr('dy', '0.35em').attr('fill', 'var(--text-color)').attr('font-size', 11)
|
| 490 |
-
.text(d => d.value.toFixed(
|
| 491 |
update => update.transition().duration(300)
|
| 492 |
.attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
|
| 493 |
-
.text(d => d.value.toFixed(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
exit => exit.remove()
|
| 495 |
);
|
| 496 |
}
|
|
@@ -498,6 +538,7 @@
|
|
| 498 |
// ─── LINE CHART ───
|
| 499 |
function renderLine() {
|
| 500 |
const width = container.clientWidth || 800;
|
|
|
|
| 501 |
const margin = { top: 16, right: 50, bottom: 48, left: 60 };
|
| 502 |
const height = Math.max(300, Math.round(width / 2.5));
|
| 503 |
svg.attr('width', width).attr('height', height);
|
|
@@ -509,13 +550,20 @@
|
|
| 509 |
// Build series
|
| 510 |
const grouped = d3.group(allData, d => d[RUN_COL]);
|
| 511 |
const series = [];
|
|
|
|
| 512 |
for (const [raw, rows] of grouped) {
|
| 513 |
const pts = rows.map(r => ({ step: +r[STEP_COL], value: +r[currentMetric] })).sort((a, b) => a.step - b.step);
|
| 514 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
}
|
| 516 |
|
| 517 |
-
const allSteps = Array.from(new Set(allData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
|
| 518 |
-
const allValues = series.flatMap(s => s.values.map(v => v.value));
|
| 519 |
|
| 520 |
const x = d3.scaleLinear().domain(d3.extent(allSteps)).range([0, innerWidth]);
|
| 521 |
const yMin = d3.min(allValues), yMax = d3.max(allValues), yPad = (yMax - yMin) * 0.08;
|
|
@@ -555,28 +603,52 @@
|
|
| 555 |
.attr('text-anchor', 'middle').attr('fill', 'var(--text-color)').attr('font-size', 12)
|
| 556 |
.text(metricName(currentMetric));
|
| 557 |
|
| 558 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
const line = d3.line().x(d => x(d.step)).y(d => y(d.value)).curve(d3.curveMonotoneX);
|
| 560 |
gRoot.selectAll('.line-path').data(series, d => d.name).join(
|
| 561 |
-
enter => enter.append('path').attr('class',
|
| 562 |
.attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
|
| 563 |
.attr('d', d => line(d.values)),
|
| 564 |
-
update => update.
|
| 565 |
-
.transition().duration(300)
|
| 566 |
.attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
|
| 567 |
.attr('d', d => line(d.values)),
|
| 568 |
exit => exit.remove()
|
| 569 |
);
|
| 570 |
|
| 571 |
-
// Dots
|
| 572 |
const dotData = series.flatMap(s => s.values.map(v => ({ name: s.name, rawName: s.rawName, step: v.step, value: v.value })));
|
| 573 |
gRoot.selectAll('.line-dot').data(dotData, d => d.name + '-' + d.step).join(
|
| 574 |
-
enter => enter.append('circle').attr('class',
|
| 575 |
.attr('cx', d => x(d.step)).attr('cy', d => y(d.value)).attr('r', 3)
|
| 576 |
.attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)')
|
| 577 |
.attr('stroke', 'var(--surface-bg)').attr('stroke-width', 1),
|
| 578 |
-
update => update.
|
| 579 |
-
.transition().duration(300)
|
| 580 |
.attr('cx', d => x(d.step)).attr('cy', d => y(d.value))
|
| 581 |
.attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)'),
|
| 582 |
exit => exit.remove()
|
|
@@ -597,11 +669,15 @@
|
|
| 597 |
const entries = series.map(s => {
|
| 598 |
const pt = s.values.find(v => v.step === nearest);
|
| 599 |
return pt ? { name: s.name, rawName: s.rawName, value: pt.value } : null;
|
| 600 |
-
}).filter(Boolean)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
|
| 602 |
let html = `<div style="font-weight:700;margin-bottom:4px;">${stepLabelLong(nearest)}</div>`;
|
| 603 |
entries.forEach(e => {
|
| 604 |
-
html += `<div><span class="tip-dot" style="background:${colorMap[e.rawName]}"></span>${e.name}: <strong>${e.value.toFixed(
|
| 605 |
});
|
| 606 |
const [cx, cy] = d3.pointer(ev, container);
|
| 607 |
showTip(html, cx, cy);
|
|
@@ -625,7 +701,6 @@
|
|
| 625 |
function buildUI() {
|
| 626 |
const controls = document.createElement('div'); controls.className = 'controls';
|
| 627 |
|
| 628 |
-
// Setup selector (only shown when setups config is present)
|
| 629 |
if (SETUPS && setupNames.length > 0) {
|
| 630 |
const setupGroup = document.createElement('div'); setupGroup.className = 'control-group';
|
| 631 |
const setupLabel = document.createElement('label'); setupLabel.setAttribute('for', 'setup-' + uid); setupLabel.textContent = 'Setup';
|
|
@@ -635,7 +710,6 @@
|
|
| 635 |
if (name === currentSetup) opt.selected = true;
|
| 636 |
setupSelect.appendChild(opt);
|
| 637 |
});
|
| 638 |
-
// Add Average option
|
| 639 |
if (setupNames.length >= 2) {
|
| 640 |
const avgOpt = document.createElement('option'); avgOpt.value = AVG_SETUP_KEY; avgOpt.textContent = AVG_SETUP_KEY;
|
| 641 |
setupSelect.appendChild(avgOpt);
|
|
@@ -645,7 +719,6 @@
|
|
| 645 |
controls.appendChild(setupGroup);
|
| 646 |
}
|
| 647 |
|
| 648 |
-
// View toggle
|
| 649 |
const viewGroup = document.createElement('div'); viewGroup.className = 'control-group';
|
| 650 |
const viewLabel = document.createElement('label'); viewLabel.setAttribute('for', 'view-' + uid); viewLabel.textContent = 'View';
|
| 651 |
const viewSelect = document.createElement('select'); viewSelect.id = 'view-' + uid;
|
|
@@ -658,7 +731,6 @@
|
|
| 658 |
viewGroup.appendChild(viewLabel); viewGroup.appendChild(viewSelect);
|
| 659 |
controls.appendChild(viewGroup);
|
| 660 |
|
| 661 |
-
// Metric select (populated after data load)
|
| 662 |
const metricGroup = document.createElement('div'); metricGroup.className = 'control-group';
|
| 663 |
const metricLabel = document.createElement('label'); metricLabel.setAttribute('for', 'metric-' + uid); metricLabel.textContent = 'Metric';
|
| 664 |
const metricSelect = document.createElement('select'); metricSelect.id = 'metric-' + uid;
|
|
@@ -667,7 +739,6 @@
|
|
| 667 |
|
| 668 |
container.appendChild(controls);
|
| 669 |
|
| 670 |
-
// Legend
|
| 671 |
const legend = document.createElement('div'); legend.className = 'legend';
|
| 672 |
legend.innerHTML = '<div class="legend-title">Legend</div><div class="items"></div>';
|
| 673 |
container.appendChild(legend);
|
|
@@ -693,7 +764,6 @@
|
|
| 693 |
const items = container.querySelector('.legend .items');
|
| 694 |
if (!items) return;
|
| 695 |
items.innerHTML = '';
|
| 696 |
-
// Sort by final score (max step) on current default metric, descending
|
| 697 |
const grouped = d3.group(allData, d => d[RUN_COL]);
|
| 698 |
const sorted = Array.from(grouped.entries())
|
| 699 |
.map(([raw, rows]) => {
|
|
@@ -703,13 +773,17 @@
|
|
| 703 |
})
|
| 704 |
.sort((a, b) => b.score - a.score)
|
| 705 |
.map(d => d.raw);
|
| 706 |
-
sorted.forEach(raw => {
|
| 707 |
const name = displayName(raw);
|
| 708 |
const el = document.createElement('span'); el.className = 'item'; el.setAttribute('data-name', name);
|
| 709 |
const sw = document.createElement('span'); sw.className = 'swatch';
|
| 710 |
-
const
|
| 711 |
-
|
| 712 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 713 |
const txt = document.createElement('span'); txt.textContent = name;
|
| 714 |
el.appendChild(sw); el.appendChild(txt); items.appendChild(el);
|
| 715 |
el.addEventListener('mouseenter', () => { highlight = name; updateHighlight(); });
|
|
@@ -745,17 +819,14 @@
|
|
| 745 |
const text = await fetchFirstAvailable(csvPaths);
|
| 746 |
const parsed = d3.csvParse(text);
|
| 747 |
parsedData = parsed;
|
| 748 |
-
// Compute average data for setup mode
|
| 749 |
if (SETUPS && setupNames.length >= 2) {
|
| 750 |
const avg = computeAverageData(parsed);
|
| 751 |
-
|
| 752 |
parsedData = parsed.concat(avg.data);
|
| 753 |
parsedData.columns = parsed.columns;
|
| 754 |
}
|
| 755 |
-
// Filter to only datasets with configured display names
|
| 756 |
filterData();
|
| 757 |
metricKeys = detectMetrics(allData.columns);
|
| 758 |
-
// Ensure defaultMetric is valid; fall back to first available
|
| 759 |
if (!metricKeys.includes(currentMetric)) currentMetric = metricKeys[0];
|
| 760 |
populateMetricSelect();
|
| 761 |
render();
|
|
|
|
| 3 |
|
| 4 |
Configuration via data-config attribute:
|
| 5 |
{
|
| 6 |
+
"datasets": { // required (unless using setups)
|
| 7 |
+
"raw_name": "Display Name", // shorthand: string = display name
|
| 8 |
+
"raw_name": { "display": "Name", "color": "#hex", "shaded": true, "baseline": true }
|
| 9 |
+
// full form: display is required, rest optional
|
| 10 |
+
},
|
| 11 |
+
"setups": { "Setup Label": { "datasets": {...} }, ... }, // optional, multi-setup mode with dropdown + average
|
| 12 |
+
"defaultMetric": "agg_score_macro", // optional, default: "agg_score_macro"
|
| 13 |
+
"defaultView": "bar", // optional, "bar" | "line", default: "bar"
|
| 14 |
+
"tokensPerStep": 2100000, // optional, default: 2.1e6
|
| 15 |
+
"runColumn": "runname", // optional, CSV column for series, default: "runname"
|
| 16 |
+
"stepColumn": "steps" // optional, CSV column for x-axis, default: "steps"
|
| 17 |
}
|
| 18 |
|
| 19 |
+
Per-dataset options (all optional except display):
|
| 20 |
+
display: Display name shown in legend, axes, and tooltips
|
| 21 |
+
color: Pinned hex color (otherwise auto-assigned from palette)
|
| 22 |
+
shaded: If true, bar gets a diagonal-stripe pattern (useful for aggregate baselines)
|
| 23 |
+
baseline: If true, rendered as a reference line (vertical in bar view, horizontal in line view)
|
| 24 |
+
instead of a regular bar/line. Not shown in the legend.
|
| 25 |
+
|
| 26 |
Data: uses benchmark-results.csv by default (one CSV with all runs).
|
| 27 |
+
Only rows matching keys in datasets are displayed.
|
| 28 |
|
| 29 |
Example usage in MDX:
|
| 30 |
<HtmlEmbed
|
| 31 |
src="d3-benchmark-comparison.html"
|
| 32 |
title="Baseline Comparison"
|
| 33 |
config={{
|
| 34 |
+
datasets: {
|
| 35 |
cosmopedia: "Cosmopedia",
|
| 36 |
+
dclm: { display: "Baseline (DCLM)", baseline: true },
|
| 37 |
+
nemotron_hq_synth: { display: "Nemotron-HQ-Synth", color: "#76b900", shaded: true }
|
| 38 |
}
|
| 39 |
}}
|
| 40 |
/>
|
|
|
|
| 116 |
.d3-benchmark-comparison .bar.ghost { opacity: .25; }
|
| 117 |
.d3-benchmark-comparison .value-label.ghost { opacity: .25; }
|
| 118 |
.d3-benchmark-comparison .line-path { fill: none; stroke-width: 2; opacity: 0.85; }
|
|
|
|
|
|
|
| 119 |
.d3-benchmark-comparison .line-path.ghost { opacity: .15; }
|
|
|
|
|
|
|
| 120 |
.d3-benchmark-comparison .line-dot.ghost { opacity: .15; }
|
| 121 |
+
.d3-benchmark-comparison .baseline.ghost { opacity: .1; }
|
| 122 |
.d3-benchmark-comparison .axes path { display: none; }
|
| 123 |
.d3-benchmark-comparison .axes line { stroke: var(--axis-color); }
|
| 124 |
.d3-benchmark-comparison .axes text { fill: var(--tick-color); }
|
|
|
|
| 189 |
if (raw && raw.trim()) cfg = raw.trim().startsWith('{') ? JSON.parse(raw) : {};
|
| 190 |
} catch (_) {}
|
| 191 |
|
| 192 |
+
// ─── NORMALIZE DATASETS CONFIG ───
// Accepts: { "key": "Name" } or { "key": { display, color, shaded, baseline } }
// Returns: { key: { display, color, shaded, baseline } }
// Every returned entry is a fresh object (the caller's config objects are not
// aliased) and always carries a `display`; a missing/null/undefined `raw` yields {}.
function normalizeDatasets(raw) {
  const out = {};
  for (const [key, value] of Object.entries(raw || {})) {
    // String shorthand becomes { display }; the object form is shallow-copied.
    // Spreading null/undefined/primitives yields {}, so malformed entries are tolerated.
    const opts = typeof value === 'string' ? { display: value } : { ...value };
    // `display` is documented as required, but fall back to the raw key when it is
    // absent so consumers reading `.display` never receive undefined.
    if (opts.display === undefined || opts.display === null) {
      opts.display = key;
    }
    out[key] = opts;
  }
  return out;
}
|
| 202 |
+
|
| 203 |
// ─── SETUP SUPPORT ───
|
| 204 |
const SETUPS = cfg.setups || null;
|
| 205 |
const setupNames = SETUPS ? Object.keys(SETUPS) : [];
|
| 206 |
let currentSetup = SETUPS ? setupNames[0] : null;
|
| 207 |
+
let DATASETS = SETUPS ? normalizeDatasets(SETUPS[setupNames[0]].datasets) : normalizeDatasets(cfg.datasets);
|
| 208 |
const AVG_SETUP_KEY = 'Average (all setups)';
|
| 209 |
+
let avgDatasets = {};
|
| 210 |
let parsedData = [];
|
| 211 |
|
| 212 |
const RUN_COL = cfg.runColumn || 'runname';
|
|
|
|
| 214 |
const TOKENS_PER_STEP = cfg.tokensPerStep || 2.1e6;
|
| 215 |
const defaultMetric = cfg.defaultMetric || 'agg_score_macro';
|
| 216 |
const defaultView = cfg.defaultView || 'bar';
|
|
|
|
|
|
|
|
|
|
| 217 |
const uid = Math.random().toString(36).slice(2, 8);
|
| 218 |
|
| 219 |
+
// ─── DATASET ACCESSORS ───
|
| 220 |
+
// Display name for a raw run name; runs without a DATASETS entry show their raw name.
function displayName(raw) {
  const entry = DATASETS[raw];
  if (!entry) return raw;
  return entry.display;
}
|
| 221 |
+
// True only when the run has a DATASETS entry whose `baseline` option is truthy.
function isBaseline(raw) {
  const entry = DATASETS[raw];
  return entry ? Boolean(entry.baseline) : false;
}
|
| 222 |
+
// True only when the run has a DATASETS entry whose `shaded` option is truthy.
function isShaded(raw) {
  const entry = DATASETS[raw];
  return entry ? Boolean(entry.shaded) : false;
}
|
| 223 |
+
// Explicit `color` configured for a run, or a falsy value when the run has no
// DATASETS entry (the entry lookup result itself) or no color set.
function pinnedColor(raw) {
  const entry = DATASETS[raw];
  if (!entry) return entry;
  return entry.color;
}
|
| 224 |
// Build the SVG pattern id for a run's stripe fill: per-instance uid plus the raw
// run name with every non-alphanumeric character replaced by "_".
function stripePatternId(raw) {
  const safeName = raw.replace(/[^a-zA-Z0-9]/g, '_');
  return `stripe-${uid}-${safeName}`;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
|
|
|
|
| 226 |
const METRIC_NAMES = {
|
| 227 |
'agg_score_macro': 'Aggregate Score (Macro)',
|
| 228 |
'agg_score_micro': 'Aggregate Score (Micro)',
|
|
|
|
| 261 |
|
| 262 |
// State
|
| 263 |
let allData = [];
|
| 264 |
+
let metricKeys = [];
|
| 265 |
let currentMetric = defaultMetric;
|
| 266 |
let currentView = defaultView;
|
| 267 |
let colorMap = {};
|
| 268 |
let highlight = null;
|
| 269 |
|
| 270 |
// ─── HELPERS ───
|
|
|
|
| 271 |
// Human-readable label for a metric column; unknown keys are returned unchanged.
function metricName(key) {
  const label = METRIC_NAMES[key];
  return label ? label : key;
}
|
| 272 |
|
| 273 |
// Convert a training step count into a token count using TOKENS_PER_STEP.
function stepsToTokens(step) {
  const tokens = step * TOKENS_PER_STEP;
  return tokens;
}
|
|
|
|
| 291 |
// Fill colorMap for the runs currently in allData. Does nothing when colorMap
// already has entries. Runs with a pinned color keep it; the remaining runs
// (in sorted raw-name order) take colors from the categorical palette.
function initColors() {
  if (Object.keys(colorMap).length > 0) return;

  const runNames = Array.from(d3.group(allData, row => row[RUN_COL]).keys()).sort();
  const needsPalette = [];
  for (const run of runNames) {
    const fixed = pinnedColor(run);
    if (fixed) {
      colorMap[run] = fixed;
    } else {
      needsPalette.push(run);
    }
  }

  // Palette is sized to the number of unpinned runs; the modulo guards a shorter palette.
  const palette = getCategoricalColors(needsPalette.length);
  needsPalette.forEach((run, idx) => {
    colorMap[run] = palette[idx % palette.length];
  });
}
|
| 303 |
|
| 304 |
// ─── SETUP HELPERS ───
|
| 305 |
// Restrict allData to rows whose run name is a key of DATASETS. With no configured
// datasets, every parsed row is kept. The parsed column list is carried over
// onto the resulting array.
function filterData() {
  const configured = Object.keys(DATASETS);
  if (configured.length) {
    allData = parsedData.filter(row => configured.includes(row[RUN_COL]));
  } else {
    allData = parsedData;
  }
  allData.columns = parsedData.columns;
}
|
| 310 |
|
| 311 |
function computeAverageData(rawData) {
|
| 312 |
+
if (!SETUPS || setupNames.length < 2) return { data: [], datasets: {} };
|
|
|
|
| 313 |
const displayToRaws = {};
|
| 314 |
for (const sName of setupNames) {
|
| 315 |
+
const ds = normalizeDatasets(SETUPS[sName].datasets);
|
| 316 |
+
for (const [raw, opts] of Object.entries(ds)) {
|
| 317 |
+
if (!displayToRaws[opts.display]) displayToRaws[opts.display] = [];
|
| 318 |
+
displayToRaws[opts.display].push(raw);
|
| 319 |
}
|
| 320 |
}
|
|
|
|
| 321 |
const fullDisplay = Object.entries(displayToRaws)
|
| 322 |
.filter(([, raws]) => raws.length >= setupNames.length);
|
|
|
|
| 323 |
const byRunStep = {};
|
| 324 |
for (const row of rawData) byRunStep[row[RUN_COL] + '|' + row[STEP_COL]] = row;
|
| 325 |
const steps = Array.from(new Set(rawData.map(r => +r[STEP_COL]))).sort((a, b) => a - b);
|
| 326 |
const cols = rawData.columns || Object.keys(rawData[0] || {});
|
| 327 |
const result = [];
|
| 328 |
+
const dsMap = {};
|
| 329 |
for (const [display, raws] of fullDisplay) {
|
| 330 |
const avgRaw = '__avg__' + display.replace(/[^a-zA-Z0-9]/g, '_');
|
| 331 |
+
// Merge options from first setup that has this display name
|
| 332 |
+
const firstOpts = Object.values(normalizeDatasets(SETUPS[setupNames[0]].datasets)).find(o => o.display === display) || {};
|
| 333 |
+
dsMap[avgRaw] = { display, ...firstOpts };
|
| 334 |
for (const step of steps) {
|
| 335 |
const rows = raws.map(r => byRunStep[r + '|' + step]).filter(Boolean);
|
| 336 |
if (!rows.length) continue;
|
|
|
|
| 343 |
result.push(avgRow);
|
| 344 |
}
|
| 345 |
}
|
| 346 |
+
return { data: result, datasets: dsMap };
|
| 347 |
}
|
| 348 |
|
| 349 |
function switchSetup(name) {
|
| 350 |
currentSetup = name;
|
| 351 |
if (name === AVG_SETUP_KEY) {
|
| 352 |
+
DATASETS = { ...avgDatasets };
|
| 353 |
} else {
|
| 354 |
+
DATASETS = normalizeDatasets(SETUPS[name].datasets);
|
| 355 |
}
|
| 356 |
+
// Re-add baselines from any setup
|
| 357 |
+
for (const sName of setupNames) {
|
| 358 |
+
const ds = normalizeDatasets(SETUPS[sName].datasets);
|
| 359 |
+
for (const [raw, opts] of Object.entries(ds)) {
|
| 360 |
+
if (opts.baseline && !DATASETS[raw] && parsedData.some(r => r[RUN_COL] === raw)) {
|
| 361 |
+
DATASETS[raw] = { ...opts };
|
|
|
|
|
|
|
| 362 |
}
|
|
|
|
| 363 |
}
|
| 364 |
}
|
| 365 |
colorMap = {};
|
|
|
|
| 387 |
gRoot.selectAll('text.value-label').classed('ghost', d => highlight && d.name !== highlight);
|
| 388 |
gRoot.selectAll('.line-path').classed('ghost', d => highlight && d.name !== highlight);
|
| 389 |
gRoot.selectAll('.line-dot').classed('ghost', d => highlight && d.name !== highlight);
|
| 390 |
+
gRoot.selectAll('.baseline-vline').classed('ghost', d => highlight && d.name !== highlight);
|
| 391 |
+
gRoot.selectAll('.baseline-vlabel').classed('ghost', d => highlight && d.name !== highlight);
|
| 392 |
+
gRoot.selectAll('.baseline-hline').classed('ghost', d => highlight && d.name !== highlight);
|
| 393 |
+
gRoot.selectAll('.baseline-hlabel').classed('ghost', d => highlight && d.name !== highlight);
|
| 394 |
container.querySelectorAll('.legend .item').forEach(el => {
|
| 395 |
el.classList.toggle('ghost', highlight && el.getAttribute('data-name') !== highlight);
|
| 396 |
});
|
|
|
|
| 399 |
// ─── AUTO-DETECT METRICS from CSV columns ───
|
| 400 |
function detectMetrics(columns) {
|
| 401 |
const skip = new Set([RUN_COL, STEP_COL, 'seed']);
|
|
|
|
| 402 |
const aggOrder = ['agg_score_macro', 'agg_score_micro', 'agg_score_RC', 'agg_score_GK', 'agg_score_NLU', 'agg_score_MATH', 'agg_score_TABLE', 'agg_score_RES'];
|
| 403 |
const agg = aggOrder.filter(k => columns.includes(k));
|
| 404 |
const ind = columns.filter(k => !skip.has(k) && !agg.includes(k) && !isNaN(+allData[0][k]));
|
|
|
|
| 408 |
// ─── BAR CHART ───
|
| 409 |
function renderBar() {
|
| 410 |
const width = container.clientWidth || 800;
|
| 411 |
+
const hasBaselines = allData.some(r => isBaseline(r[RUN_COL]));
|
| 412 |
+
const margin = { top: hasBaselines ? 20 : 12, right: 56, bottom: 32, left: 190 };
|
| 413 |
|
| 414 |
const grouped = d3.group(allData, d => d[RUN_COL]);
|
| 415 |
const finalData = [];
|
|
|
|
| 420 |
}
|
| 421 |
finalData.sort((a, b) => b.value - a.value);
|
| 422 |
|
| 423 |
+
const barData = finalData.filter(d => !isBaseline(d.rawName));
|
| 424 |
+
const baselineData = finalData.filter(d => isBaseline(d.rawName));
|
| 425 |
+
|
| 426 |
const barHeight = 28, barGap = 8;
|
| 427 |
+
const height = margin.top + margin.bottom + barData.length * (barHeight + barGap);
|
| 428 |
svg.attr('width', width).attr('height', height);
|
| 429 |
gRoot.attr('transform', `translate(${margin.left},${margin.top})`);
|
| 430 |
|
|
|
|
| 432 |
const innerHeight = height - margin.top - margin.bottom;
|
| 433 |
|
| 434 |
const x = d3.scaleLinear().domain([0, d3.max(finalData, d => d.value) * 1.05]).range([0, innerWidth]);
|
| 435 |
+
const y = d3.scaleBand().domain(barData.map(d => d.name)).range([0, innerHeight]).padding(0.2);
|
| 436 |
|
| 437 |
// Grid
|
| 438 |
gRoot.selectAll('.grid').data([0]).join('g').attr('class', 'grid').call(g => {
|
|
|
|
| 457 |
g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
|
| 458 |
});
|
| 459 |
|
| 460 |
+
// Stripe patterns for shaded bars
|
| 461 |
+
barData.forEach(d => {
|
| 462 |
+
if (!isShaded(d.rawName)) return;
|
|
|
|
| 463 |
const c = colorMap[d.rawName] || '#999';
|
| 464 |
const pat = defs.append('pattern').attr('id', stripePatternId(d.rawName))
|
| 465 |
.attr('width', 6).attr('height', 6).attr('patternUnits', 'userSpaceOnUse').attr('patternTransform', 'rotate(45)');
|
|
|
|
| 467 |
pat.append('line').attr('x1', 0).attr('y1', 0).attr('x2', 0).attr('y2', 6).attr('stroke', c).attr('stroke-width', 2.5);
|
| 468 |
});
|
| 469 |
|
| 470 |
+
function barFill(d) {
|
| 471 |
+
if (isShaded(d.rawName)) return `url(#${stripePatternId(d.rawName)})`;
|
| 472 |
+
return colorMap[d.rawName] || 'var(--primary-color)';
|
| 473 |
+
}
|
| 474 |
+
|
| 475 |
+
// Bars
|
| 476 |
const barTip = (ev, d) => {
|
| 477 |
const [mx, my] = d3.pointer(ev, container);
|
| 478 |
+
showTip(`<strong>${d.name}</strong><br/>${metricName(currentMetric)}: <strong>${d.value.toFixed(3)}</strong>`, mx, my);
|
| 479 |
};
|
| 480 |
+
gRoot.selectAll('rect.bar').data(barData, d => d.name).join(
|
| 481 |
enter => enter.append('rect').attr('class', 'bar')
|
| 482 |
.attr('x', 0).attr('y', d => y(d.name)).attr('height', y.bandwidth()).attr('rx', 3)
|
| 483 |
.attr('fill', d => barFill(d))
|
|
|
|
| 498 |
);
|
| 499 |
|
| 500 |
// Value labels
|
| 501 |
+
gRoot.selectAll('text.value-label').data(barData, d => d.name).join(
|
| 502 |
enter => enter.append('text').attr('class', 'value-label')
|
| 503 |
.attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
|
| 504 |
.attr('dy', '0.35em').attr('fill', 'var(--text-color)').attr('font-size', 11)
|
| 505 |
+
.text(d => d.value.toFixed(3)),
|
| 506 |
update => update.transition().duration(300)
|
| 507 |
.attr('x', d => x(d.value) + 5).attr('y', d => y(d.name) + y.bandwidth() / 2)
|
| 508 |
+
.text(d => d.value.toFixed(3)),
|
| 509 |
+
exit => exit.remove()
|
| 510 |
+
);
|
| 511 |
+
|
| 512 |
+
// Baseline vertical reference lines
|
| 513 |
+
gRoot.selectAll('.baseline-vline').data(baselineData, d => d.name).join(
|
| 514 |
+
enter => enter.append('line').attr('class', 'baseline-vline baseline')
|
| 515 |
+
.attr('x1', d => x(d.value)).attr('x2', d => x(d.value))
|
| 516 |
+
.attr('y1', 0).attr('y2', innerHeight)
|
| 517 |
+
.attr('stroke', d => colorMap[d.rawName] || '#999')
|
| 518 |
+
.attr('stroke-width', 2).attr('stroke-dasharray', '6,4').attr('opacity', 0.7),
|
| 519 |
+
update => update.transition().duration(300)
|
| 520 |
+
.attr('x1', d => x(d.value)).attr('x2', d => x(d.value))
|
| 521 |
+
.attr('y1', 0).attr('y2', innerHeight)
|
| 522 |
+
.attr('stroke', d => colorMap[d.rawName] || '#999'),
|
| 523 |
+
exit => exit.remove()
|
| 524 |
+
);
|
| 525 |
+
gRoot.selectAll('.baseline-vlabel').data(baselineData, d => d.name).join(
|
| 526 |
+
enter => enter.append('text').attr('class', 'baseline-vlabel baseline')
|
| 527 |
+
.attr('x', d => x(d.value)).attr('y', -4)
|
| 528 |
+
.attr('text-anchor', 'middle').attr('fill', d => colorMap[d.rawName] || '#999')
|
| 529 |
+
.attr('font-size', 11).attr('font-weight', 600)
|
| 530 |
+
.text(d => `${d.name} (${d.value.toFixed(3)})`),
|
| 531 |
+
update => update.transition().duration(300)
|
| 532 |
+
.attr('x', d => x(d.value))
|
| 533 |
+
.text(d => `${d.name} (${d.value.toFixed(3)})`),
|
| 534 |
exit => exit.remove()
|
| 535 |
);
|
| 536 |
}
|
|
|
|
| 538 |
// ─── LINE CHART ───
|
| 539 |
function renderLine() {
|
| 540 |
const width = container.clientWidth || 800;
|
| 541 |
+
const hasBaselines = allData.some(r => isBaseline(r[RUN_COL]));
|
| 542 |
const margin = { top: 16, right: 50, bottom: 48, left: 60 };
|
| 543 |
const height = Math.max(300, Math.round(width / 2.5));
|
| 544 |
svg.attr('width', width).attr('height', height);
|
|
|
|
| 550 |
// Build series
|
| 551 |
const grouped = d3.group(allData, d => d[RUN_COL]);
|
| 552 |
const series = [];
|
| 553 |
+
const baselineSeries = [];
|
| 554 |
for (const [raw, rows] of grouped) {
|
| 555 |
const pts = rows.map(r => ({ step: +r[STEP_COL], value: +r[currentMetric] })).sort((a, b) => a.step - b.step);
|
| 556 |
+
const entry = { name: displayName(raw), rawName: raw, values: pts };
|
| 557 |
+
if (isBaseline(raw)) {
|
| 558 |
+
entry.finalValue = pts[pts.length - 1].value;
|
| 559 |
+
baselineSeries.push(entry);
|
| 560 |
+
} else {
|
| 561 |
+
series.push(entry);
|
| 562 |
+
}
|
| 563 |
}
|
| 564 |
|
| 565 |
+
const allSteps = Array.from(new Set(allData.filter(r => !isBaseline(r[RUN_COL])).map(r => +r[STEP_COL]))).sort((a, b) => a - b);
|
| 566 |
+
const allValues = [...series, ...baselineSeries].flatMap(s => s.finalValue != null ? [s.finalValue] : s.values.map(v => v.value));
|
| 567 |
|
| 568 |
const x = d3.scaleLinear().domain(d3.extent(allSteps)).range([0, innerWidth]);
|
| 569 |
const yMin = d3.min(allValues), yMax = d3.max(allValues), yPad = (yMax - yMin) * 0.08;
|
|
|
|
| 603 |
.attr('text-anchor', 'middle').attr('fill', 'var(--text-color)').attr('font-size', 12)
|
| 604 |
.text(metricName(currentMetric));
|
| 605 |
|
| 606 |
+
// Baseline horizontal reference lines
|
| 607 |
+
gRoot.selectAll('.baseline-hline').data(baselineSeries, d => d.name).join(
|
| 608 |
+
enter => enter.append('line').attr('class', 'baseline-hline baseline')
|
| 609 |
+
.attr('x1', 0).attr('x2', innerWidth)
|
| 610 |
+
.attr('y1', d => y(d.finalValue)).attr('y2', d => y(d.finalValue))
|
| 611 |
+
.attr('stroke', d => colorMap[d.rawName] || '#999')
|
| 612 |
+
.attr('stroke-width', 2).attr('stroke-dasharray', '6,4').attr('opacity', 0.7),
|
| 613 |
+
update => update.transition().duration(300)
|
| 614 |
+
.attr('x1', 0).attr('x2', innerWidth)
|
| 615 |
+
.attr('y1', d => y(d.finalValue)).attr('y2', d => y(d.finalValue))
|
| 616 |
+
.attr('stroke', d => colorMap[d.rawName] || '#999'),
|
| 617 |
+
exit => exit.remove()
|
| 618 |
+
);
|
| 619 |
+
gRoot.selectAll('.baseline-hlabel').data(baselineSeries, d => d.name).join(
|
| 620 |
+
enter => enter.append('text').attr('class', 'baseline-hlabel baseline')
|
| 621 |
+
.attr('x', 4).attr('y', d => y(d.finalValue) - 6)
|
| 622 |
+
.attr('text-anchor', 'start')
|
| 623 |
+
.attr('fill', d => colorMap[d.rawName] || '#999')
|
| 624 |
+
.attr('font-size', 10).attr('font-weight', 600)
|
| 625 |
+
.text(d => `${d.name} (${d.finalValue.toFixed(3)})`),
|
| 626 |
+
update => update.transition().duration(300)
|
| 627 |
+
.attr('x', 4).attr('y', d => y(d.finalValue) - 6)
|
| 628 |
+
.text(d => `${d.name} (${d.finalValue.toFixed(3)})`),
|
| 629 |
+
exit => exit.remove()
|
| 630 |
+
);
|
| 631 |
+
|
| 632 |
+
// Lines (non-baseline)
|
| 633 |
const line = d3.line().x(d => x(d.step)).y(d => y(d.value)).curve(d3.curveMonotoneX);
|
| 634 |
gRoot.selectAll('.line-path').data(series, d => d.name).join(
|
| 635 |
+
enter => enter.append('path').attr('class', 'line-path')
|
| 636 |
.attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
|
| 637 |
.attr('d', d => line(d.values)),
|
| 638 |
+
update => update.transition().duration(300)
|
|
|
|
| 639 |
.attr('stroke', d => colorMap[d.rawName] || 'var(--primary-color)')
|
| 640 |
.attr('d', d => line(d.values)),
|
| 641 |
exit => exit.remove()
|
| 642 |
);
|
| 643 |
|
| 644 |
+
// Dots (non-baseline)
|
| 645 |
const dotData = series.flatMap(s => s.values.map(v => ({ name: s.name, rawName: s.rawName, step: v.step, value: v.value })));
|
| 646 |
gRoot.selectAll('.line-dot').data(dotData, d => d.name + '-' + d.step).join(
|
| 647 |
+
enter => enter.append('circle').attr('class', 'line-dot')
|
| 648 |
.attr('cx', d => x(d.step)).attr('cy', d => y(d.value)).attr('r', 3)
|
| 649 |
.attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)')
|
| 650 |
.attr('stroke', 'var(--surface-bg)').attr('stroke-width', 1),
|
| 651 |
+
update => update.transition().duration(300)
|
|
|
|
| 652 |
.attr('cx', d => x(d.step)).attr('cy', d => y(d.value))
|
| 653 |
.attr('fill', d => colorMap[d.rawName] || 'var(--primary-color)'),
|
| 654 |
exit => exit.remove()
|
|
|
|
| 669 |
const entries = series.map(s => {
|
| 670 |
const pt = s.values.find(v => v.step === nearest);
|
| 671 |
return pt ? { name: s.name, rawName: s.rawName, value: pt.value } : null;
|
| 672 |
+
}).filter(Boolean);
|
| 673 |
+
baselineSeries.forEach(s => {
|
| 674 |
+
entries.push({ name: s.name, rawName: s.rawName, value: s.finalValue });
|
| 675 |
+
});
|
| 676 |
+
entries.sort((a, b) => b.value - a.value);
|
| 677 |
|
| 678 |
let html = `<div style="font-weight:700;margin-bottom:4px;">${stepLabelLong(nearest)}</div>`;
|
| 679 |
entries.forEach(e => {
|
| 680 |
+
html += `<div><span class="tip-dot" style="background:${colorMap[e.rawName]}"></span>${e.name}: <strong>${e.value.toFixed(3)}</strong></div>`;
|
| 681 |
});
|
| 682 |
const [cx, cy] = d3.pointer(ev, container);
|
| 683 |
showTip(html, cx, cy);
|
|
|
|
| 701 |
function buildUI() {
|
| 702 |
const controls = document.createElement('div'); controls.className = 'controls';
|
| 703 |
|
|
|
|
| 704 |
if (SETUPS && setupNames.length > 0) {
|
| 705 |
const setupGroup = document.createElement('div'); setupGroup.className = 'control-group';
|
| 706 |
const setupLabel = document.createElement('label'); setupLabel.setAttribute('for', 'setup-' + uid); setupLabel.textContent = 'Setup';
|
|
|
|
| 710 |
if (name === currentSetup) opt.selected = true;
|
| 711 |
setupSelect.appendChild(opt);
|
| 712 |
});
|
|
|
|
| 713 |
if (setupNames.length >= 2) {
|
| 714 |
const avgOpt = document.createElement('option'); avgOpt.value = AVG_SETUP_KEY; avgOpt.textContent = AVG_SETUP_KEY;
|
| 715 |
setupSelect.appendChild(avgOpt);
|
|
|
|
| 719 |
controls.appendChild(setupGroup);
|
| 720 |
}
|
| 721 |
|
|
|
|
| 722 |
const viewGroup = document.createElement('div'); viewGroup.className = 'control-group';
|
| 723 |
const viewLabel = document.createElement('label'); viewLabel.setAttribute('for', 'view-' + uid); viewLabel.textContent = 'View';
|
| 724 |
const viewSelect = document.createElement('select'); viewSelect.id = 'view-' + uid;
|
|
|
|
| 731 |
viewGroup.appendChild(viewLabel); viewGroup.appendChild(viewSelect);
|
| 732 |
controls.appendChild(viewGroup);
|
| 733 |
|
|
|
|
| 734 |
const metricGroup = document.createElement('div'); metricGroup.className = 'control-group';
|
| 735 |
const metricLabel = document.createElement('label'); metricLabel.setAttribute('for', 'metric-' + uid); metricLabel.textContent = 'Metric';
|
| 736 |
const metricSelect = document.createElement('select'); metricSelect.id = 'metric-' + uid;
|
|
|
|
| 739 |
|
| 740 |
container.appendChild(controls);
|
| 741 |
|
|
|
|
| 742 |
const legend = document.createElement('div'); legend.className = 'legend';
|
| 743 |
legend.innerHTML = '<div class="legend-title">Legend</div><div class="items"></div>';
|
| 744 |
container.appendChild(legend);
|
|
|
|
| 764 |
const items = container.querySelector('.legend .items');
|
| 765 |
if (!items) return;
|
| 766 |
items.innerHTML = '';
|
|
|
|
| 767 |
const grouped = d3.group(allData, d => d[RUN_COL]);
|
| 768 |
const sorted = Array.from(grouped.entries())
|
| 769 |
.map(([raw, rows]) => {
|
|
|
|
| 773 |
})
|
| 774 |
.sort((a, b) => b.score - a.score)
|
| 775 |
.map(d => d.raw);
|
| 776 |
+
sorted.filter(raw => !isBaseline(raw)).forEach(raw => {
|
| 777 |
const name = displayName(raw);
|
| 778 |
const el = document.createElement('span'); el.className = 'item'; el.setAttribute('data-name', name);
|
| 779 |
const sw = document.createElement('span'); sw.className = 'swatch';
|
| 780 |
+
const c = colorMap[raw] || '#999';
|
| 781 |
+
if (isShaded(raw)) {
|
| 782 |
+
sw.style.background = c;
|
| 783 |
+
sw.style.backgroundImage = 'repeating-linear-gradient(45deg, transparent, transparent 2px, rgba(255,255,255,0.4) 2px, rgba(255,255,255,0.4) 4px)';
|
| 784 |
+
} else {
|
| 785 |
+
sw.style.background = c;
|
| 786 |
+
}
|
| 787 |
const txt = document.createElement('span'); txt.textContent = name;
|
| 788 |
el.appendChild(sw); el.appendChild(txt); items.appendChild(el);
|
| 789 |
el.addEventListener('mouseenter', () => { highlight = name; updateHighlight(); });
|
|
|
|
| 819 |
const text = await fetchFirstAvailable(csvPaths);
|
| 820 |
const parsed = d3.csvParse(text);
|
| 821 |
parsedData = parsed;
|
|
|
|
| 822 |
if (SETUPS && setupNames.length >= 2) {
|
| 823 |
const avg = computeAverageData(parsed);
|
| 824 |
+
avgDatasets = avg.datasets;
|
| 825 |
parsedData = parsed.concat(avg.data);
|
| 826 |
parsedData.columns = parsed.columns;
|
| 827 |
}
|
|
|
|
| 828 |
filterData();
|
| 829 |
metricKeys = detectMetrics(allData.columns);
|
|
|
|
| 830 |
if (!metricKeys.includes(currentMetric)) currentMetric = metricKeys[0];
|
| 831 |
populateMetricSelect();
|
| 832 |
render();
|