joelniklaus HF Staff committed on
Commit
e575fa7
·
1 Parent(s): ae34010

updated synthetic only results

Browse files
app/src/content/assets/data/benchmark-results.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19eac7b4c7d51ef51fde0893bd2e5f646c501eafb4da20cff189bad2e2d45262
3
- size 1513555
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc7f86e2cd5b311eb1fec66972254890034acd866037b5096f1551ef877fe72e
3
+ size 1598658
app/src/content/chapters/3-experiments.mdx CHANGED
@@ -8,9 +8,9 @@ import ReadingTime from "../../components/ReadingTime.astro";
8
  {/* TODO: read through entire blog post and make improvements */}
9
  {/* TODO: Integrate decay experiment as another analysis for proxy */}
10
  {/* TODO: share on a bunch of discords/slacks/hackernews/locallama */}
11
- {/* TODO: brainstorm better banner, be artsy */}
12
  {/* TODO: run variance experiments with pretraining from scratch */}
13
  {/* TODO: go through the blog post and update the scale numbers for finephrase dataset */}
 
14
  {/* TODO: banner idea: 1T tokens = 8M books
15
  5cm pro buech = 400km
16
 
@@ -385,29 +385,50 @@ So far we've always mixed synthetic data with a <Glossary term="source dataset"
385
 
386
  #### Is synthetic data enough?
387
 
388
- The dream scenario would be generating all your training data synthetically, no curation needed. We test this by comparing synthetic-only training vs mixed training (synthetic + source) for [faq](#faq) and [tutorial](#tutorial) prompts on DCLM and FineWeb-Edu-HQ sources. Unfortunately, synthetic-only training falls short of both DCLM and mixed training (see <FigRef target="synthetic-only" />). Mixing consistently improves over both the synthetic-only and original-data-only baselines.
389
 
390
  <HtmlEmbed
391
  id="synthetic-only"
392
  src="d3-benchmark-comparison.html"
393
  desc="Synthetic-only vs mixed training. Use the Setup dropdown to compare across source datasets."
394
  config={{
 
395
  setups: {
396
  "DCLM Source": {
397
  datasets: {
 
 
 
398
  "mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
399
- dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
 
400
  "mix-dclm-tutorial_1b_dclm": "Mix: Tutorial + DCLM",
 
 
 
 
401
  faq_1b_dclm: "FAQ Only",
 
 
402
  tutorial_1b_dclm: "Tutorial Only"
403
  }
404
  },
405
  "FineWeb-Edu-HQ Source": {
406
  datasets: {
 
 
 
407
  "mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FineWeb-Edu-HQ",
 
 
408
  "mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FineWeb-Edu-HQ",
409
  dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
 
 
 
410
  faq_1b_hq: "FAQ Only",
 
 
411
  tutorial_1b_hq: "Tutorial Only"
412
  }
413
  }
 
8
  {/* TODO: read through entire blog post and make improvements */}
9
  {/* TODO: Integrate decay experiment as another analysis for proxy */}
10
  {/* TODO: share on a bunch of discords/slacks/hackernews/locallama */}
 
11
  {/* TODO: run variance experiments with pretraining from scratch */}
12
  {/* TODO: go through the blog post and update the scale numbers for finephrase dataset */}
13
+ {/* TODO: brainstorm better banner, be artsy */}
14
  {/* TODO: banner idea: 1T tokens = 8M books
15
  5cm pro buech = 400km
16
 
 
385
 
386
  #### Is synthetic data enough?
387
 
388
+ The dream scenario would be generating all your training data synthetically, no curation needed. We test this by comparing synthetic-only training vs mixed training (synthetic + source) across all our prompts on DCLM and FineWeb-Edu-HQ sources. Unfortunately, synthetic-only training falls short of both the DCLM baseline and mixed training (see <FigRef target="synthetic-only" />). Mixing consistently improves over both the synthetic-only and original-data-only baselines, regardless of prompt type.
389
 
390
  <HtmlEmbed
391
  id="synthetic-only"
392
  src="d3-benchmark-comparison.html"
393
  desc="Synthetic-only vs mixed training. Use the Setup dropdown to compare across source datasets."
394
  config={{
395
+ hideAverage: true,
396
  setups: {
397
  "DCLM Source": {
398
  datasets: {
399
+ "mix-dclm-article_1b_dclm": "Mix: Article + DCLM",
400
+ "mix-dclm-commentary_1b_dclm": "Mix: Commentary + DCLM",
401
+ "mix-dclm-discussion_1b_dclm": "Mix: Discussion + DCLM",
402
  "mix-dclm-faq_1b_dclm": "Mix: FAQ + DCLM",
403
+ "mix-dclm-math_1b_dclm": "Mix: Math + DCLM",
404
+ "mix-dclm-table_1b_dclm": "Mix: Table + DCLM",
405
  "mix-dclm-tutorial_1b_dclm": "Mix: Tutorial + DCLM",
406
+ dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
407
+ article_1b_dclm: "Article Only",
408
+ commentary_1b_dclm: "Commentary Only",
409
+ discussion_1b_dclm: "Discussion Only",
410
  faq_1b_dclm: "FAQ Only",
411
+ math_1b_dclm: "Math Only",
412
+ table_1b_dclm: "Table Only",
413
  tutorial_1b_dclm: "Tutorial Only"
414
  }
415
  },
416
  "FineWeb-Edu-HQ Source": {
417
  datasets: {
418
+ "mix-fw_edu_hq-article_1b_hq": "Mix: Article + FineWeb-Edu-HQ",
419
+ "mix-fw_edu_hq-commentary_1b_hq": "Mix: Commentary + FineWeb-Edu-HQ",
420
+ "mix-fw_edu_hq-discussion_1b_hq": "Mix: Discussion + FineWeb-Edu-HQ",
421
  "mix-fw_edu_hq-faq_1b_hq": "Mix: FAQ + FineWeb-Edu-HQ",
422
+ "mix-fw_edu_hq-math_1b_hq": "Mix: Math + FineWeb-Edu-HQ",
423
+ "mix-fw_edu_hq-table_1b_hq": "Mix: Table + FineWeb-Edu-HQ",
424
  "mix-fw_edu_hq-tutorial_1b_hq": "Mix: Tutorial + FineWeb-Edu-HQ",
425
  dclm: { display: "Baseline (DCLM)", color: "#8b8b8b", baseline: true },
426
+ article_1b_hq: "Article Only",
427
+ commentary_1b_hq: "Commentary Only",
428
+ discussion_1b_hq: "Discussion Only",
429
  faq_1b_hq: "FAQ Only",
430
+ math_1b_hq: "Math Only",
431
+ table_1b_hq: "Table Only",
432
  tutorial_1b_hq: "Tutorial Only"
433
  }
434
  }
app/src/content/embeds/d3-benchmark-comparison.html CHANGED
@@ -205,7 +205,8 @@
205
  const SETUPS = cfg.setups || null;
206
  const setupNames = SETUPS ? Object.keys(SETUPS) : [];
207
  const AVG_SETUP_KEY = 'Average (all setups)';
208
- const defaultSetupCfg = cfg.defaultSetup || (setupNames.length >= 2 ? 'average' : null);
 
209
  let currentSetup = SETUPS ? (defaultSetupCfg === 'average' ? AVG_SETUP_KEY : (defaultSetupCfg && setupNames.includes(defaultSetupCfg) ? defaultSetupCfg : setupNames[0])) : null;
210
  let DATASETS = SETUPS ? (currentSetup === AVG_SETUP_KEY ? {} : normalizeDatasets(SETUPS[currentSetup].datasets)) : normalizeDatasets(cfg.datasets);
211
  let avgDatasets = {};
@@ -712,7 +713,7 @@
712
  if (name === currentSetup) opt.selected = true;
713
  setupSelect.appendChild(opt);
714
  });
715
- if (setupNames.length >= 2) {
716
  const avgOpt = document.createElement('option'); avgOpt.value = AVG_SETUP_KEY; avgOpt.textContent = AVG_SETUP_KEY;
717
  if (currentSetup === AVG_SETUP_KEY) avgOpt.selected = true;
718
  setupSelect.appendChild(avgOpt);
@@ -822,7 +823,7 @@
822
  const text = await fetchFirstAvailable(csvPaths);
823
  const parsed = d3.csvParse(text);
824
  parsedData = parsed;
825
- if (SETUPS && setupNames.length >= 2) {
826
  const avg = computeAverageData(parsed);
827
  avgDatasets = avg.datasets;
828
  const hasAvgData = Object.values(avgDatasets).some(o => !o.baseline);
 
205
  const SETUPS = cfg.setups || null;
206
  const setupNames = SETUPS ? Object.keys(SETUPS) : [];
207
  const AVG_SETUP_KEY = 'Average (all setups)';
208
+ const HIDE_AVERAGE = !!cfg.hideAverage;
209
+ const defaultSetupCfg = cfg.defaultSetup || (setupNames.length >= 2 && !HIDE_AVERAGE ? 'average' : null);
210
  let currentSetup = SETUPS ? (defaultSetupCfg === 'average' ? AVG_SETUP_KEY : (defaultSetupCfg && setupNames.includes(defaultSetupCfg) ? defaultSetupCfg : setupNames[0])) : null;
211
  let DATASETS = SETUPS ? (currentSetup === AVG_SETUP_KEY ? {} : normalizeDatasets(SETUPS[currentSetup].datasets)) : normalizeDatasets(cfg.datasets);
212
  let avgDatasets = {};
 
713
  if (name === currentSetup) opt.selected = true;
714
  setupSelect.appendChild(opt);
715
  });
716
+ if (setupNames.length >= 2 && !HIDE_AVERAGE) {
717
  const avgOpt = document.createElement('option'); avgOpt.value = AVG_SETUP_KEY; avgOpt.textContent = AVG_SETUP_KEY;
718
  if (currentSetup === AVG_SETUP_KEY) avgOpt.selected = true;
719
  setupSelect.appendChild(avgOpt);
 
823
  const text = await fetchFirstAvailable(csvPaths);
824
  const parsed = d3.csvParse(text);
825
  parsedData = parsed;
826
+ if (SETUPS && setupNames.length >= 2 && !HIDE_AVERAGE) {
827
  const avg = computeAverageData(parsed);
828
  avgDatasets = avg.datasets;
829
  const hasAvgData = Object.values(avgDatasets).some(o => !o.baseline);