joelniklaus HF Staff committed on
Commit
05b639b
·
1 Parent(s): 17a96eb

add difference in macro average on benchmarks and add percentage difference for dclm and edu scores

Browse files
app/src/content/assets/data/rephrasing_metadata.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ead55bd8a7db04855376728b9f0bfea398e445c9bad179fcb027dceddcff8a1
3
- size 69832
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65df95a2f19779d4958b9e68b7959deec89eb5da2daedc2a956b8b1863e42160
3
+ size 128848
app/src/content/chapters/experiments.mdx CHANGED
@@ -8,7 +8,7 @@ import FigRef from "../../components/FigRef.astro";
8
  {/* TODO: shorten the vllm inference benchmark or put stuff into the appendix */}
9
  {/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
10
  {/* TODO: add a plot for the table with the benchmark results */}
11
- {/* TODO: Analyze if certain models are more verbose than others (how many tokens did they produce per prompt?) (wait for last rephrasing job to be done) */}
12
  {/* TODO: Run dclm and edu score impact analysis on model verbosity data (wait for last rephrasing job to be done) */}
13
  {/* TODO: Add appendix section of weird unexplainable results? */}
14
 
 
8
  {/* TODO: shorten the vllm inference benchmark or put stuff into the appendix */}
9
  {/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
10
  {/* TODO: add a plot for the table with the benchmark results */}
11
+ {/* TODO: Analyze if certain models are more verbose than others (how many tokens did they produce per prompt?) */}
12
  {/* TODO: Run dclm and edu score impact analysis on model verbosity data (wait for last rephrasing job to be done) */}
13
  {/* TODO: Add appendix section of weird unexplainable results? */}
14
 
app/src/content/embeds/banner.html CHANGED
@@ -22,12 +22,13 @@
22
  container.style.position = 'relative';
23
 
24
  const JSON_PATHS = ['/data/rephrasing_metadata.json', './assets/data/rephrasing_metadata.json'];
 
25
 
26
- const fetchFirstAvailable = async (paths) => {
27
  for (const p of paths) {
28
- try { const r = await fetch(p, { cache: 'no-cache' }); if (r.ok) return r.json(); } catch(_) {}
29
  }
30
- throw new Error('rephrasing_metadata.json not found');
31
  };
32
 
33
  // Derive display fields from a JSON entry
@@ -58,12 +59,41 @@
58
  return d.toLocaleString() + ' days';
59
  }
60
 
61
- function parseEntry(d, i) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  const [cat, promptFile] = d.prompt.split('/');
63
  const promptKey = promptFile.replace('.md', '');
64
  const modelShort = d.model.split('/').pop();
65
  const modelLower = d.model.toLowerCase();
66
  const family = Object.entries(FAMILY_MAP).find(([k]) => modelLower.includes(k))?.[1] || 'Other';
 
 
67
  return {
68
  id: i,
69
  prompt: PROMPT_LABELS[promptKey] || promptKey,
@@ -78,7 +108,11 @@
78
  gpuTime: gpuDays(d.gpu_time_seconds),
79
  docsM: d.num_documents / 1e6,
80
  dclm: d.dclm_score_difference,
 
81
  edu: d.edu_score_difference,
 
 
 
82
  phase: (i * 2.399) % (Math.PI * 2)
83
  };
84
  }
@@ -88,8 +122,23 @@
88
  return n.toFixed(1) + 'B';
89
  }
90
 
91
- fetchFirstAvailable(JSON_PATHS).then(raw => {
92
- const data = raw.map(parseEntry);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  const totalOutputB = data.reduce((s, d) => s + d.compB, 0);
94
  const totalDocsM = data.reduce((s, d) => s + d.docsM, 0);
95
  const numExperiments = data.length;
@@ -299,6 +348,10 @@
299
  const c = col[d.family];
300
  const dc = d.dclm >= 0 ? '#16a34a' : '#dc2626';
301
  const ec = d.edu >= 0 ? '#16a34a' : '#dc2626';
 
 
 
 
302
  tip.html(
303
  `<div style="display:flex;align-items:center;gap:6px;margin-bottom:4px">` +
304
  `<span style="width:8px;height:8px;border-radius:50%;background:${c};opacity:.6;display:inline-block"></span>` +
@@ -309,8 +362,9 @@
309
  `<span style="opacity:.35">Input</span><span>${d.inputHuman}</span>` +
310
  `<span style="opacity:.35">GPU time</span><span>${d.gpuTime}</span>` +
311
  `<span style="opacity:.35">Docs</span><span>${d.docsM.toFixed(1)}M</span>` +
312
- `<span style="opacity:.35">DCLM</span><span style="color:${dc}">${d.dclm >= 0 ? '+' : ''}${d.dclm.toFixed(3)}</span>` +
313
- `<span style="opacity:.35">Edu</span><span style="color:${ec}">${d.edu >= 0 ? '+' : ''}${d.edu.toFixed(3)}</span></div>`
 
314
  ).style('opacity', 1);
315
  })
316
  .on('mousemove', function(event) {
 
22
  container.style.position = 'relative';
23
 
24
  const JSON_PATHS = ['/data/rephrasing_metadata.json', './assets/data/rephrasing_metadata.json'];
25
+ const CSV_PATHS = ['/data/benchmark-results.csv', './assets/data/benchmark-results.csv'];
26
 
27
/**
 * Fetch the first usable resource from a list of candidate URLs.
 *
 * Candidates are tried in order. A candidate is skipped when the request
 * throws, the response is not OK, or its body cannot be read/decoded.
 *
 * @param {string[]} paths - Candidate URLs, in priority order.
 * @param {(text: string) => *} [parse] - Optional parser applied to the
 *   response text; when omitted the body is decoded with `response.json()`.
 * @returns {Promise<*>} Parsed payload of the first usable candidate.
 * @throws {Error} 'Data not found: …' when no candidate is usable; the last
 *   underlying failure (if any) is attached as `cause`.
 */
const fetchFirstAvailable = async (paths, parse) => {
  let lastError;
  for (const p of paths) {
    try {
      const r = await fetch(p, { cache: 'no-cache' });
      if (!r.ok) continue;
      // Await the decode inside the try: without it, a body that fails to
      // decode rejects the returned promise and bypasses the remaining paths.
      return parse ? parse(await r.text()) : await r.json();
    } catch (err) {
      // Remember why this candidate failed, then try the next one.
      lastError = err;
    }
  }
  throw new Error('Data not found: ' + paths.join(', '), { cause: lastError });
};
33
 
34
  // Derive display fields from a JSON entry
 
59
  return d.toLocaleString() + ' days';
60
  }
61
 
62
// Map source_dataset names to baseline run names in the CSV
const SOURCE_TO_BASELINE_RUN = {
  'fineweb-edu-hq-20BT': 'fw_edu_hq',
  'fineweb-edu-lq-20BT': 'fw_edu_lq',
  'dclm-37BT': 'dclm',
  'cosmopedia-25BT': 'cosmopedia'
};

/**
 * Extract, per baseline run, the `agg_score_macro` recorded at the highest
 * step, keyed by source dataset name.
 *
 * Rows whose `steps` or `agg_score_macro` is missing, blank, or non-numeric
 * are ignored. Otherwise a NaN step would pin the result to that row
 * (`x > NaN` is always false) and a blank score would be read as 0.
 *
 * @param {Array<{runname: string, steps: *, agg_score_macro: *}>} csvRows
 *   Parsed CSV rows; numeric fields may be strings.
 * @returns {Object<string, number>} Map of source dataset name to baseline
 *   macro score. Datasets with no usable baseline row are omitted.
 */
function buildBaselineMacro(csvRows) {
  // Blank or missing cells must not coerce to 0, so map them to NaN explicitly.
  const toNumber = (value) =>
    (value == null || String(value).trim() === '') ? NaN : Number(value);

  const baselineRuns = new Set(Object.values(SOURCE_TO_BASELINE_RUN));
  const best = {};
  for (const row of csvRows) {
    if (!baselineRuns.has(row.runname)) continue;
    const step = toNumber(row.steps);
    const score = toNumber(row.agg_score_macro);
    if (!Number.isFinite(step) || !Number.isFinite(score)) continue;
    if (!(row.runname in best) || step > best[row.runname][0]) {
      best[row.runname] = [step, score];
    }
  }

  const out = {};
  for (const [src, run] of Object.entries(SOURCE_TO_BASELINE_RUN)) {
    if (run in best) out[src] = best[run][1];
  }
  return out;
}
88
+
89
+ function parseEntry(d, i, baselineMacro) {
90
  const [cat, promptFile] = d.prompt.split('/');
91
  const promptKey = promptFile.replace('.md', '');
92
  const modelShort = d.model.split('/').pop();
93
  const modelLower = d.model.toLowerCase();
94
  const family = Object.entries(FAMILY_MAP).find(([k]) => modelLower.includes(k))?.[1] || 'Other';
95
+ const aggMacro = d.results?.agg_score_macro;
96
+ const baseline = baselineMacro[d.source_dataset];
97
  return {
98
  id: i,
99
  prompt: PROMPT_LABELS[promptKey] || promptKey,
 
108
  gpuTime: gpuDays(d.gpu_time_seconds),
109
  docsM: d.num_documents / 1e6,
110
  dclm: d.dclm_score_difference,
111
+ dclmBase: d.input_dclm_score,
112
  edu: d.edu_score_difference,
113
+ eduBase: d.input_edu_score,
114
+ aggDiff: (aggMacro != null && baseline != null) ? aggMacro - baseline : null,
115
+ aggBase: baseline,
116
  phase: (i * 2.399) % (Math.PI * 2)
117
  };
118
  }
 
122
  return n.toFixed(1) + 'B';
123
  }
124
 
125
/**
 * Format an absolute difference plus its relative change in brackets,
 * e.g. "+0.018 (+12.3%)" (the bracketed part is wrapped in a dimmed span).
 *
 * @param {number} diff - Absolute difference.
 * @param {number} [base] - Baseline value the difference is relative to.
 * @returns {string} HTML snippet; 'n/a' when `diff` is not a finite number,
 *   and the absolute part only when `base` is missing, zero, or not finite.
 */
function fmtDelta(diff, base) {
  // Missing metrics must not throw inside the tooltip handler.
  if (!Number.isFinite(diff)) return 'n/a';

  const sign = diff >= 0 ? '+' : '';
  const abs = `${sign}${diff.toFixed(3)}`;
  if (!Number.isFinite(base) || base === 0) return abs;

  // Divide by |base| so the percentage keeps the same direction as the
  // absolute difference even when the baseline is negative.
  const pct = (diff / Math.abs(base)) * 100;
  return `${abs} <span style="opacity:.5">(${pct >= 0 ? '+' : ''}${pct.toFixed(1)}%)</span>`;
}
135
+
136
+ Promise.all([
137
+ fetchFirstAvailable(JSON_PATHS),
138
+ fetchFirstAvailable(CSV_PATHS, d3.csvParse)
139
+ ]).then(([raw, csvRows]) => {
140
+ const baselineMacro = buildBaselineMacro(csvRows);
141
+ const data = raw.map((d, i) => parseEntry(d, i, baselineMacro));
142
  const totalOutputB = data.reduce((s, d) => s + d.compB, 0);
143
  const totalDocsM = data.reduce((s, d) => s + d.docsM, 0);
144
  const numExperiments = data.length;
 
348
  const c = col[d.family];
349
  const dc = d.dclm >= 0 ? '#16a34a' : '#dc2626';
350
  const ec = d.edu >= 0 ? '#16a34a' : '#dc2626';
351
+ const ac = d.aggDiff != null ? (d.aggDiff >= 0 ? '#16a34a' : '#dc2626') : null;
352
+ const aggRow = d.aggDiff != null
353
+ ? `<span style="opacity:.35">Δ Macro</span><span style="color:${ac}">${fmtDelta(d.aggDiff, d.aggBase)}</span>`
354
+ : '';
355
  tip.html(
356
  `<div style="display:flex;align-items:center;gap:6px;margin-bottom:4px">` +
357
  `<span style="width:8px;height:8px;border-radius:50%;background:${c};opacity:.6;display:inline-block"></span>` +
 
362
  `<span style="opacity:.35">Input</span><span>${d.inputHuman}</span>` +
363
  `<span style="opacity:.35">GPU time</span><span>${d.gpuTime}</span>` +
364
  `<span style="opacity:.35">Docs</span><span>${d.docsM.toFixed(1)}M</span>` +
365
+ `<span style="opacity:.35">DCLM</span><span style="color:${dc}">${fmtDelta(d.dclm, d.dclmBase)}</span>` +
366
+ `<span style="opacity:.35">Edu</span><span style="color:${ec}">${fmtDelta(d.edu, d.eduBase)}</span>` +
367
+ aggRow + `</div>`
368
  ).style('opacity', 1);
369
  })
370
  .on('mousemove', function(event) {