Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Commit ·
05b639b
1
Parent(s): 17a96eb
add difference in macro average on benchmarks and add percentage difference for dclm and edu scores
Browse files
app/src/content/assets/data/rephrasing_metadata.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65df95a2f19779d4958b9e68b7959deec89eb5da2daedc2a956b8b1863e42160
|
| 3 |
+
size 128848
|
app/src/content/chapters/experiments.mdx
CHANGED
|
@@ -8,7 +8,7 @@ import FigRef from "../../components/FigRef.astro";
|
|
| 8 |
{/* TODO: shorten the vllm inference benchmark or put stuff into the appendix */}
|
| 9 |
{/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
|
| 10 |
{/* TODO: add a plot for the table with the benchmark results */}
|
| 11 |
-
{/* TODO: Analyze if certain models are more verbose than others (how many tokens did they produce per prompt?)
|
| 12 |
{/* TODO: Run dclm and edu score impact analysis on model verbosity data (wait for last rephrasing job to be done) */}
|
| 13 |
{/* TODO: Add appendix section of weird unexplainable results? */}
|
| 14 |
|
|
|
|
| 8 |
{/* TODO: shorten the vllm inference benchmark or put stuff into the appendix */}
|
| 9 |
{/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
|
| 10 |
{/* TODO: add a plot for the table with the benchmark results */}
|
| 11 |
+
{/* TODO: Analyze if certain models are more verbose than others (how many tokens did they produce per prompt?) */}
|
| 12 |
{/* TODO: Run dclm and edu score impact analysis on model verbosity data (wait for last rephrasing job to be done) */}
|
| 13 |
{/* TODO: Add appendix section of weird unexplainable results? */}
|
| 14 |
|
app/src/content/embeds/banner.html
CHANGED
|
@@ -22,12 +22,13 @@
|
|
| 22 |
container.style.position = 'relative';
|
| 23 |
|
| 24 |
const JSON_PATHS = ['/data/rephrasing_metadata.json', './assets/data/rephrasing_metadata.json'];
|
|
|
|
| 25 |
|
| 26 |
-
const fetchFirstAvailable = async (paths) => {
|
| 27 |
for (const p of paths) {
|
| 28 |
-
try { const r = await fetch(p, { cache: 'no-cache' }); if (r.ok) return r.json(); } catch(_) {}
|
| 29 |
}
|
| 30 |
-
throw new Error('
|
| 31 |
};
|
| 32 |
|
| 33 |
// Derive display fields from a JSON entry
|
|
@@ -58,12 +59,41 @@
|
|
| 58 |
return d.toLocaleString() + ' days';
|
| 59 |
}
|
| 60 |
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
const [cat, promptFile] = d.prompt.split('/');
|
| 63 |
const promptKey = promptFile.replace('.md', '');
|
| 64 |
const modelShort = d.model.split('/').pop();
|
| 65 |
const modelLower = d.model.toLowerCase();
|
| 66 |
const family = Object.entries(FAMILY_MAP).find(([k]) => modelLower.includes(k))?.[1] || 'Other';
|
|
|
|
|
|
|
| 67 |
return {
|
| 68 |
id: i,
|
| 69 |
prompt: PROMPT_LABELS[promptKey] || promptKey,
|
|
@@ -78,7 +108,11 @@
|
|
| 78 |
gpuTime: gpuDays(d.gpu_time_seconds),
|
| 79 |
docsM: d.num_documents / 1e6,
|
| 80 |
dclm: d.dclm_score_difference,
|
|
|
|
| 81 |
edu: d.edu_score_difference,
|
|
|
|
|
|
|
|
|
|
| 82 |
phase: (i * 2.399) % (Math.PI * 2)
|
| 83 |
};
|
| 84 |
}
|
|
@@ -88,8 +122,23 @@
|
|
| 88 |
return n.toFixed(1) + 'B';
|
| 89 |
}
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
const totalOutputB = data.reduce((s, d) => s + d.compB, 0);
|
| 94 |
const totalDocsM = data.reduce((s, d) => s + d.docsM, 0);
|
| 95 |
const numExperiments = data.length;
|
|
@@ -299,6 +348,10 @@
|
|
| 299 |
const c = col[d.family];
|
| 300 |
const dc = d.dclm >= 0 ? '#16a34a' : '#dc2626';
|
| 301 |
const ec = d.edu >= 0 ? '#16a34a' : '#dc2626';
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
tip.html(
|
| 303 |
`<div style="display:flex;align-items:center;gap:6px;margin-bottom:4px">` +
|
| 304 |
`<span style="width:8px;height:8px;border-radius:50%;background:${c};opacity:.6;display:inline-block"></span>` +
|
|
@@ -309,8 +362,9 @@
|
|
| 309 |
`<span style="opacity:.35">Input</span><span>${d.inputHuman}</span>` +
|
| 310 |
`<span style="opacity:.35">GPU time</span><span>${d.gpuTime}</span>` +
|
| 311 |
`<span style="opacity:.35">Docs</span><span>${d.docsM.toFixed(1)}M</span>` +
|
| 312 |
-
`<span style="opacity:.35">DCLM</span><span style="color:${dc}">${d.dclm
|
| 313 |
-
`<span style="opacity:.35">Edu</span><span style="color:${ec}">${d.edu
|
|
|
|
| 314 |
).style('opacity', 1);
|
| 315 |
})
|
| 316 |
.on('mousemove', function(event) {
|
|
|
|
| 22 |
container.style.position = 'relative';
|
| 23 |
|
| 24 |
const JSON_PATHS = ['/data/rephrasing_metadata.json', './assets/data/rephrasing_metadata.json'];
|
| 25 |
+
const CSV_PATHS = ['/data/benchmark-results.csv', './assets/data/benchmark-results.csv'];
|
| 26 |
|
| 27 |
+
/**
 * Load the first candidate URL that can be fetched AND decoded.
 *
 * Candidates are tried in order. A candidate is skipped when the request
 * throws, when the response status is not OK, or when decoding the body fails.
 *
 * @param {string[]} paths - Candidate URLs, tried in the given order.
 * @param {(text: string) => *} [parse] - Optional parser applied to the
 *   response text. When omitted, the body is decoded as JSON.
 * @returns {Promise<*>} The decoded payload of the first usable candidate.
 * @throws {Error} `Data not found: <paths>` when no candidate is usable. The
 *   last underlying failure (if any) is attached as `cause`.
 */
const fetchFirstAvailable = async (paths, parse) => {
  let lastError;
  for (const p of paths) {
    try {
      const r = await fetch(p, { cache: 'no-cache' });
      if (!r.ok) continue;
      // Await inside the try block: returning the bare promise would let a
      // decode rejection escape this catch and skip the remaining candidates.
      return await (parse ? parse(await r.text()) : r.json());
    } catch (err) {
      // Remember why this candidate failed, then fall through to the next one.
      lastError = err;
    }
  }
  const message = 'Data not found: ' + paths.join(', ');
  throw lastError === undefined
    ? new Error(message)
    : new Error(message, { cause: lastError });
};
|
| 33 |
|
| 34 |
// Derive display fields from a JSON entry
|
|
|
|
| 59 |
return d.toLocaleString() + ' days';
|
| 60 |
}
|
| 61 |
|
| 62 |
+
// Map source_dataset names to baseline run names in the CSV
const SOURCE_TO_BASELINE_RUN = {
  'fineweb-edu-hq-20BT': 'fw_edu_hq',
  'fineweb-edu-lq-20BT': 'fw_edu_lq',
  'dclm-37BT': 'dclm',
  'cosmopedia-25BT': 'cosmopedia'
};

/**
 * Extract, for every baseline run, the `agg_score_macro` recorded at its
 * highest training step, keyed by the source dataset name.
 *
 * Rows whose `steps` or `agg_score_macro` cell is blank or non-numeric are
 * ignored. Without that guard a NaN step stored first can never be replaced
 * (every `step > NaN` comparison is false) and a blank score silently
 * coerces to 0.
 *
 * @param {Array<Object>} csvRows - Rows carrying `runname`, `steps` and
 *   `agg_score_macro` fields (read as strings and converted to numbers here).
 * @returns {Object<string, number>} Source dataset name -> baseline macro
 *   score. Datasets with no usable baseline row are omitted.
 */
function buildBaselineMacro(csvRows) {
  // Blank cells must not coerce to 0, so treat them as "not a number".
  const toNumber = (raw) =>
    raw == null || String(raw).trim() === '' ? NaN : Number(raw);

  const baselineRuns = new Set(Object.values(SOURCE_TO_BASELINE_RUN));
  const latest = new Map(); // runname -> { step, score }

  for (const row of csvRows) {
    if (!baselineRuns.has(row.runname)) continue;
    const step = toNumber(row.steps);
    const score = toNumber(row.agg_score_macro);
    if (!Number.isFinite(step) || !Number.isFinite(score)) continue;
    const seen = latest.get(row.runname);
    if (seen === undefined || step > seen.step) {
      latest.set(row.runname, { step, score });
    }
  }

  const out = {};
  for (const [src, run] of Object.entries(SOURCE_TO_BASELINE_RUN)) {
    const hit = latest.get(run);
    if (hit !== undefined) out[src] = hit.score;
  }
  return out;
}
|
| 88 |
+
|
| 89 |
+
function parseEntry(d, i, baselineMacro) {
|
| 90 |
const [cat, promptFile] = d.prompt.split('/');
|
| 91 |
const promptKey = promptFile.replace('.md', '');
|
| 92 |
const modelShort = d.model.split('/').pop();
|
| 93 |
const modelLower = d.model.toLowerCase();
|
| 94 |
const family = Object.entries(FAMILY_MAP).find(([k]) => modelLower.includes(k))?.[1] || 'Other';
|
| 95 |
+
const aggMacro = d.results?.agg_score_macro;
|
| 96 |
+
const baseline = baselineMacro[d.source_dataset];
|
| 97 |
return {
|
| 98 |
id: i,
|
| 99 |
prompt: PROMPT_LABELS[promptKey] || promptKey,
|
|
|
|
| 108 |
gpuTime: gpuDays(d.gpu_time_seconds),
|
| 109 |
docsM: d.num_documents / 1e6,
|
| 110 |
dclm: d.dclm_score_difference,
|
| 111 |
+
dclmBase: d.input_dclm_score,
|
| 112 |
edu: d.edu_score_difference,
|
| 113 |
+
eduBase: d.input_edu_score,
|
| 114 |
+
aggDiff: (aggMacro != null && baseline != null) ? aggMacro - baseline : null,
|
| 115 |
+
aggBase: baseline,
|
| 116 |
phase: (i * 2.399) % (Math.PI * 2)
|
| 117 |
};
|
| 118 |
}
|
|
|
|
| 122 |
return n.toFixed(1) + 'B';
|
| 123 |
}
|
| 124 |
|
| 125 |
+
/**
 * Format an absolute difference plus, when a usable baseline exists, the
 * relative change in brackets, e.g. "+0.018 (+12.3%)".
 *
 * @param {number} diff - Absolute difference to display.
 * @param {?number} base - Baseline the difference is relative to. The
 *   percentage is omitted when it is missing, zero or not numeric.
 * @returns {string} HTML snippet, or 'n/a' when `diff` is not a finite
 *   number (so a missing score cannot throw inside the tooltip handler).
 */
function fmtDelta(diff, base) {
  if (typeof diff !== 'number' || !Number.isFinite(diff)) return 'n/a';

  const sign = diff >= 0 ? '+' : '';
  const abs = `${sign}${diff.toFixed(3)}`;
  if (base == null) return abs;

  // Divide by the magnitude of the baseline so the percentage always carries
  // the same sign as the absolute delta, even for a negative baseline.
  const pct = (diff / Math.abs(base)) * 100;
  // Zero or non-numeric baselines produce Infinity/NaN: show the delta only.
  if (!Number.isFinite(pct)) return abs;

  return `${abs} <span style="opacity:.5">(${pct >= 0 ? '+' : ''}${pct.toFixed(1)}%)</span>`;
}
|
| 135 |
+
|
| 136 |
+
Promise.all([
|
| 137 |
+
fetchFirstAvailable(JSON_PATHS),
|
| 138 |
+
fetchFirstAvailable(CSV_PATHS, d3.csvParse)
|
| 139 |
+
]).then(([raw, csvRows]) => {
|
| 140 |
+
const baselineMacro = buildBaselineMacro(csvRows);
|
| 141 |
+
const data = raw.map((d, i) => parseEntry(d, i, baselineMacro));
|
| 142 |
const totalOutputB = data.reduce((s, d) => s + d.compB, 0);
|
| 143 |
const totalDocsM = data.reduce((s, d) => s + d.docsM, 0);
|
| 144 |
const numExperiments = data.length;
|
|
|
|
| 348 |
const c = col[d.family];
|
| 349 |
const dc = d.dclm >= 0 ? '#16a34a' : '#dc2626';
|
| 350 |
const ec = d.edu >= 0 ? '#16a34a' : '#dc2626';
|
| 351 |
+
const ac = d.aggDiff != null ? (d.aggDiff >= 0 ? '#16a34a' : '#dc2626') : null;
|
| 352 |
+
const aggRow = d.aggDiff != null
|
| 353 |
+
? `<span style="opacity:.35">Δ Macro</span><span style="color:${ac}">${fmtDelta(d.aggDiff, d.aggBase)}</span>`
|
| 354 |
+
: '';
|
| 355 |
tip.html(
|
| 356 |
`<div style="display:flex;align-items:center;gap:6px;margin-bottom:4px">` +
|
| 357 |
`<span style="width:8px;height:8px;border-radius:50%;background:${c};opacity:.6;display:inline-block"></span>` +
|
|
|
|
| 362 |
`<span style="opacity:.35">Input</span><span>${d.inputHuman}</span>` +
|
| 363 |
`<span style="opacity:.35">GPU time</span><span>${d.gpuTime}</span>` +
|
| 364 |
`<span style="opacity:.35">Docs</span><span>${d.docsM.toFixed(1)}M</span>` +
|
| 365 |
+
`<span style="opacity:.35">DCLM</span><span style="color:${dc}">${fmtDelta(d.dclm, d.dclmBase)}</span>` +
|
| 366 |
+
`<span style="opacity:.35">Edu</span><span style="color:${ec}">${fmtDelta(d.edu, d.eduBase)}</span>` +
|
| 367 |
+
aggRow + `</div>`
|
| 368 |
).style('opacity', 1);
|
| 369 |
})
|
| 370 |
.on('mousemove', function(event) {
|