Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Commit ·
5ea1a40
1
Parent(s): a160373
add dclm/edu score correlation analysis
Browse files
app/src/content/analysis/score_correlation.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Analyze whether edu-score or DCLM-score predict downstream benchmark performance."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from scipy import stats
|
| 10 |
+
|
| 11 |
+
# Plain-message logging: this script's output IS the report, so no timestamps/levels.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)

# Metadata for all rephrasing experiments, resolved relative to this module.
DATA_PATH = Path(__file__).parent / "assets/data/rephrasing_metadata.json"

# Individual benchmarks (not aggregates)
# NOTE(review): "treb_qa" appears to correspond to TriviaQA elsewhere in the
# project — confirm the key spelling against the metadata file.
INDIVIDUAL_BENCHMARKS = [
    "squad_v2", "arc_cf:easy", "hellaswag_cf", "mmlu_redux_cf:_average",
    "gsm8k", "drop", "wikitablequestions", "treb_qa",
    "winogrande_cf", "piqa_cf", "openbookqa_cf", "xcsqa_cf",
]

# Aggregate scores (category averages plus macro/micro overall averages).
AGG_SCORES = [
    "agg_score_GK", "agg_score_RC", "agg_score_RES",
    "agg_score_NLU", "agg_score_MATH", "agg_score_TABLE",
    "agg_score_macro", "agg_score_micro",
]

# Every correlation target: individual benchmarks followed by aggregates.
ALL_TARGETS = INDIVIDUAL_BENCHMARKS + AGG_SCORES

# Score predictors to test (both input and output variants):
# raw scores of source/rephrased data, absolute difference, and relative improvement.
PREDICTORS = [
    "input_edu_score", "output_edu_score", "edu_score_difference", "edu_score_improvement",
    "input_dclm_score", "output_dclm_score", "dclm_score_difference", "dclm_score_improvement",
]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def load_data(path: "Path | str | None" = None) -> pd.DataFrame:
    """Load rephrasing metadata and flatten per-experiment results into columns.

    Each JSON entry is a dict of experiment metadata containing a nested
    ``"results"`` dict of benchmark scores; the nested scores are promoted to
    top-level columns so predictors and targets live side by side in one frame.

    Args:
        path: JSON file to read. Defaults to the module-level ``DATA_PATH``,
            preserving the original call signature (``load_data()``).

    Returns:
        A DataFrame with one row per experiment; metadata fields and benchmark
        scores are flattened into columns, and the raw ``"results"`` key is
        dropped.
    """
    data_path = DATA_PATH if path is None else Path(path)
    raw = json.loads(data_path.read_text())

    rows = []
    for entry in raw:
        # Keep every metadata field except the nested results container...
        row = {k: v for k, v in entry.items() if k != "results"}
        # ...then merge the benchmark scores up to the top level.
        row.update(entry["results"])
        rows.append(row)

    return pd.DataFrame(rows)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def compute_correlations(
    df: pd.DataFrame,
    predictors: "list[str] | None" = None,
    targets: "list[str] | None" = None,
    agg_scores: "set[str] | None" = None,
    min_samples: int = 5,
) -> pd.DataFrame:
    """Compute Pearson and Spearman correlations between predictors and targets.

    Improvements over the original: columns absent from ``df`` are skipped
    instead of raising ``KeyError``, and values are coerced to float so
    object-dtype columns no longer break ``np.isnan``. The predictor/target
    lists are parameterized (defaulting to the module-level constants) so the
    analysis can be reused on other score sets.

    Args:
        df: One row per experiment, with predictor and target columns.
        predictors: Predictor column names; defaults to ``PREDICTORS``.
        targets: Target column names; defaults to ``ALL_TARGETS``.
        agg_scores: Targets flagged as aggregates; defaults to ``AGG_SCORES``.
        min_samples: Minimum number of valid (non-NaN) pairs required to
            report a correlation. Defaults to the original hard-coded 5.

    Returns:
        One row per (predictor, target) pair with Pearson/Spearman r and p,
        an ``is_aggregate`` flag, and the sample count ``n``.
    """
    predictors = PREDICTORS if predictors is None else predictors
    targets = ALL_TARGETS if targets is None else targets
    agg_set = set(AGG_SCORES) if agg_scores is None else set(agg_scores)

    results = []
    for predictor in predictors:
        for target in targets:
            # Guard: skip pairs whose columns are missing rather than crashing.
            if predictor not in df.columns or target not in df.columns:
                continue

            # Coerce to float so NaN masking works even on object-dtype columns.
            x = pd.to_numeric(df[predictor], errors="coerce").to_numpy(dtype=float)
            y = pd.to_numeric(df[target], errors="coerce").to_numpy(dtype=float)

            # Drop NaN pairs
            mask = ~(np.isnan(x) | np.isnan(y))
            x, y = x[mask], y[mask]

            if len(x) < min_samples:
                continue

            pearson_r, pearson_p = stats.pearsonr(x, y)
            spearman_r, spearman_p = stats.spearmanr(x, y)

            results.append({
                "predictor": predictor,
                "target": target,
                "is_aggregate": target in agg_set,
                "pearson_r": pearson_r,
                "pearson_p": pearson_p,
                "spearman_r": spearman_r,
                "spearman_p": spearman_p,
                "n": len(x),
            })

    return pd.DataFrame(results)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def print_correlation_table(corr_df: pd.DataFrame, title: str, sort_by: str = "spearman_r") -> None:
    """Log a formatted correlation table sorted by absolute correlation strength.

    Fixes a defect: the ``sort_by`` parameter was accepted but ignored — the
    table was always sorted by ``|spearman_r|``. It now sorts by the absolute
    value of the requested column; the default preserves the old behavior.

    Args:
        corr_df: Output of ``compute_correlations`` (one row per pair).
        title: Heading printed above the table.
        sort_by: Column whose absolute value determines row order
            (e.g. ``"spearman_r"`` or ``"pearson_r"``).
    """
    logger.info(f"\n{'='*90}")
    logger.info(f" {title}")
    logger.info(f"{'='*90}")

    df = corr_df.copy()
    # Sort by magnitude of the requested correlation column (sort_by was
    # previously unused — this is the actual fix).
    df["abs_sort"] = df[sort_by].abs()
    df = df.sort_values("abs_sort", ascending=False)

    logger.info(f"{'Predictor':<28} {'Target':<28} {'Pearson r':>10} {'p':>10} {'Spearman r':>10} {'p':>10}")
    logger.info("-" * 98)

    for _, row in df.iterrows():
        # Conventional significance stars for the Spearman p-value.
        if row["spearman_p"] < 0.001:
            sig_marker = "***"
        elif row["spearman_p"] < 0.01:
            sig_marker = "**"
        elif row["spearman_p"] < 0.05:
            sig_marker = "*"
        else:
            sig_marker = ""

        logger.info(
            f"{row['predictor']:<28} {row['target']:<28} "
            f"{row['pearson_r']:>9.4f} {row['pearson_p']:>10.4f} "
            f"{row['spearman_r']:>9.4f} {row['spearman_p']:>10.4f} {sig_marker}"
        )
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def main() -> None:
    """Run the full correlation analysis: load data, correlate, report, save.

    Produces six console report sections plus a JSON artifact
    (``score_correlation_results.json``) consumed by the D3 heatmap embed.
    """
    df = load_data()
    logger.info(f"Loaded {len(df)} experiments")

    # All (predictor, target) correlation rows used by every section below.
    corr_df = compute_correlations(df)

    # 1. Overall best predictors for agg_score_macro
    macro_corr = corr_df[corr_df["target"] == "agg_score_macro"]
    print_correlation_table(macro_corr, "Correlations with agg_score_macro")

    # 2. Best predictors for each aggregate score
    agg_corr = corr_df[corr_df["is_aggregate"]]
    print_correlation_table(agg_corr, "All predictor-aggregate correlations")

    # 3. Best predictors for individual benchmarks
    indiv_corr = corr_df[~corr_df["is_aggregate"]]
    print_correlation_table(indiv_corr, "All predictor-individual benchmark correlations")

    # 4. Summary: for each predictor, which targets correlate best?
    logger.info(f"\n{'='*90}")
    logger.info(" Summary: Best target for each predictor (by |Spearman r|)")
    logger.info(f"{'='*90}")
    for predictor in PREDICTORS:
        sub = corr_df[corr_df["predictor"] == predictor].copy()
        sub["abs_spearman"] = sub["spearman_r"].abs()
        # Top 3 targets by correlation magnitude (sign-agnostic).
        best = sub.sort_values("abs_spearman", ascending=False).head(3)
        logger.info(f"\n {predictor}:")
        for _, row in best.iterrows():
            # Significance stars: *** p<0.001, ** p<0.01, * p<0.05.
            sig = "***" if row["spearman_p"] < 0.001 else ("**" if row["spearman_p"] < 0.01 else ("*" if row["spearman_p"] < 0.05 else ""))
            logger.info(f" {row['target']:<28} r={row['spearman_r']:>7.4f} p={row['spearman_p']:.4f} {sig}")

    # 5. Summary: for each target, which predictor correlates best?
    logger.info(f"\n{'='*90}")
    logger.info(" Summary: Best predictor for each target (by |Spearman r|)")
    logger.info(f"{'='*90}")
    for target in ALL_TARGETS:
        sub = corr_df[corr_df["target"] == target].copy()
        sub["abs_spearman"] = sub["spearman_r"].abs()
        # Single strongest predictor for this target.
        best = sub.sort_values("abs_spearman", ascending=False).iloc[0]
        sig = "***" if best["spearman_p"] < 0.001 else ("**" if best["spearman_p"] < 0.01 else ("*" if best["spearman_p"] < 0.05 else ""))
        logger.info(
            f" {target:<28} <- {best['predictor']:<28} r={best['spearman_r']:>7.4f} p={best['spearman_p']:.4f} {sig}"
        )

    # 6. Heatmap data: pivot table of Spearman correlations
    logger.info(f"\n{'='*90}")
    logger.info(" Spearman correlation heatmap (predictor x target)")
    logger.info(f"{'='*90}")
    pivot = corr_df.pivot(index="predictor", columns="target", values="spearman_r")
    # Reorder rows/columns to the canonical constant order for stable output.
    pivot = pivot.loc[PREDICTORS, ALL_TARGETS]
    logger.info(pivot.round(3).to_string())

    # Save heatmap data for potential D3 visualization
    output_path = Path(__file__).parent / "score_correlation_results.json"
    output = {
        "heatmap": {
            "predictors": PREDICTORS,
            "targets": ALL_TARGETS,
            # Row-major matrix matching the predictors x targets ordering above.
            "spearman_r": pivot.values.tolist(),
            "individual_benchmarks": INDIVIDUAL_BENCHMARKS,
            "aggregate_scores": AGG_SCORES,
        },
        "correlations": corr_df.to_dict(orient="records"),
    }
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2)
    logger.info(f"\nSaved results to {output_path}")
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# Script entry point: run the full correlation analysis and save results.
if __name__ == "__main__":
    main()
|
app/src/content/analysis/score_correlation_results.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb63ac51a4ac538702c4943266c5848d8f8257fdc34268e95e1d3ce6e1847012
|
| 3 |
+
size 52825
|
app/src/content/chapters/analyses.mdx
CHANGED
|
@@ -1,20 +1,38 @@
|
|
| 1 |
import HtmlEmbed from "../../components/HtmlEmbed.astro";
|
| 2 |
import FigRef from "../../components/FigRef.astro";
|
|
|
|
| 3 |
|
| 4 |
## Analyses
|
| 5 |
|
| 6 |
-
|
| 7 |
|
| 8 |
-
{/*
|
| 9 |
|
| 10 |
### Does edu-score or DCLM-score predict model performance?
|
| 11 |
|
| 12 |
-
Running these ablations is super expensive. So we were looking for informative proxies that can predict whether a certain dataset will result in better downstream benchmark performance. Since the FineWeb-Edu-score and DCLM-score work well for human data, we
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
|
|
|
|
|
|
|
| 16 |
*/}
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
### Math Rephrasing: When "Worse" Outputs Win
|
| 19 |
|
| 20 |
We compared two ~1.7B parameter models for generating math word problems: SmolLM2 and Qwen3. SmolLM2's outputs looked objectively worse, yet models trained on them performed better.
|
|
|
|
| 1 |
import HtmlEmbed from "../../components/HtmlEmbed.astro";
|
| 2 |
import FigRef from "../../components/FigRef.astro";
|
| 3 |
+
import Wide from "../../components/Wide.astro";
|
| 4 |
|
| 5 |
## Analyses
|
| 6 |
|
| 7 |
+
TODO: Add entry and exit paragraph
|
| 8 |
|
|
|
|
| 9 |
|
| 10 |
### Does edu-score or DCLM-score predict model performance?
|
| 11 |
|
| 12 |
+
Running these ablations is super expensive. So we were looking for informative proxies that can predict whether a certain dataset will result in better downstream benchmark performance. Since the FineWeb-Edu-score and DCLM-score work well for human data, we thought they might also work for synthetic data.
|
| 13 |
+
|
| 14 |
+
We computed Spearman rank correlations between various edu-score and DCLM-score metrics (input scores, output scores, score differences, and relative improvements) and all downstream benchmark results across our 65 experiments. <FigRef target="score-correlation" /> shows the full correlation matrix.
|
| 15 |
+
|
| 16 |
+
**DCLM-score is a moderate predictor of aggregate performance.** The output DCLM-score shows the strongest correlation with `agg_score_macro` (ρ = 0.55, p {'<'} 0.001), and DCLM-score difference (output minus input) is similarly predictive (ρ = 0.52). These are moderate correlations at best. The DCLM-score variants are particularly predictive for table understanding (ρ = 0.52–0.55) and reading comprehension (ρ = 0.45–0.47).
|
| 17 |
|
| 18 |
+
**Edu-score tells a more nuanced story.** The input edu-score (the score of the original data before rephrasing) correlates with aggregate performance (ρ = 0.43), but the output edu-score (the score of the rephrased data) barely correlates at all (ρ = 0.21, not significant). This suggests that starting with higher-quality source data matters, but the edu-score of the synthetic output is not a reliable proxy.
|
| 19 |
|
| 20 |
+
{/*
|
| 21 |
+
**The HellaSwag/PIQA anomaly deserves a closer look.** Edu-score improvement shows strong *positive* correlations with HellaSwag (ρ = 0.60) and PIQA (ρ = 0.58), while being *negatively* correlated with math (ρ = −0.39) and reading comprehension (ρ = −0.30). We investigated whether this was a confound from prompt type (FAQ and tutorial prompts both increase edu-scores and might independently help NLU). The correlation survives partial correlation controlling for prompt type (ρ = 0.65 for HellaSwag, ρ = 0.56 for PIQA, both p {'<'} 0.001) and for model size within the Gemma family (ρ = 0.60 and 0.68). So the effect is real. However, the practical magnitude is tiny: HellaSwag scores range from 0.066 to 0.092 across all 65 experiments (CV = 5.8%), compared to `agg_score_macro` ranging from 0.096 to 0.172 (CV = 10.5%). The edu-score captures something about sentence-completion and physical-intuition quality, but the absolute differences are so small that optimizing for it would be chasing noise.
|
| 22 |
*/}
|
| 23 |
|
| 24 |
+
**Neither score is a reliable universal proxy.** WinoGrande shows essentially zero correlation with any predictor. The strongest individual correlations (ρ ≈ 0.55–0.60) are still only moderate, explaining roughly 30% of the variance at best. **For synthetic data, there is no shortcut: you have to train models and evaluate them.**
|
| 25 |
+
|
| 26 |
+
<Wide>
|
| 27 |
+
<HtmlEmbed
|
| 28 |
+
id="score-correlation"
|
| 29 |
+
src="score-correlation.html"
|
| 30 |
+
data="rephrasing_metadata.json"
|
| 31 |
+
desc="Spearman rank correlations between quality score metrics and downstream benchmark performance across 65 rephrasing experiments. Blue cells indicate positive correlations, red cells negative. Significance: *** p<0.001, ** p<0.01, * p<0.05."
|
| 32 |
+
/>
|
| 33 |
+
</Wide>
|
| 34 |
+
|
| 35 |
+
|
| 36 |
### Math Rephrasing: When "Worse" Outputs Win
|
| 37 |
|
| 38 |
We compared two ~1.7B parameter models for generating math word problems: SmolLM2 and Qwen3. SmolLM2's outputs looked objectively worse, yet models trained on them performed better.
|
app/src/content/chapters/experiments.mdx
CHANGED
|
@@ -4,13 +4,10 @@ import Sidenote from "../../components/Sidenote.astro";
|
|
| 4 |
import Glossary from "../../components/Glossary.astro";
|
| 5 |
import FigRef from "../../components/FigRef.astro";
|
| 6 |
|
| 7 |
-
{/* TODO:
|
| 8 |
{/* TODO: shorten the vllm inference benchmark or put stuff into the appendix */}
|
| 9 |
{/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
|
| 10 |
-
{/* TODO:
|
| 11 |
-
{/* TODO: Analyze if certain models are more verbose than others (how many tokens did they produce per prompt?) */}
|
| 12 |
-
{/* TODO: Run dclm and edu score impact analysis on model verbosity data (wait for last rephrasing job to be done) */}
|
| 13 |
-
{/* TODO: Add appendix section of weird unexplainable results? */}
|
| 14 |
|
| 15 |
## Experiments
|
| 16 |
|
|
|
|
| 4 |
import Glossary from "../../components/Glossary.astro";
|
| 5 |
import FigRef from "../../components/FigRef.astro";
|
| 6 |
|
| 7 |
+
{/* TODO: mention the currently running finephrase rephrasing with smollm2 */}
|
| 8 |
{/* TODO: shorten the vllm inference benchmark or put stuff into the appendix */}
|
| 9 |
{/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
|
| 10 |
+
{/* TODO: Check if we have more information in the rephrasing_metadata that we can use to do analyses */}
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
## Experiments
|
| 13 |
|
app/src/content/embeds/score-correlation.html
ADDED
|
@@ -0,0 +1,596 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<div class="d3-score-correlation" style="width:100%;margin:10px 0;min-height:400px;"></div>
|
| 2 |
+
<style>
|
| 3 |
+
.d3-score-correlation { font-family: system-ui, -apple-system, sans-serif; }
|
| 4 |
+
.d3-score-correlation .d3-tooltip {
|
| 5 |
+
position: absolute; top: 0; left: 0;
|
| 6 |
+
transform: translate(-9999px, -9999px);
|
| 7 |
+
pointer-events: none;
|
| 8 |
+
padding: 10px 14px; border-radius: 10px;
|
| 9 |
+
font-size: 12px; line-height: 1.4;
|
| 10 |
+
border: 1px solid var(--border-color);
|
| 11 |
+
background: var(--surface-bg); color: var(--text-color);
|
| 12 |
+
box-shadow: 0 6px 24px rgba(0,0,0,.22);
|
| 13 |
+
opacity: 0; transition: opacity .12s ease;
|
| 14 |
+
z-index: 20; max-width: 300px;
|
| 15 |
+
}
|
| 16 |
+
.d3-score-correlation .legend {
|
| 17 |
+
display: flex; flex-direction: column; align-items: flex-start; gap: 6px;
|
| 18 |
+
margin-top: 8px;
|
| 19 |
+
}
|
| 20 |
+
.d3-score-correlation .legend-title {
|
| 21 |
+
font-size: 12px; font-weight: 700; color: var(--text-color);
|
| 22 |
+
}
|
| 23 |
+
.d3-score-correlation .legend .items {
|
| 24 |
+
display: flex; flex-wrap: wrap; gap: 4px 12px; align-items: center;
|
| 25 |
+
}
|
| 26 |
+
.d3-score-correlation .legend .item {
|
| 27 |
+
display: inline-flex; align-items: center; gap: 5px; font-size: 11px; color: var(--text-color);
|
| 28 |
+
}
|
| 29 |
+
.d3-score-correlation .legend .swatch {
|
| 30 |
+
width: 20px; height: 14px; border-radius: 3px; border: 1px solid var(--border-color);
|
| 31 |
+
}
|
| 32 |
+
</style>
|
| 33 |
+
<script>
|
| 34 |
+
(() => {
|
| 35 |
+
const ensureD3 = (cb) => {
|
| 36 |
+
if (window.d3 && typeof window.d3.select === 'function') return cb();
|
| 37 |
+
let s = document.getElementById('d3-cdn-script');
|
| 38 |
+
if (!s) { s = document.createElement('script'); s.id = 'd3-cdn-script'; s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js'; document.head.appendChild(s); }
|
| 39 |
+
const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
|
| 40 |
+
s.addEventListener('load', onReady, { once: true });
|
| 41 |
+
if (window.d3) onReady();
|
| 42 |
+
};
|
| 43 |
+
|
| 44 |
+
const bootstrap = () => {
|
| 45 |
+
const scriptEl = document.currentScript;
|
| 46 |
+
let container = scriptEl ? scriptEl.previousElementSibling : null;
|
| 47 |
+
while (container && !(container.classList && container.classList.contains('d3-score-correlation'))) {
|
| 48 |
+
container = container.previousElementSibling;
|
| 49 |
+
}
|
| 50 |
+
if (!container) {
|
| 51 |
+
const cs = Array.from(document.querySelectorAll('.d3-score-correlation'))
|
| 52 |
+
.filter(el => !(el.dataset && el.dataset.mounted === 'true'));
|
| 53 |
+
container = cs[cs.length - 1] || null;
|
| 54 |
+
}
|
| 55 |
+
if (!container) return;
|
| 56 |
+
if (container.dataset.mounted === 'true') return;
|
| 57 |
+
container.dataset.mounted = 'true';
|
| 58 |
+
|
| 59 |
+
let mountEl = container;
|
| 60 |
+
while (mountEl && !mountEl.getAttribute?.('data-datafiles')) mountEl = mountEl.parentElement;
|
| 61 |
+
const dataAttr = mountEl?.getAttribute?.('data-datafiles');
|
| 62 |
+
const dataPaths = dataAttr
|
| 63 |
+
? [dataAttr.includes('/') ? dataAttr : `/data/${dataAttr}`]
|
| 64 |
+
: ['/data/rephrasing_metadata.json', './assets/data/rephrasing_metadata.json'];
|
| 65 |
+
|
| 66 |
+
const fetchFirst = async (paths) => {
|
| 67 |
+
for (const p of paths) {
|
| 68 |
+
try { const r = await fetch(p, { cache: 'no-cache' }); if (r.ok) return r.json(); } catch(_) {}
|
| 69 |
+
}
|
| 70 |
+
throw new Error('Data not found');
|
| 71 |
+
};
|
| 72 |
+
|
| 73 |
+
fetchFirst(dataPaths).then(data => buildChart(data)).catch(err => {
|
| 74 |
+
container.innerHTML = `<pre style="color:red;padding:12px;">Error: ${err.message}</pre>`;
|
| 75 |
+
});
|
| 76 |
+
|
| 77 |
+
function buildChart(rawData) {
|
| 78 |
+
// Spearman correlation helpers
|
| 79 |
+
// Assign 1-based ranks to `arr`, averaging ranks across ties (the standard
// "fractional ranking" used by Spearman correlation).
const rankArray = (arr) => {
  // Sort value/original-index pairs so ranks can be mapped back to positions.
  const indexed = arr.map((v, i) => ({ v, i })).sort((a, b) => a.v - b.v);
  const ranks = new Array(arr.length);
  let i = 0;
  while (i < indexed.length) {
    // Advance j past the run of equal values starting at i.
    let j = i;
    while (j < indexed.length && indexed[j].v === indexed[i].v) j++;
    // Average of 1-based ranks i+1 .. j for the tied run: ((i+1)+j)/2.
    const avgRank = (i + j + 1) / 2;
    for (let k = i; k < j; k++) ranks[indexed[k].i] = avgRank;
    i = j;
  }
  return ranks;
};
|
| 92 |
+
|
| 93 |
+
// Spearman rank correlation with a two-sided p-value.
// r is Pearson correlation of the tie-averaged ranks; significance uses the
// t-statistic t = r*sqrt((n-2)/(1-r^2)) with df = n-2, approximated by the
// normal CDF for large samples (df > 30) and the t CDF otherwise.
const spearman = (x, y) => {
  const n = x.length;
  // Too few points for a meaningful estimate: report zero correlation.
  if (n < 5) return { r: 0, p: 1 };
  const rx = rankArray(x), ry = rankArray(y);
  const mx = rx.reduce((a, b) => a + b, 0) / n;
  const my = ry.reduce((a, b) => a + b, 0) / n;
  let num = 0, dx2 = 0, dy2 = 0;
  for (let i = 0; i < n; i++) {
    const dx = rx[i] - mx, dy = ry[i] - my;
    num += dx * dy; dx2 += dx * dx; dy2 += dy * dy;
  }
  // Guard against zero variance (all-tied input) -> r = 0.
  const r = dx2 && dy2 ? num / Math.sqrt(dx2 * dy2) : 0;
  // 1e-15 avoids division by zero when |r| is exactly 1.
  const t = r * Math.sqrt((n - 2) / (1 - r * r + 1e-15));
  const df = n - 2;
  const p = df > 30 ? 2 * (1 - normalCDF(Math.abs(t))) : 2 * (1 - tCDF(Math.abs(t), df));
  return { r, p };
};
|
| 110 |
+
|
| 111 |
+
const normalCDF = (x) => {
|
| 112 |
+
const a1 = 0.254829592, a2 = -0.284496736, a3 = 1.421413741, a4 = -1.453152027, a5 = 1.061405429;
|
| 113 |
+
const p = 0.3275911, sign = x < 0 ? -1 : 1;
|
| 114 |
+
x = Math.abs(x) / Math.sqrt(2);
|
| 115 |
+
const t = 1.0 / (1.0 + p * x);
|
| 116 |
+
const y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * Math.exp(-x * x);
|
| 117 |
+
return 0.5 * (1.0 + sign * y);
|
| 118 |
+
};
|
| 119 |
+
|
| 120 |
+
const tCDF = (t, df) => 1 - 0.5 * incompleteBeta(df / 2, 0.5, df / (df + t * t));
|
| 121 |
+
|
| 122 |
+
const incompleteBeta = (a, b, x) => {
|
| 123 |
+
if (x === 0 || x === 1) return x;
|
| 124 |
+
const lnBeta = lgamma(a) + lgamma(b) - lgamma(a + b);
|
| 125 |
+
const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta);
|
| 126 |
+
let sum = 1, term = 1;
|
| 127 |
+
for (let n = 0; n < 200; n++) {
|
| 128 |
+
term *= (n === 0 ? 1 : (a + n - 1)) * x / (a + n);
|
| 129 |
+
if (n > 0) term *= (n - b) / n;
|
| 130 |
+
sum += term;
|
| 131 |
+
if (Math.abs(term) < 1e-10) break;
|
| 132 |
+
}
|
| 133 |
+
return front * sum / a;
|
| 134 |
+
};
|
| 135 |
+
|
| 136 |
+
const lgamma = (x) => {
|
| 137 |
+
const c = [76.18009172947146, -86.50532032941677, 24.01409824083091,
|
| 138 |
+
-1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5];
|
| 139 |
+
let y = x, tmp = x + 5.5;
|
| 140 |
+
tmp -= (x + 0.5) * Math.log(tmp);
|
| 141 |
+
let ser = 1.000000000190015;
|
| 142 |
+
for (let j = 0; j < 6; j++) ser += c[j] / ++y;
|
| 143 |
+
return -tmp + Math.log(2.5066282746310005 * ser / x);
|
| 144 |
+
};
|
| 145 |
+
|
| 146 |
+
// Benchmark descriptions for tooltips
|
| 147 |
+
const BENCH_DESC = {
|
| 148 |
+
'agg_score_macro': 'Mean of the six category aggregates (GK, RC, RES, NLU, MATH, TABLE).',
|
| 149 |
+
'agg_score_micro': 'Mean of all 12 individual benchmark scores.',
|
| 150 |
+
'agg_score_GK': 'Average of ARC Easy and MMLU Redux.',
|
| 151 |
+
'agg_score_RC': 'Average of SQuAD v2 and DROP.',
|
| 152 |
+
'agg_score_RES': 'Average of OpenBookQA and XCSQA.',
|
| 153 |
+
'agg_score_NLU': 'Average of WinoGrande, PIQA, and HellaSwag.',
|
| 154 |
+
'agg_score_MATH': 'Based on GSM8K alone.',
|
| 155 |
+
'agg_score_TABLE': 'Average of WikiTableQ and TriviaQA.',
|
| 156 |
+
'arc_cf:easy': 'Grade-school multiple-choice science questions testing knowledge and reasoning (AI2 Reasoning Challenge).',
|
| 157 |
+
'mmlu_redux_cf:_average': 'Re-annotated multitask benchmark covering 57 subjects from STEM to humanities (MMLU Redux).',
|
| 158 |
+
'squad_v2': 'Extractive reading comprehension on Wikipedia passages, including unanswerable questions (Stanford QA Dataset v2).',
|
| 159 |
+
'drop': 'Reading comprehension requiring discrete reasoning: counting, sorting, and arithmetic over paragraphs.',
|
| 160 |
+
'openbookqa_cf': 'Elementary science questions requiring multi-step reasoning beyond provided facts (OpenBookQA).',
|
| 161 |
+
'xcsqa_cf': 'Cross-lingual commonsense QA testing general world knowledge across 16 languages (X-CSQA).',
|
| 162 |
+
'winogrande_cf': 'Pronoun resolution problems testing commonsense reasoning, adversarially filtered to remove biases.',
|
| 163 |
+
'piqa_cf': 'Physical intuition QA: choosing the most plausible solution to everyday physical tasks (PIQA).',
|
| 164 |
+
'hellaswag_cf': 'Sentence completion testing commonsense inference, with adversarially crafted wrong endings (HellaSwag).',
|
| 165 |
+
'gsm8k': 'Grade-school math word problems requiring 2–8 steps of arithmetic reasoning (GSM8K).',
|
| 166 |
+
'wikitablequestions': 'Complex questions over Wikipedia tables requiring multi-step reasoning and aggregation.',
|
| 167 |
+
'treb_qa': 'Large-scale trivia QA requiring cross-sentence reasoning over evidence documents (TriviaQA).',
|
| 168 |
+
};
|
| 169 |
+
|
| 170 |
+
// Predictors: output, input, delta, improvement for each group
|
| 171 |
+
const PREDICTORS = [
|
| 172 |
+
{ key: 'output_dclm_score', label: 'Output DCLM', group: 'DCLM',
|
| 173 |
+
desc: 'Mean DCLM quality score of the rephrased (output) documents.' },
|
| 174 |
+
{ key: 'input_dclm_score', label: 'Input DCLM', group: 'DCLM',
|
| 175 |
+
desc: 'Mean DCLM quality score of the original (input) documents before rephrasing.' },
|
| 176 |
+
{ key: 'dclm_score_difference', label: 'DCLM Δ', group: 'DCLM',
|
| 177 |
+
desc: 'Absolute change in DCLM score: output minus input. Positive means the rephrasing increased perceived quality.' },
|
| 178 |
+
{ key: 'dclm_score_improvement', label: 'DCLM Improvement %', group: 'DCLM',
|
| 179 |
+
desc: 'Relative improvement in DCLM score: (output − input) / input. Measures the proportional quality gain from rephrasing.' },
|
| 180 |
+
{ key: 'output_edu_score', label: 'Output Edu', group: 'EDU',
|
| 181 |
+
desc: 'Mean FineWeb-Edu score of the rephrased (output) documents.' },
|
| 182 |
+
{ key: 'input_edu_score', label: 'Input Edu', group: 'EDU',
|
| 183 |
+
desc: 'Mean FineWeb-Edu score of the original (input) documents before rephrasing.' },
|
| 184 |
+
{ key: 'edu_score_difference', label: 'Edu Δ', group: 'EDU',
|
| 185 |
+
desc: 'Absolute change in Edu score: output minus input. Positive means the rephrasing increased educational value.' },
|
| 186 |
+
{ key: 'edu_score_improvement', label: 'Edu Improvement %', group: 'EDU',
|
| 187 |
+
desc: 'Relative improvement in Edu score: (output − input) / input. Measures the proportional educational quality gain from rephrasing.' },
|
| 188 |
+
];
|
| 189 |
+
|
| 190 |
+
// Targets: grouped so each agg is immediately left of its individual benchmarks
|
| 191 |
+
// Each group: { agg, individuals[] }
|
| 192 |
+
const GROUPS = [
|
| 193 |
+
{
|
| 194 |
+
name: 'Overall',
|
| 195 |
+
targets: [
|
| 196 |
+
{ key: 'agg_score_macro', label: 'Macro Avg', isAgg: true },
|
| 197 |
+
{ key: 'agg_score_micro', label: 'Micro Avg', isAgg: true },
|
| 198 |
+
]
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
name: 'General Knowledge',
|
| 202 |
+
targets: [
|
| 203 |
+
{ key: 'agg_score_GK', label: 'GK Agg', isAgg: true },
|
| 204 |
+
{ key: 'arc_cf:easy', label: 'ARC Easy', isAgg: false },
|
| 205 |
+
{ key: 'mmlu_redux_cf:_average', label: 'MMLU Redux', isAgg: false },
|
| 206 |
+
]
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
name: 'Reading Comp.',
|
| 210 |
+
targets: [
|
| 211 |
+
{ key: 'agg_score_RC', label: 'RC Agg', isAgg: true },
|
| 212 |
+
{ key: 'squad_v2', label: 'SQuAD v2', isAgg: false },
|
| 213 |
+
{ key: 'drop', label: 'DROP', isAgg: false },
|
| 214 |
+
]
|
| 215 |
+
},
|
| 216 |
+
{
|
| 217 |
+
name: 'Reasoning',
|
| 218 |
+
targets: [
|
| 219 |
+
{ key: 'agg_score_RES', label: 'RES Agg', isAgg: true },
|
| 220 |
+
{ key: 'openbookqa_cf', label: 'OpenBookQA', isAgg: false },
|
| 221 |
+
{ key: 'xcsqa_cf', label: 'XCSQA', isAgg: false },
|
| 222 |
+
]
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
name: 'NLU',
|
| 226 |
+
targets: [
|
| 227 |
+
{ key: 'agg_score_NLU', label: 'NLU Agg', isAgg: true },
|
| 228 |
+
{ key: 'winogrande_cf', label: 'WinoGrande', isAgg: false },
|
| 229 |
+
{ key: 'piqa_cf', label: 'PIQA', isAgg: false },
|
| 230 |
+
{ key: 'hellaswag_cf', label: 'HellaSwag', isAgg: false },
|
| 231 |
+
]
|
| 232 |
+
},
|
| 233 |
+
{
|
| 234 |
+
name: 'Math',
|
| 235 |
+
targets: [
|
| 236 |
+
{ key: 'agg_score_MATH', label: 'Math Agg', isAgg: true },
|
| 237 |
+
{ key: 'gsm8k', label: 'GSM8K', isAgg: false },
|
| 238 |
+
]
|
| 239 |
+
},
|
| 240 |
+
{
|
| 241 |
+
name: 'Table',
|
| 242 |
+
targets: [
|
| 243 |
+
{ key: 'agg_score_TABLE', label: 'Table Agg', isAgg: true },
|
| 244 |
+
{ key: 'wikitablequestions', label: 'WikiTableQ', isAgg: false },
|
| 245 |
+
{ key: 'treb_qa', label: 'TriviaQA', isAgg: false },
|
| 246 |
+
]
|
| 247 |
+
},
|
| 248 |
+
];
|
| 249 |
+
|
| 250 |
+
// Column targets flattened in display order, plus the number of DCLM-score
// predictor rows (used to place the gap/divider between the two row groups).
const ALL_TARGETS = [].concat(...GROUPS.map(grp => grp.targets));
const DCLM_COUNT = PREDICTORS.reduce((count, pred) => pred.group === 'DCLM' ? count + 1 : count, 0);
|
| 253 |
+
|
| 254 |
+
// Build the Spearman correlation matrix: one row-major entry per
// (predictor, target) pair, skipping experiments missing either value.
const matrix = PREDICTORS.flatMap(pred =>
  ALL_TARGETS.map(tgt => {
    // Collect the paired samples in a single pass over the raw experiments.
    const xs = [];
    const ys = [];
    for (const row of rawData) {
      const pv = row[pred.key];
      const tv = row.results[tgt.key];
      if (pv != null && tv != null) {
        xs.push(pv);
        ys.push(tv);
      }
    }
    const { r, p } = spearman(xs, ys);
    return {
      predictor: pred.key, predictorLabel: pred.label,
      target: tgt.key, targetLabel: tgt.label,
      isAgg: tgt.isAgg,
      desc: BENCH_DESC[tgt.key] || '',
      r, p, n: xs.length,
    };
  })
);
|
| 271 |
+
|
| 272 |
+
// Build the heatmap scaffolding: positioned container, shared tooltip node,
// and the responsive SVG root (render() sets its pixel size on each pass).
container.style.position = 'relative';

// One tooltip element is reused by every hover handler below.
const tip = Object.assign(document.createElement('div'), { className: 'd3-tooltip' });
container.appendChild(tip);

const svg = d3.select(container)
  .append('svg')
  .style('display', 'block')
  .attr('width', '100%');
|
| 282 |
+
|
| 283 |
+
// Redraw the entire heatmap from scratch (called initially and on resize).
// Reads `matrix`, `PREDICTORS`, `GROUPS`, `ALL_TARGETS`, `DCLM_COUNT`,
// `BENCH_DESC`, `tip`, `svg`, and `container` from the enclosing scope.
const render = () => {
  const width = container.clientWidth || 900;
  // Theme-dependent colors; re-read on every render so a theme toggle
  // followed by a resize picks up the new palette.
  const isDark = document.documentElement.getAttribute('data-theme') === 'dark';
  const divColor = isDark ? 'rgba(255,255,255,0.22)' : 'rgba(0,0,0,0.18)';
  const textCol = isDark ? 'rgba(255,255,255,0.8)' : 'rgba(0,0,0,0.7)';
  const mutedCol = isDark ? 'rgba(255,255,255,0.4)' : 'rgba(0,0,0,0.35)';

  const predLabels = PREDICTORS.map(p => p.label);

  // Layout
  const leftMargin = 140;
  const topMargin = 130; // extra room for two-tier header
  const rightMargin = 10;
  const bottomMargin = 10;
  // Cells scale with container width, clamped to a readable size range.
  const cellW = Math.max(30, Math.min(52, (width - leftMargin - rightMargin) / ALL_TARGETS.length));
  const cellH = Math.max(28, Math.min(42, cellW * 0.82));
  const plotW = cellW * ALL_TARGETS.length;
  const rowGap = 8; // gap between DCLM and EDU groups
  const plotH = cellH * predLabels.length + rowGap;
  const totalW = leftMargin + plotW + rightMargin;
  const totalH = topMargin + plotH + bottomMargin;

  svg.attr('width', totalW).attr('height', totalH);
  // Full teardown: everything is rebuilt below, so clear prior contents.
  svg.selectAll('*').remove();

  // Color scale: diverging, reversed so positive = blue
  // Wider domain (±0.85) so colors stay readable longer
  const colorScale = d3.scaleDiverging()
    .domain([-0.85, 0, 0.85])
    .interpolator(d3.interpolateRdBu)
    .clamp(true);
  // Negate r: RdBu maps low→red, high→blue, so -r makes positive ρ blue.
  const cellColor = (r) => colorScale(-r);

  const g = svg.append('g').attr('transform', `translate(${leftMargin},${topMargin})`);

  // --- Group dividers (vertical) and header labels ---
  let colOffset = 0;
  const groupHeaderY = 18; // top-level group name
  const colLabelY = topMargin - 6; // individual column labels
  // NOTE(review): colLabelY is unused — column labels are positioned via the
  // gColLabels group transform further down. Candidate for removal.

  GROUPS.forEach((grp, gi) => {
    const groupStartX = colOffset * cellW;
    const groupW = grp.targets.length * cellW;

    // Vertical divider before each group (except first)
    if (gi > 0) {
      // gi === 1 separates the aggregates column group from the per-task
      // groups, so it gets a solid, heavier line; later dividers are dashed.
      g.append('line')
        .attr('x1', groupStartX).attr('x2', groupStartX)
        .attr('y1', -4).attr('y2', plotH + 2)
        .attr('stroke', divColor)
        .attr('stroke-width', gi === 1 ? 1.5 : 1)
        .attr('stroke-dasharray', gi === 1 ? 'none' : '4,3');
    }

    // Group header label (top tier)
    svg.append('text')
      .attr('x', leftMargin + groupStartX + groupW / 2)
      .attr('y', groupHeaderY)
      .attr('text-anchor', 'middle')
      .attr('font-size', '9.5px')
      .attr('font-weight', '700')
      .attr('letter-spacing', '0.5px')
      .attr('fill', mutedCol)
      .text(grp.name.toUpperCase());

    // Bracket line under group header
    const bracketY = groupHeaderY + 8;
    svg.append('line')
      .attr('x1', leftMargin + groupStartX + 4)
      .attr('x2', leftMargin + groupStartX + groupW - 4)
      .attr('y1', bracketY).attr('y2', bracketY)
      .attr('stroke', mutedCol)
      .attr('stroke-width', 0.8);

    colOffset += grp.targets.length;
  });

  // Helper: y position for a predictor row, with gap after DCLM
  const rowY = (row) => row < DCLM_COUNT ? row * cellH : row * cellH + rowGap;

  // --- Horizontal divider between DCLM and EDU ---
  const divY = DCLM_COUNT * cellH + rowGap / 2;
  g.append('line')
    .attr('x1', -2).attr('x2', plotW + 2)
    .attr('y1', divY).attr('y2', divY)
    .attr('stroke', isDark ? 'rgba(255,255,255,0.45)' : 'rgba(0,0,0,0.35)')
    .attr('stroke-width', 2.5);

  // --- Draw cells ---
  // One <g> per matrix entry, positioned by looking up its row/column index.
  const cells = g.selectAll('g.cell')
    .data(matrix)
    .join('g')
    .attr('class', 'cell')
    .attr('transform', d => {
      const col = ALL_TARGETS.findIndex(t => t.key === d.target);
      const row = PREDICTORS.findIndex(p => p.key === d.predictor);
      return `translate(${col * cellW},${rowY(row)})`;
    });

  cells.append('rect')
    .attr('width', cellW - 1)
    .attr('height', cellH - 1)
    .attr('rx', 3)
    .attr('fill', d => cellColor(d.r))
    .attr('stroke', isDark ? 'rgba(255,255,255,0.06)' : 'rgba(0,0,0,0.04)')
    .attr('stroke-width', 0.5);

  // White text on strongly-colored cells, theme text color otherwise.
  const textFill = (r) => Math.abs(r) > 0.5 ? '#fff' : textCol;

  // Centered ρ value in each cell.
  cells.append('text')
    .attr('x', (cellW - 1) / 2)
    .attr('y', (cellH - 1) / 2)
    .attr('text-anchor', 'middle')
    .attr('dominant-baseline', 'central')
    .attr('font-size', Math.max(9, Math.min(12, cellW * 0.24)) + 'px')
    .attr('font-weight', d => Math.abs(d.r) > 0.4 ? '700' : '500')
    .attr('fill', d => textFill(d.r))
    .text(d => d.r.toFixed(2));

  // Significance markers
  cells.append('text')
    .attr('x', cellW - 3).attr('y', 10)
    .attr('text-anchor', 'end')
    .attr('font-size', '11px')
    .attr('font-weight', '700')
    .attr('fill', d => Math.abs(d.r) > 0.5 ? 'rgba(255,255,255,0.8)' : mutedCol)
    .text(d => d.p < 0.001 ? '***' : d.p < 0.01 ? '**' : d.p < 0.05 ? '*' : '');

  // --- Row labels (predictors, with hover descriptions) ---
  const gLabels = svg.append('g').attr('transform', `translate(${leftMargin - 8},${topMargin})`);
  PREDICTORS.forEach((pred, i) => {
    const labelG = gLabels.append('g')
      .style('cursor', 'help');

    labelG.append('text')
      .attr('x', 0).attr('y', rowY(i) + cellH / 2)
      .attr('text-anchor', 'end')
      .attr('dominant-baseline', 'central')
      .attr('font-size', '11px')
      .attr('fill', textCol)
      .attr('font-weight', '500')
      .text(pred.label);

    // Hit area
    labelG.append('rect')
      .attr('x', -leftMargin + 20).attr('y', rowY(i))
      .attr('width', leftMargin - 20).attr('height', cellH)
      .attr('fill', 'transparent');

    labelG.on('mouseenter', function(ev) {
      tip.innerHTML = `<div style="font-weight:700;font-size:13px;margin-bottom:4px;">${pred.label}</div><div style="font-size:12px;color:var(--muted-color);line-height:1.45;">${pred.desc}</div>`;
      tip.style.opacity = '1';
    })
    .on('mousemove', function(ev) {
      const [mx, my] = d3.pointer(ev, container);
      const bw = tip.offsetWidth || 260;
      // NOTE(review): bw is unused in this handler (only oy flips); other
      // handlers use it for horizontal flipping. Candidate for removal.
      const ox = 12;
      // Flip the tooltip above the cursor when it would overflow the bottom.
      const oy = (my + (tip.offsetHeight || 100) + 20 > totalH) ? -((tip.offsetHeight || 100) + 12) : 14;
      tip.style.transform = `translate(${Math.round(mx + ox)}px,${Math.round(my + oy)}px)`;
    })
    .on('mouseleave', function() {
      tip.style.opacity = '0';
      tip.style.transform = 'translate(-9999px,-9999px)';
    });
  });

  // --- Column labels (rotated, with hover descriptions) ---
  const gColLabels = svg.append('g').attr('transform', `translate(${leftMargin},${topMargin - 6})`);
  ALL_TARGETS.forEach((tgt, i) => {
    const labelG = gColLabels.append('g')
      .attr('transform', `translate(${i * cellW + cellW / 2},0)`)
      .style('cursor', BENCH_DESC[tgt.key] ? 'help' : 'default');

    labelG.append('text')
      .attr('x', 0).attr('y', 0)
      .attr('transform', 'rotate(-55)')
      .attr('text-anchor', 'start')
      .attr('font-size', '10px')
      .attr('fill', textCol)
      .attr('font-weight', tgt.isAgg ? '700' : '400')
      .text(tgt.label);

    if (BENCH_DESC[tgt.key]) {
      // Invisible hit area for easier hovering on rotated text
      labelG.append('rect')
        .attr('x', -cellW / 2).attr('y', -80)
        .attr('width', cellW).attr('height', 80)
        .attr('fill', 'transparent');

      labelG.on('mouseenter', function(ev) {
        tip.innerHTML = `<div style="font-weight:700;font-size:13px;margin-bottom:4px;">${tgt.label}</div><div style="font-size:12px;color:var(--muted-color);line-height:1.45;">${BENCH_DESC[tgt.key]}</div>`;
        tip.style.opacity = '1';
      })
      .on('mousemove', function(ev) {
        const [mx, my] = d3.pointer(ev, container);
        const bw = tip.offsetWidth || 260;
        // Flip the tooltip to the left when it would overflow the right edge.
        const ox = (mx + bw + 20 > totalW) ? -(bw + 12) : 12;
        tip.style.transform = `translate(${Math.round(mx + ox)}px,${Math.round(my + 14)}px)`;
      })
      .on('mouseleave', function() {
        tip.style.opacity = '0';
        tip.style.transform = 'translate(-9999px,-9999px)';
      });
    }
  });

  // --- Predictor group labels (vertical) ---
  // Vertical centers of the DCLM and EDU row bands, in svg coordinates.
  const dclmCenterY = topMargin + (rowY(0) + rowY(DCLM_COUNT - 1) + cellH) / 2;
  const eduCenterY = topMargin + (rowY(DCLM_COUNT) + rowY(PREDICTORS.length - 1) + cellH) / 2;
  const groupLabelX = 14;

  const GROUP_DESC = {
    'DCLM': 'DCLM score rates text quality on a 0–1 scale using a fastText classifier trained to distinguish curated, high-quality web data from random web crawls.',
    'EDU': 'FineWeb-Edu score rates educational value on a 0–5 scale using a classifier trained on LLM-annotated web pages, where higher scores indicate more instructive content.',
  };

  [['DCLM', dclmCenterY], ['EDU', eduCenterY]].forEach(([text, cy]) => {
    const labelG = svg.append('g').style('cursor', 'help');

    labelG.append('text')
      .attr('x', groupLabelX).attr('y', cy)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('font-size', '9px')
      .attr('font-weight', '700')
      .attr('letter-spacing', '1px')
      .attr('fill', isDark ? 'rgba(255,255,255,0.35)' : 'rgba(0,0,0,0.3)')
      .attr('transform', `rotate(-90, ${groupLabelX}, ${cy})`)
      .text(text);

    // Hit area for the rotated text
    // NOTE(review): half-height is based on DCLM_COUNT for both groups —
    // presumably fine when both groups have the same row count; confirm if
    // the EDU group can differ in size.
    const halfH = (DCLM_COUNT * cellH) / 2;
    labelG.append('rect')
      .attr('x', 0).attr('y', cy - halfH)
      .attr('width', 24).attr('height', halfH * 2)
      .attr('fill', 'transparent');

    labelG.on('mouseenter', function() {
      tip.innerHTML = `<div style="font-weight:700;font-size:13px;margin-bottom:4px;">${text} Score</div><div style="font-size:12px;color:var(--muted-color);line-height:1.45;">${GROUP_DESC[text]}</div>`;
      tip.style.opacity = '1';
    })
    .on('mousemove', function(ev) {
      const [mx, my] = d3.pointer(ev, container);
      const bw = tip.offsetWidth || 260;
      tip.style.transform = `translate(${Math.round(mx + 12)}px,${Math.round(my + 14)}px)`;
    })
    .on('mouseleave', function() {
      tip.style.opacity = '0';
      tip.style.transform = 'translate(-9999px,-9999px)';
    });
  });

  // --- Tooltip interactions ---
  cells.on('mouseenter', function(ev, d) {
    // Highlight the hovered cell with a stronger outline.
    d3.select(this).select('rect')
      .attr('stroke', isDark ? 'rgba(255,255,255,0.6)' : 'rgba(0,0,0,0.5)')
      .attr('stroke-width', 2);

    const sig = d.p < 0.001 ? 'p < 0.001 (***)' : d.p < 0.01 ? `p = ${d.p.toFixed(3)} (**)` : d.p < 0.05 ? `p = ${d.p.toFixed(3)} (*)` : `p = ${d.p.toFixed(3)}`;
    const descHtml = d.desc ? `<div style="margin-top:6px;padding-top:6px;border-top:1px solid var(--border-color);font-size:11px;color:var(--muted-color);line-height:1.4;">${d.desc}</div>` : '';
    tip.innerHTML = `
      <div style="font-weight:700;font-size:13px;margin-bottom:4px;">${d.predictorLabel} → ${d.targetLabel}</div>
      <div style="display:grid;grid-template-columns:auto 1fr;gap:2px 10px;font-size:12px;">
        <span style="color:var(--muted-color);">Spearman ρ</span><span style="font-weight:700;">${d.r.toFixed(4)}</span>
        <span style="color:var(--muted-color);">Significance</span><span>${sig}</span>
        <span style="color:var(--muted-color);">N</span><span>${d.n} experiments</span>
      </div>${descHtml}`;
    tip.style.opacity = '1';
  })
  .on('mousemove', function(ev) {
    const [mx, my] = d3.pointer(ev, container);
    const bw = tip.offsetWidth || 260;
    const bh = tip.offsetHeight || 120;
    // Flip horizontally/vertically when the tooltip would overflow the chart.
    const ox = (mx + bw + 20 > totalW) ? -(bw + 12) : 12;
    const oy = (my + bh + 20 > totalH) ? -(bh + 12) : 14;
    tip.style.transform = `translate(${Math.round(mx + ox)}px,${Math.round(my + oy)}px)`;
  })
  .on('mouseleave', function() {
    // Restore the default cell outline and hide the tooltip off-screen.
    d3.select(this).select('rect')
      .attr('stroke', isDark ? 'rgba(255,255,255,0.06)' : 'rgba(0,0,0,0.04)')
      .attr('stroke-width', 0.5);
    tip.style.opacity = '0';
    tip.style.transform = 'translate(-9999px,-9999px)';
  });
};
|
| 568 |
+
|
| 569 |
+
// Initial draw, then re-render whenever the container resizes.
// ResizeObserver tracks the element itself; window 'resize' is the fallback
// for browsers without it.
render();
if (!window.ResizeObserver) {
  window.addEventListener('resize', render);
} else {
  new ResizeObserver(render).observe(container);
}
|
| 572 |
+
|
| 573 |
+
// Legend
// Static swatch legend appended once below the heatmap (not rebuilt on resize).
const legend = document.createElement('div');
legend.className = 'legend';
// Duplicate of render()'s diverging scale (reversed RdBu, clamped at ±0.85)
// so swatch colors match the cell colors exactly.
const cs = d3.scaleDiverging().domain([-0.85, 0, 0.85]).interpolator(d3.interpolateRdBu).clamp(true);
const sw = (r) => cs(-r);
legend.innerHTML = `
  <div class="legend-title">Legend</div>
  <div class="items">
    <span class="item"><span class="swatch" style="background:${sw(-0.6)};"></span><span>ρ = −0.6</span></span>
    <span class="item"><span class="swatch" style="background:${sw(-0.3)};"></span><span>ρ = −0.3</span></span>
    <span class="item"><span class="swatch" style="background:${sw(0)};"></span><span>ρ = 0</span></span>
    <span class="item"><span class="swatch" style="background:${sw(0.3)};"></span><span>ρ = +0.3</span></span>
    <span class="item"><span class="swatch" style="background:${sw(0.6)};"></span><span>ρ = +0.6</span></span>
    <span style="margin-left:12px;font-size:11px;color:var(--muted-color);">*** p<0.001 ** p<0.01 * p<0.05</span>
  </div>`;
container.appendChild(legend);
|
| 589 |
+
}
|
| 590 |
+
};
|
| 591 |
+
|
| 592 |
+
// Kick off the chart once the DOM is ready: defer until DOMContentLoaded when
// the document is still loading, otherwise start immediately.
const start = () => ensureD3(bootstrap);
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', start, { once: true });
} else {
  start();
}
|
| 595 |
+
})();
|
| 596 |
+
</script>
|