joelniklaus HF Staff commited on
Commit
5ea1a40
·
1 Parent(s): a160373

add dclm/edu score correlation analysis

Browse files
app/src/content/analysis/score_correlation.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Analyze whether edu-score or DCLM-score predict downstream benchmark performance."""
2
+
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from scipy import stats
10
+
11
# Log bare messages (no level/timestamp prefix) so the tables below align cleanly.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)

# Per-experiment rephrasing metadata, resolved relative to this file.
DATA_PATH = Path(__file__).parent / "assets/data/rephrasing_metadata.json"

# Individual benchmarks (not aggregates)
INDIVIDUAL_BENCHMARKS = [
    "squad_v2", "arc_cf:easy", "hellaswag_cf", "mmlu_redux_cf:_average",
    "gsm8k", "drop", "wikitablequestions", "treb_qa",
    "winogrande_cf", "piqa_cf", "openbookqa_cf", "xcsqa_cf",
]

# Aggregate scores
AGG_SCORES = [
    "agg_score_GK", "agg_score_RC", "agg_score_RES",
    "agg_score_NLU", "agg_score_MATH", "agg_score_TABLE",
    "agg_score_macro", "agg_score_micro",
]

# Every correlation target: 12 individual benchmarks plus 8 aggregates.
ALL_TARGETS = INDIVIDUAL_BENCHMARKS + AGG_SCORES

# Score predictors to test (both input and output variants)
PREDICTORS = [
    "input_edu_score", "output_edu_score", "edu_score_difference", "edu_score_improvement",
    "input_dclm_score", "output_dclm_score", "dclm_score_difference", "dclm_score_improvement",
]
37
+
38
+
39
def load_data() -> pd.DataFrame:
    """Load rephrasing metadata and flatten results into columns.

    Each JSON entry carries a nested ``results`` mapping (benchmark name ->
    score); those keys are hoisted to top-level columns alongside the
    remaining metadata fields.
    """
    with open(DATA_PATH) as f:
        raw = json.load(f)

    records = []
    for entry in raw:
        flat = dict(entry)
        # Replace the nested "results" dict with its own key/value pairs.
        nested = flat.pop("results")
        flat.update(nested)
        records.append(flat)

    return pd.DataFrame(records)
51
+
52
+
53
def compute_correlations(df: pd.DataFrame) -> pd.DataFrame:
    """Compute Pearson and Spearman correlations between predictors and targets.

    Iterates every (predictor, target) pair from the module-level PREDICTORS
    and ALL_TARGETS lists and returns one row per pair with both correlation
    coefficients, their p-values, and the sample size.

    Robustness fixes over the original:
    - a predictor or target column absent from the metadata dump is skipped
      instead of raising ``KeyError`` and killing the whole analysis;
    - values are coerced to float (non-numeric entries become NaN and are
      dropped with the NaN mask) rather than assuming numeric dtype.

    Args:
        df: Flattened experiment table as produced by ``load_data``.

    Returns:
        DataFrame with columns: predictor, target, is_aggregate, pearson_r,
        pearson_p, spearman_r, spearman_p, n.
    """
    results = []
    for predictor in PREDICTORS:
        for target in ALL_TARGETS:
            # Skip columns missing from this dump instead of crashing.
            if predictor not in df.columns or target not in df.columns:
                continue

            x = pd.to_numeric(df[predictor], errors="coerce").to_numpy(dtype=float)
            y = pd.to_numeric(df[target], errors="coerce").to_numpy(dtype=float)

            # Drop NaN pairs
            mask = ~(np.isnan(x) | np.isnan(y))
            x, y = x[mask], y[mask]

            # Correlations on fewer than 5 points are meaningless noise.
            if len(x) < 5:
                continue

            pearson_r, pearson_p = stats.pearsonr(x, y)
            spearman_r, spearman_p = stats.spearmanr(x, y)

            results.append({
                "predictor": predictor,
                "target": target,
                "is_aggregate": target in AGG_SCORES,
                "pearson_r": pearson_r,
                "pearson_p": pearson_p,
                "spearman_r": spearman_r,
                "spearman_p": spearman_p,
                "n": len(x),
            })

    return pd.DataFrame(results)
83
+
84
+
85
def print_correlation_table(corr_df: pd.DataFrame, title: str, sort_by: str = "spearman_r") -> None:
    """Print a formatted correlation table sorted by absolute correlation.

    Bug fix: ``sort_by`` was previously accepted but ignored — the table was
    always ordered by ``|spearman_r|``. It now selects which column's absolute
    value orders the rows; the default preserves the original behavior.

    Args:
        corr_df: Output of ``compute_correlations`` (or a filtered view of it).
        title: Heading printed above the table.
        sort_by: Column of ``corr_df`` whose absolute value orders the rows.
    """
    logger.info(f"\n{'='*90}")
    logger.info(f" {title}")
    logger.info(f"{'='*90}")

    df = corr_df.copy()
    # Sort on a temporary |sort_by| column so sign does not affect ranking.
    df["_abs_sort"] = df[sort_by].abs()
    df = df.sort_values("_abs_sort", ascending=False)

    logger.info(f"{'Predictor':<28} {'Target':<28} {'Pearson r':>10} {'p':>10} {'Spearman r':>10} {'p':>10}")
    logger.info("-" * 98)

    for _, row in df.iterrows():
        # Conventional significance stars based on the Spearman p-value.
        sig_marker = ""
        if row["spearman_p"] < 0.001:
            sig_marker = "***"
        elif row["spearman_p"] < 0.01:
            sig_marker = "**"
        elif row["spearman_p"] < 0.05:
            sig_marker = "*"

        logger.info(
            f"{row['predictor']:<28} {row['target']:<28} "
            f"{row['pearson_r']:>9.4f} {row['pearson_p']:>10.4f} "
            f"{row['spearman_r']:>9.4f} {row['spearman_p']:>10.4f} {sig_marker}"
        )
112
+
113
+
114
def main() -> None:
    """Run the full correlation analysis and write results to JSON.

    Loads the rephrasing metadata, computes all predictor/target correlations,
    logs several human-readable summary tables, and saves the heatmap data
    next to this file for the D3 visualization.
    """
    df = load_data()
    logger.info(f"Loaded {len(df)} experiments")

    corr_df = compute_correlations(df)

    # 1. Overall best predictors for agg_score_macro
    macro_corr = corr_df[corr_df["target"] == "agg_score_macro"]
    print_correlation_table(macro_corr, "Correlations with agg_score_macro")

    # 2. Best predictors for each aggregate score
    agg_corr = corr_df[corr_df["is_aggregate"]]
    print_correlation_table(agg_corr, "All predictor-aggregate correlations")

    # 3. Best predictors for individual benchmarks
    indiv_corr = corr_df[~corr_df["is_aggregate"]]
    print_correlation_table(indiv_corr, "All predictor-individual benchmark correlations")

    # 4. Summary: for each predictor, which targets correlate best?
    logger.info(f"\n{'='*90}")
    logger.info(" Summary: Best target for each predictor (by |Spearman r|)")
    logger.info(f"{'='*90}")
    for predictor in PREDICTORS:
        # Top-3 targets per predictor, ranked by absolute Spearman rho.
        sub = corr_df[corr_df["predictor"] == predictor].copy()
        sub["abs_spearman"] = sub["spearman_r"].abs()
        best = sub.sort_values("abs_spearman", ascending=False).head(3)
        logger.info(f"\n {predictor}:")
        for _, row in best.iterrows():
            sig = "***" if row["spearman_p"] < 0.001 else ("**" if row["spearman_p"] < 0.01 else ("*" if row["spearman_p"] < 0.05 else ""))
            logger.info(f" {row['target']:<28} r={row['spearman_r']:>7.4f} p={row['spearman_p']:.4f} {sig}")

    # 5. Summary: for each target, which predictor correlates best?
    logger.info(f"\n{'='*90}")
    logger.info(" Summary: Best predictor for each target (by |Spearman r|)")
    logger.info(f"{'='*90}")
    for target in ALL_TARGETS:
        # Single best predictor per target, again by |Spearman rho|.
        sub = corr_df[corr_df["target"] == target].copy()
        sub["abs_spearman"] = sub["spearman_r"].abs()
        best = sub.sort_values("abs_spearman", ascending=False).iloc[0]
        sig = "***" if best["spearman_p"] < 0.001 else ("**" if best["spearman_p"] < 0.01 else ("*" if best["spearman_p"] < 0.05 else ""))
        logger.info(
            f" {target:<28} <- {best['predictor']:<28} r={best['spearman_r']:>7.4f} p={best['spearman_p']:.4f} {sig}"
        )

    # 6. Heatmap data: pivot table of Spearman correlations
    logger.info(f"\n{'='*90}")
    logger.info(" Spearman correlation heatmap (predictor x target)")
    logger.info(f"{'='*90}")
    # NOTE: `pivot` raises if (predictor, target) pairs are duplicated —
    # compute_correlations emits each pair at most once, so this is safe.
    pivot = corr_df.pivot(index="predictor", columns="target", values="spearman_r")
    # Reorder rows/columns to match the declared PREDICTORS/ALL_TARGETS order.
    pivot = pivot.loc[PREDICTORS, ALL_TARGETS]
    logger.info(pivot.round(3).to_string())

    # Save heatmap data for potential D3 visualization
    output_path = Path(__file__).parent / "score_correlation_results.json"
    output = {
        "heatmap": {
            "predictors": PREDICTORS,
            "targets": ALL_TARGETS,
            "spearman_r": pivot.values.tolist(),
            "individual_benchmarks": INDIVIDUAL_BENCHMARKS,
            "aggregate_scores": AGG_SCORES,
        },
        "correlations": corr_df.to_dict(orient="records"),
    }
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2)
    logger.info(f"\nSaved results to {output_path}")
182
+
183
+
184
# Allow running this analysis directly: `python score_correlation.py`.
if __name__ == "__main__":
    main()
app/src/content/analysis/score_correlation_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb63ac51a4ac538702c4943266c5848d8f8257fdc34268e95e1d3ce6e1847012
3
+ size 52825
app/src/content/chapters/analyses.mdx CHANGED
@@ -1,20 +1,38 @@
1
  import HtmlEmbed from "../../components/HtmlEmbed.astro";
2
  import FigRef from "../../components/FigRef.astro";
 
3
 
4
  ## Analyses
5
 
6
- Our final experiment explores an even more counterintuitive finding.
7
 
8
- {/*
9
 
10
  ### Does edu-score or DCLM-score predict model performance?
11
 
12
- Running these ablations is super expensive. So we were looking for informative proxies that can predict whether a certain dataset will result in better downstream benchmark performance. Since the FineWeb-Edu-score and DCLM-score work well for human data, we surmised it could also work for synthetic data.
 
 
 
 
13
 
14
- TODO: Run this analysis and add a small report
15
 
 
 
16
  */}
17
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ### Math Rephrasing: When "Worse" Outputs Win
19
 
20
  We compared two ~1.7B parameter models for generating math word problems: SmolLM2 and Qwen3. SmolLM2's outputs looked objectively worse, yet models trained on them performed better.
 
1
  import HtmlEmbed from "../../components/HtmlEmbed.astro";
2
  import FigRef from "../../components/FigRef.astro";
3
+ import Wide from "../../components/Wide.astro";
4
 
5
  ## Analyses
6
 
7
+ TODO: Add entry and exit paragraph
8
 
 
9
 
10
  ### Does edu-score or DCLM-score predict model performance?
11
 
12
+ Running these ablations is super expensive. So we were looking for informative proxies that can predict whether a certain dataset will result in better downstream benchmark performance. Since the FineWeb-Edu-score and DCLM-score work well for human data, we thought they might also work for synthetic data.
13
+
14
+ We computed Spearman rank correlations between various edu-score and DCLM-score metrics (input scores, output scores, score differences, and relative improvements) and all downstream benchmark results across our 65 experiments. <FigRef target="score-correlation" /> shows the full correlation matrix.
15
+
16
+ **DCLM-score is a moderate predictor of aggregate performance.** The output DCLM-score shows the strongest correlation with `agg_score_macro` (ρ = 0.55, p {'<'} 0.001), and DCLM-score difference (output minus input) is similarly predictive (ρ = 0.52). These are moderate correlations at best. The DCLM-score variants are particularly predictive for table understanding (ρ = 0.52–0.55) and reading comprehension (ρ = 0.45–0.47).
17
 
18
+ **Edu-score tells a more nuanced story.** The input edu-score (the score of the original data before rephrasing) correlates with aggregate performance (ρ = 0.43), but the output edu-score (the score of the rephrased data) barely correlates at all (ρ = 0.21, not significant). This suggests that starting with higher-quality source data matters, but the edu-score of the synthetic output is not a reliable proxy.
19
 
20
+ {/*
21
+ **The HellaSwag/PIQA anomaly deserves a closer look.** Edu-score improvement shows strong *positive* correlations with HellaSwag (ρ = 0.60) and PIQA (ρ = 0.58), while being *negatively* correlated with math (ρ = −0.39) and reading comprehension (ρ = −0.30). We investigated whether this was a confound from prompt type (FAQ and tutorial prompts both increase edu-scores and might independently help NLU). The correlation survives partial correlation controlling for prompt type (ρ = 0.65 for HellaSwag, ρ = 0.56 for PIQA, both p {'<'} 0.001) and for model size within the Gemma family (ρ = 0.60 and 0.68). So the effect is real. However, the practical magnitude is tiny: HellaSwag scores range from 0.066 to 0.092 across all 65 experiments (CV = 5.8%), compared to `agg_score_macro` ranging from 0.096 to 0.172 (CV = 10.5%). The edu-score captures something about sentence-completion and physical-intuition quality, but the absolute differences are so small that optimizing for it would be chasing noise.
22
  */}
23
 
24
+ **Neither score is a reliable universal proxy.** WinoGrande shows essentially zero correlation with any predictor. The strongest individual correlations (ρ ≈ 0.55–0.60) are still only moderate, explaining roughly 30% of the variance at best. **For synthetic data, there is no shortcut: you have to train models and evaluate them.**
25
+
26
+ <Wide>
27
+ <HtmlEmbed
28
+ id="score-correlation"
29
+ src="score-correlation.html"
30
+ data="rephrasing_metadata.json"
31
+ desc="Spearman rank correlations between quality score metrics and downstream benchmark performance across 65 rephrasing experiments. Blue cells indicate positive correlations, red cells negative. Significance: *** p<0.001, ** p<0.01, * p<0.05."
32
+ />
33
+ </Wide>
34
+
35
+
36
  ### Math Rephrasing: When "Worse" Outputs Win
37
 
38
  We compared two ~1.7B parameter models for generating math word problems: SmolLM2 and Qwen3. SmolLM2's outputs looked objectively worse, yet models trained on them performed better.
app/src/content/chapters/experiments.mdx CHANGED
@@ -4,13 +4,10 @@ import Sidenote from "../../components/Sidenote.astro";
4
  import Glossary from "../../components/Glossary.astro";
5
  import FigRef from "../../components/FigRef.astro";
6
 
7
- {/* TODO: think about what dataset to build and release as artifact: do more rephrasing with smollm2 */}
8
  {/* TODO: shorten the vllm inference benchmark or put stuff into the appendix */}
9
  {/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
10
- {/* TODO: add a plot for the table with the benchmark results */}
11
- {/* TODO: Analyze if certain models are more verbose than others (how many tokens did they produce per prompt?) */}
12
- {/* TODO: Run dclm and edu score impact analysis on model verbosity data (wait for last rephrasing job to be done) */}
13
- {/* TODO: Add appendix section of weird unexplainable results? */}
14
 
15
  ## Experiments
16
 
 
4
  import Glossary from "../../components/Glossary.astro";
5
  import FigRef from "../../components/FigRef.astro";
6
 
7
+ {/* TODO: mention the currently running finephrase rephrasing with smollm2 */}
8
  {/* TODO: shorten the vllm inference benchmark or put stuff into the appendix */}
9
  {/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
10
+ {/* TODO: Check if we have more information in the rephrasing_metadata that we can use to do analyses */}
 
 
 
11
 
12
  ## Experiments
13
 
app/src/content/embeds/score-correlation.html ADDED
@@ -0,0 +1,596 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="d3-score-correlation" style="width:100%;margin:10px 0;min-height:400px;"></div>
2
+ <style>
3
+ .d3-score-correlation { font-family: system-ui, -apple-system, sans-serif; }
4
+ .d3-score-correlation .d3-tooltip {
5
+ position: absolute; top: 0; left: 0;
6
+ transform: translate(-9999px, -9999px);
7
+ pointer-events: none;
8
+ padding: 10px 14px; border-radius: 10px;
9
+ font-size: 12px; line-height: 1.4;
10
+ border: 1px solid var(--border-color);
11
+ background: var(--surface-bg); color: var(--text-color);
12
+ box-shadow: 0 6px 24px rgba(0,0,0,.22);
13
+ opacity: 0; transition: opacity .12s ease;
14
+ z-index: 20; max-width: 300px;
15
+ }
16
+ .d3-score-correlation .legend {
17
+ display: flex; flex-direction: column; align-items: flex-start; gap: 6px;
18
+ margin-top: 8px;
19
+ }
20
+ .d3-score-correlation .legend-title {
21
+ font-size: 12px; font-weight: 700; color: var(--text-color);
22
+ }
23
+ .d3-score-correlation .legend .items {
24
+ display: flex; flex-wrap: wrap; gap: 4px 12px; align-items: center;
25
+ }
26
+ .d3-score-correlation .legend .item {
27
+ display: inline-flex; align-items: center; gap: 5px; font-size: 11px; color: var(--text-color);
28
+ }
29
+ .d3-score-correlation .legend .swatch {
30
+ width: 20px; height: 14px; border-radius: 3px; border: 1px solid var(--border-color);
31
+ }
32
+ </style>
33
+ <script>
34
+ (() => {
35
+ const ensureD3 = (cb) => {
36
+ if (window.d3 && typeof window.d3.select === 'function') return cb();
37
+ let s = document.getElementById('d3-cdn-script');
38
+ if (!s) { s = document.createElement('script'); s.id = 'd3-cdn-script'; s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js'; document.head.appendChild(s); }
39
+ const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
40
+ s.addEventListener('load', onReady, { once: true });
41
+ if (window.d3) onReady();
42
+ };
43
+
44
+ const bootstrap = () => {
45
+ const scriptEl = document.currentScript;
46
+ let container = scriptEl ? scriptEl.previousElementSibling : null;
47
+ while (container && !(container.classList && container.classList.contains('d3-score-correlation'))) {
48
+ container = container.previousElementSibling;
49
+ }
50
+ if (!container) {
51
+ const cs = Array.from(document.querySelectorAll('.d3-score-correlation'))
52
+ .filter(el => !(el.dataset && el.dataset.mounted === 'true'));
53
+ container = cs[cs.length - 1] || null;
54
+ }
55
+ if (!container) return;
56
+ if (container.dataset.mounted === 'true') return;
57
+ container.dataset.mounted = 'true';
58
+
59
+ let mountEl = container;
60
+ while (mountEl && !mountEl.getAttribute?.('data-datafiles')) mountEl = mountEl.parentElement;
61
+ const dataAttr = mountEl?.getAttribute?.('data-datafiles');
62
+ const dataPaths = dataAttr
63
+ ? [dataAttr.includes('/') ? dataAttr : `/data/${dataAttr}`]
64
+ : ['/data/rephrasing_metadata.json', './assets/data/rephrasing_metadata.json'];
65
+
66
+ const fetchFirst = async (paths) => {
67
+ for (const p of paths) {
68
+ try { const r = await fetch(p, { cache: 'no-cache' }); if (r.ok) return r.json(); } catch(_) {}
69
+ }
70
+ throw new Error('Data not found');
71
+ };
72
+
73
+ fetchFirst(dataPaths).then(data => buildChart(data)).catch(err => {
74
+ container.innerHTML = `<pre style="color:red;padding:12px;">Error: ${err.message}</pre>`;
75
+ });
76
+
77
+ function buildChart(rawData) {
78
+ // Spearman correlation helpers
79
// Return the 1-based ranks of `arr`, averaging ranks across ties —
// the same convention the Python side's scipy.stats.spearmanr uses.
const rankArray = (arr) => {
  const indexed = arr.map((v, i) => ({ v, i })).sort((a, b) => a.v - b.v);
  const ranks = new Array(arr.length);
  let i = 0;
  while (i < indexed.length) {
    // Advance j to the end of the run of values equal to indexed[i].v.
    let j = i;
    while (j < indexed.length && indexed[j].v === indexed[i].v) j++;
    // Mean of the 1-based ranks i+1 .. j simplifies to (i + j + 1) / 2.
    const avgRank = (i + j + 1) / 2;
    for (let k = i; k < j; k++) ranks[indexed[k].i] = avgRank;
    i = j;
  }
  return ranks;
};
92
+
93
// Spearman rank correlation of x and y with a two-sided p-value.
// Returns {r, p}; fewer than 5 points yields {r: 0, p: 1}, mirroring the
// Python analysis which skips such pairs.
const spearman = (x, y) => {
  const n = x.length;
  if (n < 5) return { r: 0, p: 1 };
  // Spearman's rho = Pearson correlation of the two rank vectors.
  const rx = rankArray(x), ry = rankArray(y);
  const mx = rx.reduce((a, b) => a + b, 0) / n;
  const my = ry.reduce((a, b) => a + b, 0) / n;
  let num = 0, dx2 = 0, dy2 = 0;
  for (let i = 0; i < n; i++) {
    const dx = rx[i] - mx, dy = ry[i] - my;
    num += dx * dy; dx2 += dx * dx; dy2 += dy * dy;
  }
  // Zero variance in either rank vector => undefined rho; report 0.
  const r = dx2 && dy2 ? num / Math.sqrt(dx2 * dy2) : 0;
  // t-statistic for rho; the 1e-15 guards the division when |r| is exactly 1.
  const t = r * Math.sqrt((n - 2) / (1 - r * r + 1e-15));
  const df = n - 2;
  // Two-sided p: normal approximation for df > 30, Student-t CDF otherwise.
  const p = df > 30 ? 2 * (1 - normalCDF(Math.abs(t))) : 2 * (1 - tCDF(Math.abs(t), df));
  return { r, p };
};
110
+
111
// Standard normal CDF via the Abramowitz & Stegun 7.1.26 erf
// approximation (absolute error ~1.5e-7 — plenty for star thresholds).
const normalCDF = (x) => {
  const a1 = 0.254829592, a2 = -0.284496736, a3 = 1.421413741, a4 = -1.453152027, a5 = 1.061405429;
  const p = 0.3275911, sign = x < 0 ? -1 : 1;
  x = Math.abs(x) / Math.sqrt(2);
  const t = 1.0 / (1.0 + p * x);
  const y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * Math.exp(-x * x);
  return 0.5 * (1.0 + sign * y);
};

// Student-t CDF for t >= 0 using the incomplete-beta identity
// P(T <= t) = 1 - 0.5 * I_{df/(df+t^2)}(df/2, 1/2).
const tCDF = (t, df) => 1 - 0.5 * incompleteBeta(df / 2, 0.5, df / (df + t * t));

// Regularized incomplete beta I_x(a, b), series form capped at 200 terms.
// NOTE(review): accuracy not verified against scipy.special.betainc —
// used here only to bucket p-values into significance stars.
const incompleteBeta = (a, b, x) => {
  if (x === 0 || x === 1) return x;
  const lnBeta = lgamma(a) + lgamma(b) - lgamma(a + b);
  // Work in log space to avoid overflow/underflow in x^a (1-x)^b / B(a,b).
  const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta);
  let sum = 1, term = 1;
  for (let n = 0; n < 200; n++) {
    term *= (n === 0 ? 1 : (a + n - 1)) * x / (a + n);
    if (n > 0) term *= (n - b) / n;
    sum += term;
    // Stop once terms are negligible.
    if (Math.abs(term) < 1e-10) break;
  }
  return front * sum / a;
};

// Log-gamma via the 6-coefficient Lanczos approximation
// (the classic Numerical Recipes `gammln` constants).
const lgamma = (x) => {
  const c = [76.18009172947146, -86.50532032941677, 24.01409824083091,
    -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5];
  let y = x, tmp = x + 5.5;
  tmp -= (x + 0.5) * Math.log(tmp);
  let ser = 1.000000000190015;
  for (let j = 0; j < 6; j++) ser += c[j] / ++y;
  return -tmp + Math.log(2.5066282746310005 * ser / x);
};
145
+
146
+ // Benchmark descriptions for tooltips
147
+ const BENCH_DESC = {
148
+ 'agg_score_macro': 'Mean of the six category aggregates (GK, RC, RES, NLU, MATH, TABLE).',
149
+ 'agg_score_micro': 'Mean of all 12 individual benchmark scores.',
150
+ 'agg_score_GK': 'Average of ARC Easy and MMLU Redux.',
151
+ 'agg_score_RC': 'Average of SQuAD v2 and DROP.',
152
+ 'agg_score_RES': 'Average of OpenBookQA and XCSQA.',
153
+ 'agg_score_NLU': 'Average of WinoGrande, PIQA, and HellaSwag.',
154
+ 'agg_score_MATH': 'Based on GSM8K alone.',
155
+ 'agg_score_TABLE': 'Average of WikiTableQ and TriviaQA.',
156
+ 'arc_cf:easy': 'Grade-school multiple-choice science questions testing knowledge and reasoning (AI2 Reasoning Challenge).',
157
+ 'mmlu_redux_cf:_average': 'Re-annotated multitask benchmark covering 57 subjects from STEM to humanities (MMLU Redux).',
158
+ 'squad_v2': 'Extractive reading comprehension on Wikipedia passages, including unanswerable questions (Stanford QA Dataset v2).',
159
+ 'drop': 'Reading comprehension requiring discrete reasoning: counting, sorting, and arithmetic over paragraphs.',
160
+ 'openbookqa_cf': 'Elementary science questions requiring multi-step reasoning beyond provided facts (OpenBookQA).',
161
+ 'xcsqa_cf': 'Cross-lingual commonsense QA testing general world knowledge across 16 languages (X-CSQA).',
162
+ 'winogrande_cf': 'Pronoun resolution problems testing commonsense reasoning, adversarially filtered to remove biases.',
163
+ 'piqa_cf': 'Physical intuition QA: choosing the most plausible solution to everyday physical tasks (PIQA).',
164
+ 'hellaswag_cf': 'Sentence completion testing commonsense inference, with adversarially crafted wrong endings (HellaSwag).',
165
+ 'gsm8k': 'Grade-school math word problems requiring 2–8 steps of arithmetic reasoning (GSM8K).',
166
+ 'wikitablequestions': 'Complex questions over Wikipedia tables requiring multi-step reasoning and aggregation.',
167
+ 'treb_qa': 'Large-scale trivia QA requiring cross-sentence reasoning over evidence documents (TriviaQA).',
168
+ };
169
+
170
+ // Predictors: output, input, delta, improvement for each group
171
+ const PREDICTORS = [
172
+ { key: 'output_dclm_score', label: 'Output DCLM', group: 'DCLM',
173
+ desc: 'Mean DCLM quality score of the rephrased (output) documents.' },
174
+ { key: 'input_dclm_score', label: 'Input DCLM', group: 'DCLM',
175
+ desc: 'Mean DCLM quality score of the original (input) documents before rephrasing.' },
176
+ { key: 'dclm_score_difference', label: 'DCLM Δ', group: 'DCLM',
177
+ desc: 'Absolute change in DCLM score: output minus input. Positive means the rephrasing increased perceived quality.' },
178
+ { key: 'dclm_score_improvement', label: 'DCLM Improvement %', group: 'DCLM',
179
+ desc: 'Relative improvement in DCLM score: (output − input) / input. Measures the proportional quality gain from rephrasing.' },
180
+ { key: 'output_edu_score', label: 'Output Edu', group: 'EDU',
181
+ desc: 'Mean FineWeb-Edu score of the rephrased (output) documents.' },
182
+ { key: 'input_edu_score', label: 'Input Edu', group: 'EDU',
183
+ desc: 'Mean FineWeb-Edu score of the original (input) documents before rephrasing.' },
184
+ { key: 'edu_score_difference', label: 'Edu Δ', group: 'EDU',
185
+ desc: 'Absolute change in Edu score: output minus input. Positive means the rephrasing increased educational value.' },
186
+ { key: 'edu_score_improvement', label: 'Edu Improvement %', group: 'EDU',
187
+ desc: 'Relative improvement in Edu score: (output − input) / input. Measures the proportional educational quality gain from rephrasing.' },
188
+ ];
189
+
190
+ // Targets: grouped so each agg is immediately left of its individual benchmarks
191
+ // Each group: { agg, individuals[] }
192
+ const GROUPS = [
193
+ {
194
+ name: 'Overall',
195
+ targets: [
196
+ { key: 'agg_score_macro', label: 'Macro Avg', isAgg: true },
197
+ { key: 'agg_score_micro', label: 'Micro Avg', isAgg: true },
198
+ ]
199
+ },
200
+ {
201
+ name: 'General Knowledge',
202
+ targets: [
203
+ { key: 'agg_score_GK', label: 'GK Agg', isAgg: true },
204
+ { key: 'arc_cf:easy', label: 'ARC Easy', isAgg: false },
205
+ { key: 'mmlu_redux_cf:_average', label: 'MMLU Redux', isAgg: false },
206
+ ]
207
+ },
208
+ {
209
+ name: 'Reading Comp.',
210
+ targets: [
211
+ { key: 'agg_score_RC', label: 'RC Agg', isAgg: true },
212
+ { key: 'squad_v2', label: 'SQuAD v2', isAgg: false },
213
+ { key: 'drop', label: 'DROP', isAgg: false },
214
+ ]
215
+ },
216
+ {
217
+ name: 'Reasoning',
218
+ targets: [
219
+ { key: 'agg_score_RES', label: 'RES Agg', isAgg: true },
220
+ { key: 'openbookqa_cf', label: 'OpenBookQA', isAgg: false },
221
+ { key: 'xcsqa_cf', label: 'XCSQA', isAgg: false },
222
+ ]
223
+ },
224
+ {
225
+ name: 'NLU',
226
+ targets: [
227
+ { key: 'agg_score_NLU', label: 'NLU Agg', isAgg: true },
228
+ { key: 'winogrande_cf', label: 'WinoGrande', isAgg: false },
229
+ { key: 'piqa_cf', label: 'PIQA', isAgg: false },
230
+ { key: 'hellaswag_cf', label: 'HellaSwag', isAgg: false },
231
+ ]
232
+ },
233
+ {
234
+ name: 'Math',
235
+ targets: [
236
+ { key: 'agg_score_MATH', label: 'Math Agg', isAgg: true },
237
+ { key: 'gsm8k', label: 'GSM8K', isAgg: false },
238
+ ]
239
+ },
240
+ {
241
+ name: 'Table',
242
+ targets: [
243
+ { key: 'agg_score_TABLE', label: 'Table Agg', isAgg: true },
244
+ { key: 'wikitablequestions', label: 'WikiTableQ', isAgg: false },
245
+ { key: 'treb_qa', label: 'TriviaQA', isAgg: false },
246
+ ]
247
+ },
248
+ ];
249
+
250
+ // Flatten targets in display order
251
+ const ALL_TARGETS = GROUPS.flatMap(g => g.targets);
252
+ const DCLM_COUNT = PREDICTORS.filter(p => p.group === 'DCLM').length;
253
+
254
+ // Compute correlation matrix
255
+ const matrix = [];
256
+ for (const pred of PREDICTORS) {
257
+ for (const tgt of ALL_TARGETS) {
258
+ const pairs = rawData
259
+ .filter(d => d[pred.key] != null && d.results[tgt.key] != null)
260
+ .map(d => [d[pred.key], d.results[tgt.key]]);
261
+ const { r, p } = spearman(pairs.map(p => p[0]), pairs.map(p => p[1]));
262
+ matrix.push({
263
+ predictor: pred.key, predictorLabel: pred.label,
264
+ target: tgt.key, targetLabel: tgt.label,
265
+ isAgg: tgt.isAgg,
266
+ desc: BENCH_DESC[tgt.key] || '',
267
+ r, p, n: pairs.length,
268
+ });
269
+ }
270
+ }
271
+
272
+ // Build the heatmap
273
+ container.style.position = 'relative';
274
+
275
+ const tip = document.createElement('div');
276
+ tip.className = 'd3-tooltip';
277
+ container.appendChild(tip);
278
+
279
+ const svg = d3.select(container).append('svg')
280
+ .attr('width', '100%')
281
+ .style('display', 'block');
282
+
283
+ const render = () => {
284
+ const width = container.clientWidth || 900;
285
+ const isDark = document.documentElement.getAttribute('data-theme') === 'dark';
286
+ const divColor = isDark ? 'rgba(255,255,255,0.22)' : 'rgba(0,0,0,0.18)';
287
+ const textCol = isDark ? 'rgba(255,255,255,0.8)' : 'rgba(0,0,0,0.7)';
288
+ const mutedCol = isDark ? 'rgba(255,255,255,0.4)' : 'rgba(0,0,0,0.35)';
289
+
290
+ const predLabels = PREDICTORS.map(p => p.label);
291
+
292
+ // Layout
293
+ const leftMargin = 140;
294
+ const topMargin = 130; // extra room for two-tier header
295
+ const rightMargin = 10;
296
+ const bottomMargin = 10;
297
+ const cellW = Math.max(30, Math.min(52, (width - leftMargin - rightMargin) / ALL_TARGETS.length));
298
+ const cellH = Math.max(28, Math.min(42, cellW * 0.82));
299
+ const plotW = cellW * ALL_TARGETS.length;
300
+ const rowGap = 8; // gap between DCLM and EDU groups
301
+ const plotH = cellH * predLabels.length + rowGap;
302
+ const totalW = leftMargin + plotW + rightMargin;
303
+ const totalH = topMargin + plotH + bottomMargin;
304
+
305
+ svg.attr('width', totalW).attr('height', totalH);
306
+ svg.selectAll('*').remove();
307
+
308
+ // Color scale: diverging, reversed so positive = blue
309
+ // Wider domain (±0.85) so colors stay readable longer
310
+ const colorScale = d3.scaleDiverging()
311
+ .domain([-0.85, 0, 0.85])
312
+ .interpolator(d3.interpolateRdBu)
313
+ .clamp(true);
314
+ const cellColor = (r) => colorScale(-r);
315
+
316
+ const g = svg.append('g').attr('transform', `translate(${leftMargin},${topMargin})`);
317
+
318
+ // --- Group dividers (vertical) and header labels ---
319
+ let colOffset = 0;
320
+ const groupHeaderY = 18; // top-level group name
321
+ const colLabelY = topMargin - 6; // individual column labels
322
+
323
+ GROUPS.forEach((grp, gi) => {
324
+ const groupStartX = colOffset * cellW;
325
+ const groupW = grp.targets.length * cellW;
326
+
327
+ // Vertical divider before each group (except first)
328
+ if (gi > 0) {
329
+ g.append('line')
330
+ .attr('x1', groupStartX).attr('x2', groupStartX)
331
+ .attr('y1', -4).attr('y2', plotH + 2)
332
+ .attr('stroke', divColor)
333
+ .attr('stroke-width', gi === 1 ? 1.5 : 1)
334
+ .attr('stroke-dasharray', gi === 1 ? 'none' : '4,3');
335
+ }
336
+
337
+ // Group header label (top tier)
338
+ svg.append('text')
339
+ .attr('x', leftMargin + groupStartX + groupW / 2)
340
+ .attr('y', groupHeaderY)
341
+ .attr('text-anchor', 'middle')
342
+ .attr('font-size', '9.5px')
343
+ .attr('font-weight', '700')
344
+ .attr('letter-spacing', '0.5px')
345
+ .attr('fill', mutedCol)
346
+ .text(grp.name.toUpperCase());
347
+
348
+ // Bracket line under group header
349
+ const bracketY = groupHeaderY + 8;
350
+ svg.append('line')
351
+ .attr('x1', leftMargin + groupStartX + 4)
352
+ .attr('x2', leftMargin + groupStartX + groupW - 4)
353
+ .attr('y1', bracketY).attr('y2', bracketY)
354
+ .attr('stroke', mutedCol)
355
+ .attr('stroke-width', 0.8);
356
+
357
+ colOffset += grp.targets.length;
358
+ });
359
+
360
+ // Helper: y position for a predictor row, with gap after DCLM
361
+ const rowY = (row) => row < DCLM_COUNT ? row * cellH : row * cellH + rowGap;
362
+
363
+ // --- Horizontal divider between DCLM and EDU ---
364
+ const divY = DCLM_COUNT * cellH + rowGap / 2;
365
+ g.append('line')
366
+ .attr('x1', -2).attr('x2', plotW + 2)
367
+ .attr('y1', divY).attr('y2', divY)
368
+ .attr('stroke', isDark ? 'rgba(255,255,255,0.45)' : 'rgba(0,0,0,0.35)')
369
+ .attr('stroke-width', 2.5);
370
+
371
+ // --- Draw cells ---
372
+ const cells = g.selectAll('g.cell')
373
+ .data(matrix)
374
+ .join('g')
375
+ .attr('class', 'cell')
376
+ .attr('transform', d => {
377
+ const col = ALL_TARGETS.findIndex(t => t.key === d.target);
378
+ const row = PREDICTORS.findIndex(p => p.key === d.predictor);
379
+ return `translate(${col * cellW},${rowY(row)})`;
380
+ });
381
+
382
// Cell backgrounds, coloured by Spearman rho.
cells.append('rect')
  .attr('rx', 3)
  .attr('width', cellW - 1)
  .attr('height', cellH - 1)
  .attr('stroke-width', 0.5)
  .attr('stroke', isDark ? 'rgba(255,255,255,0.06)' : 'rgba(0,0,0,0.04)')
  .attr('fill', d => cellColor(d.r));

// Strongly coloured cells get white text so the value stays readable.
const textFill = (r) => {
  if (Math.abs(r) > 0.5) return '#fff';
  return textCol;
};

// Rho value label, centred in the cell; font scales with cell width.
const valueFont = Math.max(9, Math.min(12, cellW * 0.24)) + 'px';
cells.append('text')
  .attr('x', (cellW - 1) / 2)
  .attr('y', (cellH - 1) / 2)
  .attr('text-anchor', 'middle')
  .attr('dominant-baseline', 'central')
  .attr('font-size', valueFont)
  .attr('font-weight', d => (Math.abs(d.r) > 0.4 ? '700' : '500'))
  .attr('fill', d => textFill(d.r))
  .text(d => d.r.toFixed(2));

// Significance stars in the top-right corner of each cell.
const stars = (p) => {
  if (p < 0.001) return '***';
  if (p < 0.01) return '**';
  if (p < 0.05) return '*';
  return '';
};
cells.append('text')
  .attr('x', cellW - 3).attr('y', 10)
  .attr('text-anchor', 'end')
  .attr('font-size', '11px')
  .attr('font-weight', '700')
  .attr('fill', d => (Math.abs(d.r) > 0.5 ? 'rgba(255,255,255,0.8)' : mutedCol))
  .text(d => stars(d.p));
410
+
411
// --- Row labels (predictors, with hover descriptions) ---
// One label per predictor on the left edge; hovering shows the
// predictor's description in the shared tooltip element.
const gLabels = svg.append('g').attr('transform', `translate(${leftMargin - 8},${topMargin})`);
PREDICTORS.forEach((pred, i) => {
  const labelG = gLabels.append('g')
    .style('cursor', 'help');

  labelG.append('text')
    .attr('x', 0).attr('y', rowY(i) + cellH / 2)
    .attr('text-anchor', 'end')
    .attr('dominant-baseline', 'central')
    .attr('font-size', '11px')
    .attr('fill', textCol)
    .attr('font-weight', '500')
    .text(pred.label);

  // Invisible hit area spanning the full label gutter, so the hover
  // target is larger than the text itself.
  labelG.append('rect')
    .attr('x', -leftMargin + 20).attr('y', rowY(i))
    .attr('width', leftMargin - 20).attr('height', cellH)
    .attr('fill', 'transparent');

  labelG.on('mouseenter', function() {
    tip.innerHTML = `<div style="font-weight:700;font-size:13px;margin-bottom:4px;">${pred.label}</div><div style="font-size:12px;color:var(--muted-color);line-height:1.45;">${pred.desc}</div>`;
    tip.style.opacity = '1';
  })
  .on('mousemove', function(ev) {
    const [mx, my] = d3.pointer(ev, container);
    // Fix: removed an unused `tip.offsetWidth` local and compute the
    // tooltip height once instead of twice. Near the bottom edge the
    // tooltip flips above the cursor.
    const th = tip.offsetHeight || 100;
    const ox = 12;
    const oy = (my + th + 20 > totalH) ? -(th + 12) : 14;
    tip.style.transform = `translate(${Math.round(mx + ox)}px,${Math.round(my + oy)}px)`;
  })
  .on('mouseleave', function() {
    tip.style.opacity = '0';
    tip.style.transform = 'translate(-9999px,-9999px)';
  });
});
448
+
449
// --- Column labels: rotated benchmark names along the top edge ---
// Targets with an entry in BENCH_DESC additionally get a hover tooltip.
const gColLabels = svg.append('g').attr('transform', `translate(${leftMargin},${topMargin - 6})`);
ALL_TARGETS.forEach((tgt, i) => {
  const desc = BENCH_DESC[tgt.key];
  const labelG = gColLabels.append('g')
    .attr('transform', `translate(${i * cellW + cellW / 2},0)`)
    .style('cursor', desc ? 'help' : 'default');

  labelG.append('text')
    .attr('x', 0).attr('y', 0)
    .attr('transform', 'rotate(-55)')
    .attr('text-anchor', 'start')
    .attr('font-size', '10px')
    .attr('fill', textCol)
    .attr('font-weight', tgt.isAgg ? '700' : '400')
    .text(tgt.label);

  // Guard clause: no tooltip wiring for targets without a description.
  if (!desc) return;

  // Invisible hit area so the rotated text is easy to hover.
  labelG.append('rect')
    .attr('x', -cellW / 2).attr('y', -80)
    .attr('width', cellW).attr('height', 80)
    .attr('fill', 'transparent');

  labelG.on('mouseenter', function() {
    tip.innerHTML = `<div style="font-weight:700;font-size:13px;margin-bottom:4px;">${tgt.label}</div><div style="font-size:12px;color:var(--muted-color);line-height:1.45;">${desc}</div>`;
    tip.style.opacity = '1';
  })
  .on('mousemove', function(ev) {
    const [mx, my] = d3.pointer(ev, container);
    const bw = tip.offsetWidth || 260;
    const ox = (mx + bw + 20 > totalW) ? -(bw + 12) : 12;
    tip.style.transform = `translate(${Math.round(mx + ox)}px,${Math.round(my + 14)}px)`;
  })
  .on('mouseleave', function() {
    tip.style.opacity = '0';
    tip.style.transform = 'translate(-9999px,-9999px)';
  });
});
488
+
489
// --- Predictor group labels (vertical "DCLM" / "EDU" along the far left) ---
const dclmCenterY = topMargin + (rowY(0) + rowY(DCLM_COUNT - 1) + cellH) / 2;
const eduCenterY = topMargin + (rowY(DCLM_COUNT) + rowY(PREDICTORS.length - 1) + cellH) / 2;
const groupLabelX = 14;

// Hover copy explaining what each score family measures.
const GROUP_DESC = {
  'DCLM': 'DCLM score rates text quality on a 0–1 scale using a fastText classifier trained to distinguish curated, high-quality web data from random web crawls.',
  'EDU': 'FineWeb-Edu score rates educational value on a 0–5 scale using a classifier trained on LLM-annotated web pages, where higher scores indicate more instructive content.',
};

[['DCLM', dclmCenterY], ['EDU', eduCenterY]].forEach(([text, cy]) => {
  const labelG = svg.append('g').style('cursor', 'help');

  labelG.append('text')
    .attr('x', groupLabelX).attr('y', cy)
    .attr('text-anchor', 'middle')
    .attr('dominant-baseline', 'central')
    .attr('font-size', '9px')
    .attr('font-weight', '700')
    .attr('letter-spacing', '1px')
    .attr('fill', isDark ? 'rgba(255,255,255,0.35)' : 'rgba(0,0,0,0.3)')
    .attr('transform', `rotate(-90, ${groupLabelX}, ${cy})`)
    .text(text);

  // Hit area for the rotated text.
  // NOTE(review): halfH is derived from DCLM_COUNT for BOTH groups; if the
  // EDU group ever has a different row count its hit area will be mis-sized —
  // confirm DCLM_COUNT === PREDICTORS.length - DCLM_COUNT.
  const halfH = (DCLM_COUNT * cellH) / 2;
  labelG.append('rect')
    .attr('x', 0).attr('y', cy - halfH)
    .attr('width', 24).attr('height', halfH * 2)
    .attr('fill', 'transparent');

  labelG.on('mouseenter', function() {
    tip.innerHTML = `<div style="font-weight:700;font-size:13px;margin-bottom:4px;">${text} Score</div><div style="font-size:12px;color:var(--muted-color);line-height:1.45;">${GROUP_DESC[text]}</div>`;
    tip.style.opacity = '1';
  })
  .on('mousemove', function(ev) {
    // Fix: removed an unused `tip.offsetWidth` local — this handler uses a
    // fixed +12px offset and never consulted the tooltip width.
    const [mx, my] = d3.pointer(ev, container);
    tip.style.transform = `translate(${Math.round(mx + 12)}px,${Math.round(my + 14)}px)`;
  })
  .on('mouseleave', function() {
    tip.style.opacity = '0';
    tip.style.transform = 'translate(-9999px,-9999px)';
  });
});
534
+
535
// --- Tooltip interactions ---
// Hovering a cell emphasizes its border and shows a tooltip with the exact
// Spearman rho, the significance level, and the sample size for that pair.
cells.on('mouseenter', function(ev, d) {
  // Highlight the hovered cell's outline.
  d3.select(this).select('rect')
    .attr('stroke', isDark ? 'rgba(255,255,255,0.6)' : 'rgba(0,0,0,0.5)')
    .attr('stroke-width', 2);

  // Human-readable significance string, with star markers below p = 0.05.
  const sig = d.p < 0.001 ? 'p < 0.001 (***)' : d.p < 0.01 ? `p = ${d.p.toFixed(3)} (**)` : d.p < 0.05 ? `p = ${d.p.toFixed(3)} (*)` : `p = ${d.p.toFixed(3)}`;
  // Optional extra description block, only when the datum carries one.
  const descHtml = d.desc ? `<div style="margin-top:6px;padding-top:6px;border-top:1px solid var(--border-color);font-size:11px;color:var(--muted-color);line-height:1.4;">${d.desc}</div>` : '';
  tip.innerHTML = `
    <div style="font-weight:700;font-size:13px;margin-bottom:4px;">${d.predictorLabel} → ${d.targetLabel}</div>
    <div style="display:grid;grid-template-columns:auto 1fr;gap:2px 10px;font-size:12px;">
      <span style="color:var(--muted-color);">Spearman ρ</span><span style="font-weight:700;">${d.r.toFixed(4)}</span>
      <span style="color:var(--muted-color);">Significance</span><span>${sig}</span>
      <span style="color:var(--muted-color);">N</span><span>${d.n} experiments</span>
    </div>${descHtml}`;
  tip.style.opacity = '1';
})
.on('mousemove', function(ev) {
  // Keep the tooltip on-screen: flip left / up when it would overflow
  // the right / bottom edge of the container.
  const [mx, my] = d3.pointer(ev, container);
  const bw = tip.offsetWidth || 260;
  const bh = tip.offsetHeight || 120;
  const ox = (mx + bw + 20 > totalW) ? -(bw + 12) : 12;
  const oy = (my + bh + 20 > totalH) ? -(bh + 12) : 14;
  tip.style.transform = `translate(${Math.round(mx + ox)}px,${Math.round(my + oy)}px)`;
})
.on('mouseleave', function() {
  // Restore the resting border and park the tooltip off-screen.
  d3.select(this).select('rect')
    .attr('stroke', isDark ? 'rgba(255,255,255,0.06)' : 'rgba(0,0,0,0.04)')
    .attr('stroke-width', 0.5);
  tip.style.opacity = '0';
  tip.style.transform = 'translate(-9999px,-9999px)';
});
567
+ };
568
+
569
// Initial draw, then re-render whenever the container resizes
// (ResizeObserver where available, window 'resize' as a fallback).
render();
if (window.ResizeObserver) { new ResizeObserver(() => render()).observe(container); }
else { window.addEventListener('resize', render); }

// Legend: static colour swatches for the diverging rho scale plus the
// significance-star key. Built once here, outside render(), so it is not
// recreated on every resize.
const legend = document.createElement('div');
legend.className = 'legend';
// Diverging RdBu scale clamped to [-0.85, 0.85]; sw(r) negates r so that
// positive rho maps to the warm end — presumably mirroring the cell colour
// scale used in render() (confirm against cellColor).
const cs = d3.scaleDiverging().domain([-0.85, 0, 0.85]).interpolator(d3.interpolateRdBu).clamp(true);
const sw = (r) => cs(-r);
legend.innerHTML = `
  <div class="legend-title">Legend</div>
  <div class="items">
    <span class="item"><span class="swatch" style="background:${sw(-0.6)};"></span><span>ρ = −0.6</span></span>
    <span class="item"><span class="swatch" style="background:${sw(-0.3)};"></span><span>ρ = −0.3</span></span>
    <span class="item"><span class="swatch" style="background:${sw(0)};"></span><span>ρ = 0</span></span>
    <span class="item"><span class="swatch" style="background:${sw(0.3)};"></span><span>ρ = +0.3</span></span>
    <span class="item"><span class="swatch" style="background:${sw(0.6)};"></span><span>ρ = +0.6</span></span>
    <span style="margin-left:12px;font-size:11px;color:var(--muted-color);">*** p&lt;0.001 &nbsp; ** p&lt;0.01 &nbsp; * p&lt;0.05</span>
  </div>`;
container.appendChild(legend);
589
+ }
590
+ };
591
+
592
// Kick off once the DOM is parsed; run immediately if it already is.
const start = () => ensureD3(bootstrap);
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', start, { once: true });
} else {
  start();
}
595
+ })();
596
+ </script>