Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Commit ·
5ea1a40
1
Parent(s): a160373
add dclm/edu score correlation analysis
Browse files
app/src/content/analysis/score_correlation.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Analyze whether edu-score or DCLM-score predict downstream benchmark performance."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from scipy import stats
|
| 10 |
+
|
| 11 |
+
# Plain-message logging: this script's output IS the report, so no timestamps/levels.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)

# Metadata for all rephrasing experiments, resolved relative to this module.
DATA_PATH = Path(__file__).parent / "assets/data/rephrasing_metadata.json"

# Individual benchmarks (not aggregates)
# NOTE(review): "treb_qa" appears to correspond to TriviaQA elsewhere in the
# project — confirm the key spelling against the metadata file.
INDIVIDUAL_BENCHMARKS = [
    "squad_v2", "arc_cf:easy", "hellaswag_cf", "mmlu_redux_cf:_average",
    "gsm8k", "drop", "wikitablequestions", "treb_qa",
    "winogrande_cf", "piqa_cf", "openbookqa_cf", "xcsqa_cf",
]

# Aggregate scores (category averages plus macro/micro overall averages).
AGG_SCORES = [
    "agg_score_GK", "agg_score_RC", "agg_score_RES",
    "agg_score_NLU", "agg_score_MATH", "agg_score_TABLE",
    "agg_score_macro", "agg_score_micro",
]

# Every correlation target: individual benchmarks followed by aggregates.
ALL_TARGETS = INDIVIDUAL_BENCHMARKS + AGG_SCORES

# Score predictors to test (both input and output variants):
# raw scores of source/rephrased data, absolute difference, and relative improvement.
PREDICTORS = [
    "input_edu_score", "output_edu_score", "edu_score_difference", "edu_score_improvement",
    "input_dclm_score", "output_dclm_score", "dclm_score_difference", "dclm_score_improvement",
]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def load_data(path: "Path | str | None" = None) -> pd.DataFrame:
    """Load rephrasing metadata and flatten per-experiment results into columns.

    Each JSON entry is a dict of experiment metadata containing a nested
    ``"results"`` dict of benchmark scores; the nested scores are promoted to
    top-level columns so predictors and targets live side by side in one frame.

    Args:
        path: JSON file to read. Defaults to the module-level ``DATA_PATH``,
            preserving the original call signature (``load_data()``).

    Returns:
        A DataFrame with one row per experiment; metadata fields and benchmark
        scores are flattened into columns, and the raw ``"results"`` key is
        dropped.
    """
    data_path = DATA_PATH if path is None else Path(path)
    raw = json.loads(data_path.read_text())

    rows = []
    for entry in raw:
        # Keep every metadata field except the nested results container...
        row = {k: v for k, v in entry.items() if k != "results"}
        # ...then merge the benchmark scores up to the top level.
        row.update(entry["results"])
        rows.append(row)

    return pd.DataFrame(rows)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def compute_correlations(
    df: pd.DataFrame,
    predictors: "list[str] | None" = None,
    targets: "list[str] | None" = None,
    agg_scores: "set[str] | None" = None,
    min_samples: int = 5,
) -> pd.DataFrame:
    """Compute Pearson and Spearman correlations between predictors and targets.

    Improvements over the original: columns absent from ``df`` are skipped
    instead of raising ``KeyError``, and values are coerced to float so
    object-dtype columns no longer break ``np.isnan``. The predictor/target
    lists are parameterized (defaulting to the module-level constants) so the
    analysis can be reused on other score sets.

    Args:
        df: One row per experiment, with predictor and target columns.
        predictors: Predictor column names; defaults to ``PREDICTORS``.
        targets: Target column names; defaults to ``ALL_TARGETS``.
        agg_scores: Targets flagged as aggregates; defaults to ``AGG_SCORES``.
        min_samples: Minimum number of valid (non-NaN) pairs required to
            report a correlation. Defaults to the original hard-coded 5.

    Returns:
        One row per (predictor, target) pair with Pearson/Spearman r and p,
        an ``is_aggregate`` flag, and the sample count ``n``.
    """
    predictors = PREDICTORS if predictors is None else predictors
    targets = ALL_TARGETS if targets is None else targets
    agg_set = set(AGG_SCORES) if agg_scores is None else set(agg_scores)

    results = []
    for predictor in predictors:
        for target in targets:
            # Guard: skip pairs whose columns are missing rather than crashing.
            if predictor not in df.columns or target not in df.columns:
                continue

            # Coerce to float so NaN masking works even on object-dtype columns.
            x = pd.to_numeric(df[predictor], errors="coerce").to_numpy(dtype=float)
            y = pd.to_numeric(df[target], errors="coerce").to_numpy(dtype=float)

            # Drop NaN pairs
            mask = ~(np.isnan(x) | np.isnan(y))
            x, y = x[mask], y[mask]

            if len(x) < min_samples:
                continue

            pearson_r, pearson_p = stats.pearsonr(x, y)
            spearman_r, spearman_p = stats.spearmanr(x, y)

            results.append({
                "predictor": predictor,
                "target": target,
                "is_aggregate": target in agg_set,
                "pearson_r": pearson_r,
                "pearson_p": pearson_p,
                "spearman_r": spearman_r,
                "spearman_p": spearman_p,
                "n": len(x),
            })

    return pd.DataFrame(results)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def print_correlation_table(corr_df: pd.DataFrame, title: str, sort_by: str = "spearman_r") -> None:
    """Log a formatted correlation table sorted by absolute correlation strength.

    Fixes a defect: the ``sort_by`` parameter was accepted but ignored — the
    table was always sorted by ``|spearman_r|``. It now sorts by the absolute
    value of the requested column; the default preserves the old behavior.

    Args:
        corr_df: Output of ``compute_correlations`` (one row per pair).
        title: Heading printed above the table.
        sort_by: Column whose absolute value determines row order
            (e.g. ``"spearman_r"`` or ``"pearson_r"``).
    """
    logger.info(f"\n{'='*90}")
    logger.info(f" {title}")
    logger.info(f"{'='*90}")

    df = corr_df.copy()
    # Sort by magnitude of the requested correlation column (sort_by was
    # previously unused — this is the actual fix).
    df["abs_sort"] = df[sort_by].abs()
    df = df.sort_values("abs_sort", ascending=False)

    logger.info(f"{'Predictor':<28} {'Target':<28} {'Pearson r':>10} {'p':>10} {'Spearman r':>10} {'p':>10}")
    logger.info("-" * 98)

    for _, row in df.iterrows():
        # Conventional significance stars for the Spearman p-value.
        if row["spearman_p"] < 0.001:
            sig_marker = "***"
        elif row["spearman_p"] < 0.01:
            sig_marker = "**"
        elif row["spearman_p"] < 0.05:
            sig_marker = "*"
        else:
            sig_marker = ""

        logger.info(
            f"{row['predictor']:<28} {row['target']:<28} "
            f"{row['pearson_r']:>9.4f} {row['pearson_p']:>10.4f} "
            f"{row['spearman_r']:>9.4f} {row['spearman_p']:>10.4f} {sig_marker}"
        )
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def main() -> None:
    """Run the full correlation analysis: load data, correlate, report, save.

    Produces six console report sections plus a JSON artifact
    (``score_correlation_results.json``) consumed by the D3 heatmap embed.
    """
    df = load_data()
    logger.info(f"Loaded {len(df)} experiments")

    # All (predictor, target) correlation rows used by every section below.
    corr_df = compute_correlations(df)

    # 1. Overall best predictors for agg_score_macro
    macro_corr = corr_df[corr_df["target"] == "agg_score_macro"]
    print_correlation_table(macro_corr, "Correlations with agg_score_macro")

    # 2. Best predictors for each aggregate score
    agg_corr = corr_df[corr_df["is_aggregate"]]
    print_correlation_table(agg_corr, "All predictor-aggregate correlations")

    # 3. Best predictors for individual benchmarks
    indiv_corr = corr_df[~corr_df["is_aggregate"]]
    print_correlation_table(indiv_corr, "All predictor-individual benchmark correlations")

    # 4. Summary: for each predictor, which targets correlate best?
    logger.info(f"\n{'='*90}")
    logger.info(" Summary: Best target for each predictor (by |Spearman r|)")
    logger.info(f"{'='*90}")
    for predictor in PREDICTORS:
        sub = corr_df[corr_df["predictor"] == predictor].copy()
        sub["abs_spearman"] = sub["spearman_r"].abs()
        # Top 3 targets by correlation magnitude (sign-agnostic).
        best = sub.sort_values("abs_spearman", ascending=False).head(3)
        logger.info(f"\n {predictor}:")
        for _, row in best.iterrows():
            # Significance stars: *** p<0.001, ** p<0.01, * p<0.05.
            sig = "***" if row["spearman_p"] < 0.001 else ("**" if row["spearman_p"] < 0.01 else ("*" if row["spearman_p"] < 0.05 else ""))
            logger.info(f" {row['target']:<28} r={row['spearman_r']:>7.4f} p={row['spearman_p']:.4f} {sig}")

    # 5. Summary: for each target, which predictor correlates best?
    logger.info(f"\n{'='*90}")
    logger.info(" Summary: Best predictor for each target (by |Spearman r|)")
    logger.info(f"{'='*90}")
    for target in ALL_TARGETS:
        sub = corr_df[corr_df["target"] == target].copy()
        sub["abs_spearman"] = sub["spearman_r"].abs()
        # Single strongest predictor for this target.
        best = sub.sort_values("abs_spearman", ascending=False).iloc[0]
        sig = "***" if best["spearman_p"] < 0.001 else ("**" if best["spearman_p"] < 0.01 else ("*" if best["spearman_p"] < 0.05 else ""))
        logger.info(
            f" {target:<28} <- {best['predictor']:<28} r={best['spearman_r']:>7.4f} p={best['spearman_p']:.4f} {sig}"
        )

    # 6. Heatmap data: pivot table of Spearman correlations
    logger.info(f"\n{'='*90}")
    logger.info(" Spearman correlation heatmap (predictor x target)")
    logger.info(f"{'='*90}")
    pivot = corr_df.pivot(index="predictor", columns="target", values="spearman_r")
    # Reorder rows/columns to the canonical constant order for stable output.
    pivot = pivot.loc[PREDICTORS, ALL_TARGETS]
    logger.info(pivot.round(3).to_string())

    # Save heatmap data for potential D3 visualization
    output_path = Path(__file__).parent / "score_correlation_results.json"
    output = {
        "heatmap": {
            "predictors": PREDICTORS,
            "targets": ALL_TARGETS,
            # Row-major matrix matching the predictors x targets ordering above.
            "spearman_r": pivot.values.tolist(),
            "individual_benchmarks": INDIVIDUAL_BENCHMARKS,
            "aggregate_scores": AGG_SCORES,
        },
        "correlations": corr_df.to_dict(orient="records"),
    }
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2)
    logger.info(f"\nSaved results to {output_path}")
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# Script entry point: run the full correlation analysis and save results.
if __name__ == "__main__":
    main()
|
app/src/content/analysis/score_correlation_results.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb63ac51a4ac538702c4943266c5848d8f8257fdc34268e95e1d3ce6e1847012
|
| 3 |
+
size 52825
|
app/src/content/chapters/analyses.mdx
CHANGED
|
@@ -1,20 +1,38 @@
|
|
| 1 |
import HtmlEmbed from "../../components/HtmlEmbed.astro";
|
| 2 |
import FigRef from "../../components/FigRef.astro";
|
|
|
|
| 3 |
|
| 4 |
## Analyses
|
| 5 |
|
| 6 |
-
|
| 7 |
|
| 8 |
-
{/*
|
| 9 |
|
| 10 |
### Does edu-score or DCLM-score predict model performance?
|
| 11 |
|
| 12 |
-
Running these ablations is super expensive. So we were looking for informative proxies that can predict whether a certain dataset will result in better downstream benchmark performance. Since the FineWeb-Edu-score and DCLM-score work well for human data, we
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
|
|
|
|
|
|
|
| 16 |
*/}
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
### Math Rephrasing: When "Worse" Outputs Win
|
| 19 |
|
| 20 |
We compared two ~1.7B parameter models for generating math word problems: SmolLM2 and Qwen3. SmolLM2's outputs looked objectively worse, yet models trained on them performed better.
|
|
|
|
| 1 |
import HtmlEmbed from "../../components/HtmlEmbed.astro";
|
| 2 |
import FigRef from "../../components/FigRef.astro";
|
| 3 |
+
import Wide from "../../components/Wide.astro";
|
| 4 |
|
| 5 |
## Analyses
|
| 6 |
|
| 7 |
+
TODO: Add entry and exit paragraph
|
| 8 |
|
|
|
|
| 9 |
|
| 10 |
### Does edu-score or DCLM-score predict model performance?
|
| 11 |
|
| 12 |
+
Running these ablations is super expensive. So we were looking for informative proxies that can predict whether a certain dataset will result in better downstream benchmark performance. Since the FineWeb-Edu-score and DCLM-score work well for human data, we thought they might also work for synthetic data.
|
| 13 |
+
|
| 14 |
+
We computed Spearman rank correlations between various edu-score and DCLM-score metrics (input scores, output scores, score differences, and relative improvements) and all downstream benchmark results across our 65 experiments. <FigRef target="score-correlation" /> shows the full correlation matrix.
|
| 15 |
+
|
| 16 |
+
**DCLM-score is a moderate predictor of aggregate performance.** The output DCLM-score shows the strongest correlation with `agg_score_macro` (ρ = 0.55, p {'<'} 0.001), and DCLM-score difference (output minus input) is similarly predictive (ρ = 0.52). These are moderate correlations at best. The DCLM-score variants are particularly predictive for table understanding (ρ = 0.52–0.55) and reading comprehension (ρ = 0.45–0.47).
|
| 17 |
|
| 18 |
+
**Edu-score tells a more nuanced story.** The input edu-score (the score of the original data before rephrasing) correlates with aggregate performance (ρ = 0.43), but the output edu-score (the score of the rephrased data) barely correlates at all (ρ = 0.21, not significant). This suggests that starting with higher-quality source data matters, but the edu-score of the synthetic output is not a reliable proxy.
|
| 19 |
|
| 20 |
+
{/*
|
| 21 |
+
**The HellaSwag/PIQA anomaly deserves a closer look.** Edu-score improvement shows strong *positive* correlations with HellaSwag (ρ = 0.60) and PIQA (ρ = 0.58), while being *negatively* correlated with math (ρ = −0.39) and reading comprehension (ρ = −0.30). We investigated whether this was a confound from prompt type (FAQ and tutorial prompts both increase edu-scores and might independently help NLU). The correlation survives partial correlation controlling for prompt type (ρ = 0.65 for HellaSwag, ρ = 0.56 for PIQA, both p {'<'} 0.001) and for model size within the Gemma family (ρ = 0.60 and 0.68). So the effect is real. However, the practical magnitude is tiny: HellaSwag scores range from 0.066 to 0.092 across all 65 experiments (CV = 5.8%), compared to `agg_score_macro` ranging from 0.096 to 0.172 (CV = 10.5%). The edu-score captures something about sentence-completion and physical-intuition quality, but the absolute differences are so small that optimizing for it would be chasing noise.
|
| 22 |
*/}
|
| 23 |
|
| 24 |
+
**Neither score is a reliable universal proxy.** WinoGrande shows essentially zero correlation with any predictor. The strongest individual correlations (ρ ≈ 0.55–0.60) are still only moderate, explaining roughly 30% of the variance at best. **For synthetic data, there is no shortcut: you have to train models and evaluate them.**
|
| 25 |
+
|
| 26 |
+
<Wide>
|
| 27 |
+
<HtmlEmbed
|
| 28 |
+
id="score-correlation"
|
| 29 |
+
src="score-correlation.html"
|
| 30 |
+
data="rephrasing_metadata.json"
|
| 31 |
+
desc="Spearman rank correlations between quality score metrics and downstream benchmark performance across 65 rephrasing experiments. Blue cells indicate positive correlations, red cells negative. Significance: *** p<0.001, ** p<0.01, * p<0.05."
|
| 32 |
+
/>
|
| 33 |
+
</Wide>
|
| 34 |
+
|
| 35 |
+
|
| 36 |
### Math Rephrasing: When "Worse" Outputs Win
|
| 37 |
|
| 38 |
We compared two ~1.7B parameter models for generating math word problems: SmolLM2 and Qwen3. SmolLM2's outputs looked objectively worse, yet models trained on them performed better.
|
app/src/content/chapters/experiments.mdx
CHANGED
|
@@ -4,13 +4,10 @@ import Sidenote from "../../components/Sidenote.astro";
|
|
| 4 |
import Glossary from "../../components/Glossary.astro";
|
| 5 |
import FigRef from "../../components/FigRef.astro";
|
| 6 |
|
| 7 |
-
{/* TODO:
|
| 8 |
{/* TODO: shorten the vllm inference benchmark or put stuff into the appendix */}
|
| 9 |
{/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
|
| 10 |
-
{/* TODO:
|
| 11 |
-
{/* TODO: Analyze if certain models are more verbose than others (how many tokens did they produce per prompt?) */}
|
| 12 |
-
{/* TODO: Run dclm and edu score impact analysis on model verbosity data (wait for last rephrasing job to be done) */}
|
| 13 |
-
{/* TODO: Add appendix section of weird unexplainable results? */}
|
| 14 |
|
| 15 |
## Experiments
|
| 16 |
|
|
|
|
| 4 |
import Glossary from "../../components/Glossary.astro";
|
| 5 |
import FigRef from "../../components/FigRef.astro";
|
| 6 |
|
| 7 |
+
{/* TODO: mention the currently running finephrase rephrasing with smollm2 */}
|
| 8 |
{/* TODO: shorten the vllm inference benchmark or put stuff into the appendix */}
|
| 9 |
{/* TODO: potentially make a widget for data exploration: look at the same few samples generated by different models or transformed with different prompts */}
|
| 10 |
+
{/* TODO: Check if we have more information in the rephrasing_metadata that we can use to do analyses */}
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
## Experiments
|
| 13 |
|
app/src/content/embeds/score-correlation.html
ADDED
|
@@ -0,0 +1,596 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<div class="d3-score-correlation" style="width:100%;margin:10px 0;min-height:400px;"></div>
|
| 2 |
+
<style>
|
| 3 |
+
.d3-score-correlation { font-family: system-ui, -apple-system, sans-serif; }
|
| 4 |
+
.d3-score-correlation .d3-tooltip {
|
| 5 |
+
position: absolute; top: 0; left: 0;
|
| 6 |
+
transform: translate(-9999px, -9999px);
|
| 7 |
+
pointer-events: none;
|
| 8 |
+
padding: 10px 14px; border-radius: 10px;
|
| 9 |
+
font-size: 12px; line-height: 1.4;
|
| 10 |
+
border: 1px solid var(--border-color);
|
| 11 |
+
background: var(--surface-bg); color: var(--text-color);
|
| 12 |
+
box-shadow: 0 6px 24px rgba(0,0,0,.22);
|
| 13 |
+
opacity: 0; transition: opacity .12s ease;
|
| 14 |
+
z-index: 20; max-width: 300px;
|
| 15 |
+
}
|
| 16 |
+
.d3-score-correlation .legend {
|
| 17 |
+
display: flex; flex-direction: column; align-items: flex-start; gap: 6px;
|
| 18 |
+
margin-top: 8px;
|
| 19 |
+
}
|
| 20 |
+
.d3-score-correlation .legend-title {
|
| 21 |
+
font-size: 12px; font-weight: 700; color: var(--text-color);
|
| 22 |
+
}
|
| 23 |
+
.d3-score-correlation .legend .items {
|
| 24 |
+
display: flex; flex-wrap: wrap; gap: 4px 12px; align-items: center;
|
| 25 |
+
}
|
| 26 |
+
.d3-score-correlation .legend .item {
|
| 27 |
+
display: inline-flex; align-items: center; gap: 5px; font-size: 11px; color: var(--text-color);
|
| 28 |
+
}
|
| 29 |
+
.d3-score-correlation .legend .swatch {
|
| 30 |
+
width: 20px; height: 14px; border-radius: 3px; border: 1px solid var(--border-color);
|
| 31 |
+
}
|
| 32 |
+
</style>
|
| 33 |
+
<script>
|
| 34 |
+
(() => {
|
| 35 |
+
const ensureD3 = (cb) => {
|
| 36 |
+
if (window.d3 && typeof window.d3.select === 'function') return cb();
|
| 37 |
+
let s = document.getElementById('d3-cdn-script');
|
| 38 |
+
if (!s) { s = document.createElement('script'); s.id = 'd3-cdn-script'; s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js'; document.head.appendChild(s); }
|
| 39 |
+
const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); };
|
| 40 |
+
s.addEventListener('load', onReady, { once: true });
|
| 41 |
+
if (window.d3) onReady();
|
| 42 |
+
};
|
| 43 |
+
|
| 44 |
+
const bootstrap = () => {
|
| 45 |
+
const scriptEl = document.currentScript;
|
| 46 |
+
let container = scriptEl ? scriptEl.previousElementSibling : null;
|
| 47 |
+
while (container && !(container.classList && container.classList.contains('d3-score-correlation'))) {
|
| 48 |
+
container = container.previousElementSibling;
|
| 49 |
+
}
|
| 50 |
+
if (!container) {
|
| 51 |
+
const cs = Array.from(document.querySelectorAll('.d3-score-correlation'))
|
| 52 |
+
.filter(el => !(el.dataset && el.dataset.mounted === 'true'));
|
| 53 |
+
container = cs[cs.length - 1] || null;
|
| 54 |
+
}
|
| 55 |
+
if (!container) return;
|
| 56 |
+
if (container.dataset.mounted === 'true') return;
|
| 57 |
+
container.dataset.mounted = 'true';
|
| 58 |
+
|
| 59 |
+
let mountEl = container;
|
| 60 |
+
while (mountEl && !mountEl.getAttribute?.('data-datafiles')) mountEl = mountEl.parentElement;
|
| 61 |
+
const dataAttr = mountEl?.getAttribute?.('data-datafiles');
|
| 62 |
+
const dataPaths = dataAttr
|
| 63 |
+
? [dataAttr.includes('/') ? dataAttr : `/data/${dataAttr}`]
|
| 64 |
+
: ['/data/rephrasing_metadata.json', './assets/data/rephrasing_metadata.json'];
|
| 65 |
+
|
| 66 |
+
const fetchFirst = async (paths) => {
|
| 67 |
+
for (const p of paths) {
|
| 68 |
+
try { const r = await fetch(p, { cache: 'no-cache' }); if (r.ok) return r.json(); } catch(_) {}
|
| 69 |
+
}
|
| 70 |
+
throw new Error('Data not found');
|
| 71 |
+
};
|
| 72 |
+
|
| 73 |
+
fetchFirst(dataPaths).then(data => buildChart(data)).catch(err => {
|
| 74 |
+
container.innerHTML = `<pre style="color:red;padding:12px;">Error: ${err.message}</pre>`;
|
| 75 |
+
});
|
| 76 |
+
|
| 77 |
+
function buildChart(rawData) {
|
| 78 |
+
// Spearman correlation helpers
|
| 79 |
+
// Assign 1-based ranks to `arr`, averaging ranks across ties (the standard
// "fractional ranking" used by Spearman correlation).
const rankArray = (arr) => {
  // Sort value/original-index pairs so ranks can be mapped back to positions.
  const indexed = arr.map((v, i) => ({ v, i })).sort((a, b) => a.v - b.v);
  const ranks = new Array(arr.length);
  let i = 0;
  while (i < indexed.length) {
    // Advance j past the run of equal values starting at i.
    let j = i;
    while (j < indexed.length && indexed[j].v === indexed[i].v) j++;
    // Average of 1-based ranks i+1 .. j for the tied run: ((i+1)+j)/2.
    const avgRank = (i + j + 1) / 2;
    for (let k = i; k < j; k++) ranks[indexed[k].i] = avgRank;
    i = j;
  }
  return ranks;
};
|
| 92 |
+
|
| 93 |
+
// Spearman rank correlation with a two-sided p-value.
// r is Pearson correlation of the tie-averaged ranks; significance uses the
// t-statistic t = r*sqrt((n-2)/(1-r^2)) with df = n-2, approximated by the
// normal CDF for large samples (df > 30) and the t CDF otherwise.
const spearman = (x, y) => {
  const n = x.length;
  // Too few points for a meaningful estimate: report zero correlation.
  if (n < 5) return { r: 0, p: 1 };
  const rx = rankArray(x), ry = rankArray(y);
  const mx = rx.reduce((a, b) => a + b, 0) / n;
  const my = ry.reduce((a, b) => a + b, 0) / n;
  let num = 0, dx2 = 0, dy2 = 0;
  for (let i = 0; i < n; i++) {
    const dx = rx[i] - mx, dy = ry[i] - my;
    num += dx * dy; dx2 += dx * dx; dy2 += dy * dy;
  }
  // Guard against zero variance (all-tied input) -> r = 0.
  const r = dx2 && dy2 ? num / Math.sqrt(dx2 * dy2) : 0;
  // 1e-15 avoids division by zero when |r| is exactly 1.
  const t = r * Math.sqrt((n - 2) / (1 - r * r + 1e-15));
  const df = n - 2;
  const p = df > 30 ? 2 * (1 - normalCDF(Math.abs(t))) : 2 * (1 - tCDF(Math.abs(t), df));
  return { r, p };
};
|
| 110 |
+
|
| 111 |
+
const normalCDF = (x) => {
|
| 112 |
+
const a1 = 0.254829592, a2 = -0.284496736, a3 = 1.421413741, a4 = -1.453152027, a5 = 1.061405429;
|
| 113 |
+
const p = 0.3275911, sign = x < 0 ? -1 : 1;
|
| 114 |
+
x = Math.abs(x) / Math.sqrt(2);
|
| 115 |
+
const t = 1.0 / (1.0 + p * x);
|
| 116 |
+
const y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * Math.exp(-x * x);
|
| 117 |
+
return 0.5 * (1.0 + sign * y);
|
| 118 |
+
};
|
| 119 |
+
|
| 120 |
+
const tCDF = (t, df) => 1 - 0.5 * incompleteBeta(df / 2, 0.5, df / (df + t * t));
|
| 121 |
+
|
| 122 |
+
const incompleteBeta = (a, b, x) => {
|
| 123 |
+
if (x === 0 || x === 1) return x;
|
| 124 |
+
const lnBeta = lgamma(a) + lgamma(b) - lgamma(a + b);
|
| 125 |
+
const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta);
|
| 126 |
+
let sum = 1, term = 1;
|
| 127 |
+
for (let n = 0; n < 200; n++) {
|
| 128 |
+
term *= (n === 0 ? 1 : (a + n - 1)) * x / (a + n);
|
| 129 |
+
if (n > 0) term *= (n - b) / n;
|
| 130 |
+
sum += term;
|
| 131 |
+
if (Math.abs(term) < 1e-10) break;
|
| 132 |
+
}
|
| 133 |
+
return front * sum / a;
|
| 134 |
+
};
|
| 135 |
+
|
| 136 |
+
const lgamma = (x) => {
|
| 137 |
+
const c = [76.18009172947146, -86.50532032941677, 24.01409824083091,
|
| 138 |
+
-1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5];
|
| 139 |
+
let y = x, tmp = x + 5.5;
|
| 140 |
+
tmp -= (x + 0.5) * Math.log(tmp);
|
| 141 |
+
let ser = 1.000000000190015;
|
| 142 |
+
for (let j = 0; j < 6; j++) ser += c[j] / ++y;
|
| 143 |
+
return -tmp + Math.log(2.5066282746310005 * ser / x);
|
| 144 |
+
};
|
| 145 |
+
|
| 146 |
+
// Benchmark descriptions for tooltips
|
| 147 |
+
const BENCH_DESC = {
|
| 148 |
+
'agg_score_macro': 'Mean of the six category aggregates (GK, RC, RES, NLU, MATH, TABLE).',
|
| 149 |
+
'agg_score_micro': 'Mean of all 12 individual benchmark scores.',
|
| 150 |
+
'agg_score_GK': 'Average of ARC Easy and MMLU Redux.',
|
| 151 |
+
'agg_score_RC': 'Average of SQuAD v2 and DROP.',
|
| 152 |
+
'agg_score_RES': 'Average of OpenBookQA and XCSQA.',
|
| 153 |
+
'agg_score_NLU': 'Average of WinoGrande, PIQA, and HellaSwag.',
|
| 154 |
+
'agg_score_MATH': 'Based on GSM8K alone.',
|
| 155 |
+
'agg_score_TABLE': 'Average of WikiTableQ and TriviaQA.',
|
| 156 |
+
'arc_cf:easy': 'Grade-school multiple-choice science questions testing knowledge and reasoning (AI2 Reasoning Challenge).',
|
| 157 |
+
'mmlu_redux_cf:_average': 'Re-annotated multitask benchmark covering 57 subjects from STEM to humanities (MMLU Redux).',
|
| 158 |
+
'squad_v2': 'Extractive reading comprehension on Wikipedia passages, including unanswerable questions (Stanford QA Dataset v2).',
|
| 159 |
+
'drop': 'Reading comprehension requiring discrete reasoning: counting, sorting, and arithmetic over paragraphs.',
|
| 160 |
+
'openbookqa_cf': 'Elementary science questions requiring multi-step reasoning beyond provided facts (OpenBookQA).',
|
| 161 |
+
'xcsqa_cf': 'Cross-lingual commonsense QA testing general world knowledge across 16 languages (X-CSQA).',
|
| 162 |
+
'winogrande_cf': 'Pronoun resolution problems testing commonsense reasoning, adversarially filtered to remove biases.',
|
| 163 |
+
'piqa_cf': 'Physical intuition QA: choosing the most plausible solution to everyday physical tasks (PIQA).',
|
| 164 |
+
'hellaswag_cf': 'Sentence completion testing commonsense inference, with adversarially crafted wrong endings (HellaSwag).',
|
| 165 |
+
'gsm8k': 'Grade-school math word problems requiring 2–8 steps of arithmetic reasoning (GSM8K).',
|
| 166 |
+
'wikitablequestions': 'Complex questions over Wikipedia tables requiring multi-step reasoning and aggregation.',
|
| 167 |
+
'treb_qa': 'Large-scale trivia QA requiring cross-sentence reasoning over evidence documents (TriviaQA).',
|
| 168 |
+
};
|
| 169 |
+
|
| 170 |
+
// Predictors: output, input, delta, improvement for each group
|
| 171 |
+
const PREDICTORS = [
|
| 172 |
+
{ key: 'output_dclm_score', label: 'Output DCLM', group: 'DCLM',
|
| 173 |
+
desc: 'Mean DCLM quality score of the rephrased (output) documents.' },
|
| 174 |
+
{ key: 'input_dclm_score', label: 'Input DCLM', group: 'DCLM',
|
| 175 |
+
desc: 'Mean DCLM quality score of the original (input) documents before rephrasing.' },
|
| 176 |
+
{ key: 'dclm_score_difference', label: 'DCLM Δ', group: 'DCLM',
|
| 177 |
+
desc: 'Absolute change in DCLM score: output minus input. Positive means the rephrasing increased perceived quality.' },
|
| 178 |
+
{ key: 'dclm_score_improvement', label: 'DCLM Improvement %', group: 'DCLM',
|
| 179 |
+
desc: 'Relative improvement in DCLM score: (output − input) / input. Measures the proportional quality gain from rephrasing.' },
|
| 180 |
+
{ key: 'output_edu_score', label: 'Output Edu', group: 'EDU',
|
| 181 |
+
desc: 'Mean FineWeb-Edu score of the rephrased (output) documents.' },
|
| 182 |
+
{ key: 'input_edu_score', label: 'Input Edu', group: 'EDU',
|
| 183 |
+
desc: 'Mean FineWeb-Edu score of the original (input) documents before rephrasing.' },
|
| 184 |
+
{ key: 'edu_score_difference', label: 'Edu Δ', group: 'EDU',
|
| 185 |
+
desc: 'Absolute change in Edu score: output minus input. Positive means the rephrasing increased educational value.' },
|
| 186 |
+
{ key: 'edu_score_improvement', label: 'Edu Improvement %', group: 'EDU',
|
| 187 |
+
desc: 'Relative improvement in Edu score: (output − input) / input. Measures the proportional educational quality gain from rephrasing.' },
|
| 188 |
+
];
|
| 189 |
+
|
| 190 |
+
// Targets: grouped so each agg is immediately left of its individual benchmarks
|
| 191 |
+
// Each group: { agg, individuals[] }
|
| 192 |
+
const GROUPS = [
|
| 193 |
+
{
|
| 194 |
+
name: 'Overall',
|
| 195 |
+
targets: [
|
| 196 |
+
{ key: 'agg_score_macro', label: 'Macro Avg', isAgg: true },
|
| 197 |
+
{ key: 'agg_score_micro', label: 'Micro Avg', isAgg: true },
|
| 198 |
+
]
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
name: 'General Knowledge',
|
| 202 |
+
targets: [
|
| 203 |
+
{ key: 'agg_score_GK', label: 'GK Agg', isAgg: true },
|
| 204 |
+
{ key: 'arc_cf:easy', label: 'ARC Easy', isAgg: false },
|
| 205 |
+
{ key: 'mmlu_redux_cf:_average', label: 'MMLU Redux', isAgg: false },
|
| 206 |
+
]
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
name: 'Reading Comp.',
|
| 210 |
+
targets: [
|
| 211 |
+
{ key: 'agg_score_RC', label: 'RC Agg', isAgg: true },
|
| 212 |
+
{ key: 'squad_v2', label: 'SQuAD v2', isAgg: false },
|
| 213 |
+
{ key: 'drop', label: 'DROP', isAgg: false },
|
| 214 |
+
]
|
| 215 |
+
},
|
| 216 |
+
{
|
| 217 |
+
name: 'Reasoning',
|
| 218 |
+
targets: [
|
| 219 |
+
{ key: 'agg_score_RES', label: 'RES Agg', isAgg: true },
|
| 220 |
+
{ key: 'openbookqa_cf', label: 'OpenBookQA', isAgg: false },
|
| 221 |
+
{ key: 'xcsqa_cf', label: 'XCSQA', isAgg: false },
|
| 222 |
+
]
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
name: 'NLU',
|
| 226 |
+
targets: [
|
| 227 |
+
{ key: 'agg_score_NLU', label: 'NLU Agg', isAgg: true },
|
| 228 |
+
{ key: 'winogrande_cf', label: 'WinoGrande', isAgg: false },
|
| 229 |
+
{ key: 'piqa_cf', label: 'PIQA', isAgg: false },
|
| 230 |
+
{ key: 'hellaswag_cf', label: 'HellaSwag', isAgg: false },
|
| 231 |
+
]
|
| 232 |
+
},
|
| 233 |
+
{
|
| 234 |
+
name: 'Math',
|
| 235 |
+
targets: [
|
| 236 |
+
{ key: 'agg_score_MATH', label: 'Math Agg', isAgg: true },
|
| 237 |
+
{ key: 'gsm8k', label: 'GSM8K', isAgg: false },
|
| 238 |
+
]
|
| 239 |
+
},
|
| 240 |
+
{
|
| 241 |
+
name: 'Table',
|
| 242 |
+
targets: [
|
| 243 |
+
{ key: 'agg_score_TABLE', label: 'Table Agg', isAgg: true },
|
| 244 |
+
{ key: 'wikitablequestions', label: 'WikiTableQ', isAgg: false },
|
| 245 |
+
{ key: 'treb_qa', label: 'TriviaQA', isAgg: false },
|
| 246 |
+
]
|
| 247 |
+
},
|
| 248 |
+
];
|
| 249 |
+
|
| 250 |
+
// Column targets flattened in display order, plus the number of DCLM-score
// predictor rows (used to place the gap/divider between the two row groups).
const ALL_TARGETS = [].concat(...GROUPS.map(grp => grp.targets));
const DCLM_COUNT = PREDICTORS.reduce((count, pred) => pred.group === 'DCLM' ? count + 1 : count, 0);
|
| 253 |
+
|
| 254 |
+
// Build the Spearman correlation matrix: one row-major entry per
// (predictor, target) pair, skipping experiments missing either value.
const matrix = PREDICTORS.flatMap(pred =>
  ALL_TARGETS.map(tgt => {
    // Collect the paired samples in a single pass over the raw experiments.
    const xs = [];
    const ys = [];
    for (const row of rawData) {
      const pv = row[pred.key];
      const tv = row.results[tgt.key];
      if (pv != null && tv != null) {
        xs.push(pv);
        ys.push(tv);
      }
    }
    const { r, p } = spearman(xs, ys);
    return {
      predictor: pred.key, predictorLabel: pred.label,
      target: tgt.key, targetLabel: tgt.label,
      isAgg: tgt.isAgg,
      desc: BENCH_DESC[tgt.key] || '',
      r, p, n: xs.length,
    };
  })
);
|
| 271 |
+
|
| 272 |
+
// Build the heatmap scaffolding: positioned container, shared tooltip node,
// and the responsive SVG root (render() sets its pixel size on each pass).
container.style.position = 'relative';

// One tooltip element is reused by every hover handler below.
const tip = Object.assign(document.createElement('div'), { className: 'd3-tooltip' });
container.appendChild(tip);

const svg = d3.select(container)
  .append('svg')
  .style('display', 'block')
  .attr('width', '100%');
|
| 282 |
+
|
| 283 |
+
// Redraw the entire heatmap from scratch (called initially and on resize).
// Reads `matrix`, `PREDICTORS`, `GROUPS`, `ALL_TARGETS`, `DCLM_COUNT`,
// `BENCH_DESC`, `tip`, `svg`, and `container` from the enclosing scope.
const render = () => {
  const width = container.clientWidth || 900;
  // Theme-dependent colors; re-read on every render so a theme toggle
  // followed by a resize picks up the new palette.
  const isDark = document.documentElement.getAttribute('data-theme') === 'dark';
  const divColor = isDark ? 'rgba(255,255,255,0.22)' : 'rgba(0,0,0,0.18)';
  const textCol = isDark ? 'rgba(255,255,255,0.8)' : 'rgba(0,0,0,0.7)';
  const mutedCol = isDark ? 'rgba(255,255,255,0.4)' : 'rgba(0,0,0,0.35)';

  const predLabels = PREDICTORS.map(p => p.label);

  // Layout
  const leftMargin = 140;
  const topMargin = 130; // extra room for two-tier header
  const rightMargin = 10;
  const bottomMargin = 10;
  // Cells scale with container width, clamped to a readable size range.
  const cellW = Math.max(30, Math.min(52, (width - leftMargin - rightMargin) / ALL_TARGETS.length));
  const cellH = Math.max(28, Math.min(42, cellW * 0.82));
  const plotW = cellW * ALL_TARGETS.length;
  const rowGap = 8; // gap between DCLM and EDU groups
  const plotH = cellH * predLabels.length + rowGap;
  const totalW = leftMargin + plotW + rightMargin;
  const totalH = topMargin + plotH + bottomMargin;

  svg.attr('width', totalW).attr('height', totalH);
  // Full teardown: everything is rebuilt below, so clear prior contents.
  svg.selectAll('*').remove();

  // Color scale: diverging, reversed so positive = blue
  // Wider domain (±0.85) so colors stay readable longer
  const colorScale = d3.scaleDiverging()
    .domain([-0.85, 0, 0.85])
    .interpolator(d3.interpolateRdBu)
    .clamp(true);
  // Negate r: RdBu maps low→red, high→blue, so -r makes positive ρ blue.
  const cellColor = (r) => colorScale(-r);

  const g = svg.append('g').attr('transform', `translate(${leftMargin},${topMargin})`);

  // --- Group dividers (vertical) and header labels ---
  let colOffset = 0;
  const groupHeaderY = 18; // top-level group name
  const colLabelY = topMargin - 6; // individual column labels
  // NOTE(review): colLabelY is unused — column labels are positioned via the
  // gColLabels group transform further down. Candidate for removal.

  GROUPS.forEach((grp, gi) => {
    const groupStartX = colOffset * cellW;
    const groupW = grp.targets.length * cellW;

    // Vertical divider before each group (except first)
    if (gi > 0) {
      // gi === 1 separates the aggregates column group from the per-task
      // groups, so it gets a solid, heavier line; later dividers are dashed.
      g.append('line')
        .attr('x1', groupStartX).attr('x2', groupStartX)
        .attr('y1', -4).attr('y2', plotH + 2)
        .attr('stroke', divColor)
        .attr('stroke-width', gi === 1 ? 1.5 : 1)
        .attr('stroke-dasharray', gi === 1 ? 'none' : '4,3');
    }

    // Group header label (top tier)
    svg.append('text')
      .attr('x', leftMargin + groupStartX + groupW / 2)
      .attr('y', groupHeaderY)
      .attr('text-anchor', 'middle')
      .attr('font-size', '9.5px')
      .attr('font-weight', '700')
      .attr('letter-spacing', '0.5px')
      .attr('fill', mutedCol)
      .text(grp.name.toUpperCase());

    // Bracket line under group header
    const bracketY = groupHeaderY + 8;
    svg.append('line')
      .attr('x1', leftMargin + groupStartX + 4)
      .attr('x2', leftMargin + groupStartX + groupW - 4)
      .attr('y1', bracketY).attr('y2', bracketY)
      .attr('stroke', mutedCol)
      .attr('stroke-width', 0.8);

    colOffset += grp.targets.length;
  });

  // Helper: y position for a predictor row, with gap after DCLM
  const rowY = (row) => row < DCLM_COUNT ? row * cellH : row * cellH + rowGap;

  // --- Horizontal divider between DCLM and EDU ---
  const divY = DCLM_COUNT * cellH + rowGap / 2;
  g.append('line')
    .attr('x1', -2).attr('x2', plotW + 2)
    .attr('y1', divY).attr('y2', divY)
    .attr('stroke', isDark ? 'rgba(255,255,255,0.45)' : 'rgba(0,0,0,0.35)')
    .attr('stroke-width', 2.5);

  // --- Draw cells ---
  // One <g> per matrix entry, positioned by looking up its row/column index.
  const cells = g.selectAll('g.cell')
    .data(matrix)
    .join('g')
    .attr('class', 'cell')
    .attr('transform', d => {
      const col = ALL_TARGETS.findIndex(t => t.key === d.target);
      const row = PREDICTORS.findIndex(p => p.key === d.predictor);
      return `translate(${col * cellW},${rowY(row)})`;
    });

  cells.append('rect')
    .attr('width', cellW - 1)
    .attr('height', cellH - 1)
    .attr('rx', 3)
    .attr('fill', d => cellColor(d.r))
    .attr('stroke', isDark ? 'rgba(255,255,255,0.06)' : 'rgba(0,0,0,0.04)')
    .attr('stroke-width', 0.5);

  // White text on strongly-colored cells, theme text color otherwise.
  const textFill = (r) => Math.abs(r) > 0.5 ? '#fff' : textCol;

  // Centered ρ value in each cell.
  cells.append('text')
    .attr('x', (cellW - 1) / 2)
    .attr('y', (cellH - 1) / 2)
    .attr('text-anchor', 'middle')
    .attr('dominant-baseline', 'central')
    .attr('font-size', Math.max(9, Math.min(12, cellW * 0.24)) + 'px')
    .attr('font-weight', d => Math.abs(d.r) > 0.4 ? '700' : '500')
    .attr('fill', d => textFill(d.r))
    .text(d => d.r.toFixed(2));

  // Significance markers
  cells.append('text')
    .attr('x', cellW - 3).attr('y', 10)
    .attr('text-anchor', 'end')
    .attr('font-size', '11px')
    .attr('font-weight', '700')
    .attr('fill', d => Math.abs(d.r) > 0.5 ? 'rgba(255,255,255,0.8)' : mutedCol)
    .text(d => d.p < 0.001 ? '***' : d.p < 0.01 ? '**' : d.p < 0.05 ? '*' : '');

  // --- Row labels (predictors, with hover descriptions) ---
  const gLabels = svg.append('g').attr('transform', `translate(${leftMargin - 8},${topMargin})`);
  PREDICTORS.forEach((pred, i) => {
    const labelG = gLabels.append('g')
      .style('cursor', 'help');

    labelG.append('text')
      .attr('x', 0).attr('y', rowY(i) + cellH / 2)
      .attr('text-anchor', 'end')
      .attr('dominant-baseline', 'central')
      .attr('font-size', '11px')
      .attr('fill', textCol)
      .attr('font-weight', '500')
      .text(pred.label);

    // Hit area
    labelG.append('rect')
      .attr('x', -leftMargin + 20).attr('y', rowY(i))
      .attr('width', leftMargin - 20).attr('height', cellH)
      .attr('fill', 'transparent');

    labelG.on('mouseenter', function(ev) {
      tip.innerHTML = `<div style="font-weight:700;font-size:13px;margin-bottom:4px;">${pred.label}</div><div style="font-size:12px;color:var(--muted-color);line-height:1.45;">${pred.desc}</div>`;
      tip.style.opacity = '1';
    })
    .on('mousemove', function(ev) {
      const [mx, my] = d3.pointer(ev, container);
      const bw = tip.offsetWidth || 260;
      // NOTE(review): bw is unused in this handler (only oy flips); other
      // handlers use it for horizontal flipping. Candidate for removal.
      const ox = 12;
      // Flip the tooltip above the cursor when it would overflow the bottom.
      const oy = (my + (tip.offsetHeight || 100) + 20 > totalH) ? -((tip.offsetHeight || 100) + 12) : 14;
      tip.style.transform = `translate(${Math.round(mx + ox)}px,${Math.round(my + oy)}px)`;
    })
    .on('mouseleave', function() {
      tip.style.opacity = '0';
      tip.style.transform = 'translate(-9999px,-9999px)';
    });
  });

  // --- Column labels (rotated, with hover descriptions) ---
  const gColLabels = svg.append('g').attr('transform', `translate(${leftMargin},${topMargin - 6})`);
  ALL_TARGETS.forEach((tgt, i) => {
    const labelG = gColLabels.append('g')
      .attr('transform', `translate(${i * cellW + cellW / 2},0)`)
      .style('cursor', BENCH_DESC[tgt.key] ? 'help' : 'default');

    labelG.append('text')
      .attr('x', 0).attr('y', 0)
      .attr('transform', 'rotate(-55)')
      .attr('text-anchor', 'start')
      .attr('font-size', '10px')
      .attr('fill', textCol)
      .attr('font-weight', tgt.isAgg ? '700' : '400')
      .text(tgt.label);

    if (BENCH_DESC[tgt.key]) {
      // Invisible hit area for easier hovering on rotated text
      labelG.append('rect')
        .attr('x', -cellW / 2).attr('y', -80)
        .attr('width', cellW).attr('height', 80)
        .attr('fill', 'transparent');

      labelG.on('mouseenter', function(ev) {
        tip.innerHTML = `<div style="font-weight:700;font-size:13px;margin-bottom:4px;">${tgt.label}</div><div style="font-size:12px;color:var(--muted-color);line-height:1.45;">${BENCH_DESC[tgt.key]}</div>`;
        tip.style.opacity = '1';
      })
      .on('mousemove', function(ev) {
        const [mx, my] = d3.pointer(ev, container);
        const bw = tip.offsetWidth || 260;
        // Flip the tooltip to the left when it would overflow the right edge.
        const ox = (mx + bw + 20 > totalW) ? -(bw + 12) : 12;
        tip.style.transform = `translate(${Math.round(mx + ox)}px,${Math.round(my + 14)}px)`;
      })
      .on('mouseleave', function() {
        tip.style.opacity = '0';
        tip.style.transform = 'translate(-9999px,-9999px)';
      });
    }
  });

  // --- Predictor group labels (vertical) ---
  // Vertical centers of the DCLM and EDU row bands, in svg coordinates.
  const dclmCenterY = topMargin + (rowY(0) + rowY(DCLM_COUNT - 1) + cellH) / 2;
  const eduCenterY = topMargin + (rowY(DCLM_COUNT) + rowY(PREDICTORS.length - 1) + cellH) / 2;
  const groupLabelX = 14;

  const GROUP_DESC = {
    'DCLM': 'DCLM score rates text quality on a 0–1 scale using a fastText classifier trained to distinguish curated, high-quality web data from random web crawls.',
    'EDU': 'FineWeb-Edu score rates educational value on a 0–5 scale using a classifier trained on LLM-annotated web pages, where higher scores indicate more instructive content.',
  };

  [['DCLM', dclmCenterY], ['EDU', eduCenterY]].forEach(([text, cy]) => {
    const labelG = svg.append('g').style('cursor', 'help');

    labelG.append('text')
      .attr('x', groupLabelX).attr('y', cy)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('font-size', '9px')
      .attr('font-weight', '700')
      .attr('letter-spacing', '1px')
      .attr('fill', isDark ? 'rgba(255,255,255,0.35)' : 'rgba(0,0,0,0.3)')
      .attr('transform', `rotate(-90, ${groupLabelX}, ${cy})`)
      .text(text);

    // Hit area for the rotated text
    // NOTE(review): half-height is based on DCLM_COUNT for both groups —
    // presumably fine when both groups have the same row count; confirm if
    // the EDU group can differ in size.
    const halfH = (DCLM_COUNT * cellH) / 2;
    labelG.append('rect')
      .attr('x', 0).attr('y', cy - halfH)
      .attr('width', 24).attr('height', halfH * 2)
      .attr('fill', 'transparent');

    labelG.on('mouseenter', function() {
      tip.innerHTML = `<div style="font-weight:700;font-size:13px;margin-bottom:4px;">${text} Score</div><div style="font-size:12px;color:var(--muted-color);line-height:1.45;">${GROUP_DESC[text]}</div>`;
      tip.style.opacity = '1';
    })
    .on('mousemove', function(ev) {
      const [mx, my] = d3.pointer(ev, container);
      const bw = tip.offsetWidth || 260;
      tip.style.transform = `translate(${Math.round(mx + 12)}px,${Math.round(my + 14)}px)`;
    })
    .on('mouseleave', function() {
      tip.style.opacity = '0';
      tip.style.transform = 'translate(-9999px,-9999px)';
    });
  });

  // --- Tooltip interactions ---
  cells.on('mouseenter', function(ev, d) {
    // Highlight the hovered cell with a stronger outline.
    d3.select(this).select('rect')
      .attr('stroke', isDark ? 'rgba(255,255,255,0.6)' : 'rgba(0,0,0,0.5)')
      .attr('stroke-width', 2);

    const sig = d.p < 0.001 ? 'p < 0.001 (***)' : d.p < 0.01 ? `p = ${d.p.toFixed(3)} (**)` : d.p < 0.05 ? `p = ${d.p.toFixed(3)} (*)` : `p = ${d.p.toFixed(3)}`;
    const descHtml = d.desc ? `<div style="margin-top:6px;padding-top:6px;border-top:1px solid var(--border-color);font-size:11px;color:var(--muted-color);line-height:1.4;">${d.desc}</div>` : '';
    tip.innerHTML = `
      <div style="font-weight:700;font-size:13px;margin-bottom:4px;">${d.predictorLabel} → ${d.targetLabel}</div>
      <div style="display:grid;grid-template-columns:auto 1fr;gap:2px 10px;font-size:12px;">
        <span style="color:var(--muted-color);">Spearman ρ</span><span style="font-weight:700;">${d.r.toFixed(4)}</span>
        <span style="color:var(--muted-color);">Significance</span><span>${sig}</span>
        <span style="color:var(--muted-color);">N</span><span>${d.n} experiments</span>
      </div>${descHtml}`;
    tip.style.opacity = '1';
  })
  .on('mousemove', function(ev) {
    const [mx, my] = d3.pointer(ev, container);
    const bw = tip.offsetWidth || 260;
    const bh = tip.offsetHeight || 120;
    // Flip horizontally/vertically when the tooltip would overflow the chart.
    const ox = (mx + bw + 20 > totalW) ? -(bw + 12) : 12;
    const oy = (my + bh + 20 > totalH) ? -(bh + 12) : 14;
    tip.style.transform = `translate(${Math.round(mx + ox)}px,${Math.round(my + oy)}px)`;
  })
  .on('mouseleave', function() {
    // Restore the default cell outline and hide the tooltip off-screen.
    d3.select(this).select('rect')
      .attr('stroke', isDark ? 'rgba(255,255,255,0.06)' : 'rgba(0,0,0,0.04)')
      .attr('stroke-width', 0.5);
    tip.style.opacity = '0';
    tip.style.transform = 'translate(-9999px,-9999px)';
  });
};
|
| 568 |
+
|
| 569 |
+
// Initial draw, then re-render whenever the container resizes.
// ResizeObserver tracks the element itself; window 'resize' is the fallback
// for browsers without it.
render();
if (!window.ResizeObserver) {
  window.addEventListener('resize', render);
} else {
  new ResizeObserver(render).observe(container);
}
|
| 572 |
+
|
| 573 |
+
// Legend
// Static swatch legend appended once below the heatmap (not rebuilt on resize).
const legend = document.createElement('div');
legend.className = 'legend';
// Duplicate of render()'s diverging scale (reversed RdBu, clamped at ±0.85)
// so swatch colors match the cell colors exactly.
const cs = d3.scaleDiverging().domain([-0.85, 0, 0.85]).interpolator(d3.interpolateRdBu).clamp(true);
const sw = (r) => cs(-r);
legend.innerHTML = `
  <div class="legend-title">Legend</div>
  <div class="items">
    <span class="item"><span class="swatch" style="background:${sw(-0.6)};"></span><span>ρ = −0.6</span></span>
    <span class="item"><span class="swatch" style="background:${sw(-0.3)};"></span><span>ρ = −0.3</span></span>
    <span class="item"><span class="swatch" style="background:${sw(0)};"></span><span>ρ = 0</span></span>
    <span class="item"><span class="swatch" style="background:${sw(0.3)};"></span><span>ρ = +0.3</span></span>
    <span class="item"><span class="swatch" style="background:${sw(0.6)};"></span><span>ρ = +0.6</span></span>
    <span style="margin-left:12px;font-size:11px;color:var(--muted-color);">*** p<0.001 ** p<0.01 * p<0.05</span>
  </div>`;
container.appendChild(legend);
|
| 589 |
+
}
|
| 590 |
+
};
|
| 591 |
+
|
| 592 |
+
// Kick off the chart once the DOM is ready: defer until DOMContentLoaded when
// the document is still loading, otherwise start immediately.
const start = () => ensureD3(bootstrap);
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', start, { once: true });
} else {
  start();
}
|
| 595 |
+
})();
|
| 596 |
+
</script>
|