| import gradio as gr |
| import pandas as pd |
|
|
| |
|
|
# Mean detection metrics aggregated over the 5 fine-tuned models,
# one row per attribution method (higher is better for every column).
SUMMARY = pd.DataFrame({
    "Method": ["AirRep (pretrained)", "STRIDE (300 subsets)", "LoGRA"],
    "Spearman ρ": [0.1172, 0.1152, -0.0064],
    "AUPRC": [0.132, 0.109, 0.064],
    "ROC-AUC": [0.638, 0.647, 0.497],
    "R@10": [0.16, 0.14, 0.00],
    "R@100": [0.35, 0.32, 0.00],
    "MRR": [0.344, 0.210, 0.007],
    "best-F1": [0.223, 0.211, 0.192],
})
|
|
# Per-model detection results: Spearman ρ vs the binary leaked/not-leaked
# label (with its p-value) plus ranking metrics, for each of the three
# methods on each of the 5 contaminated models.
PER_MODEL = pd.DataFrame(
    [
        ("0.5%_seed0", "AirRep", 0.0856, 0.051, 0.066, 0.623, 0.00, 0.32, 0.071),
        ("0.5%_seed0", "LoGRA",  0.0133, 0.762, 0.044, 0.519, 0.00, 0.00, 0.005),
        ("0.5%_seed0", "STRIDE", 0.2158, 0.000, 0.131, 0.810, 0.10, 0.59, 0.200),
        ("0.5%_seed1", "AirRep", 0.1098, 0.012, 0.114, 0.658, 0.20, 0.36, 0.250),
        ("0.5%_seed1", "LoGRA",  0.0392, 0.372, 0.047, 0.556, 0.00, 0.00, 0.006),
        ("0.5%_seed1", "STRIDE", 0.1178, 0.007, 0.081, 0.669, 0.10, 0.41, 0.125),
        ("1%_seed0",   "AirRep", 0.1709, 0.000, 0.182, 0.679, 0.30, 0.36, 1.000),
        ("1%_seed0",   "LoGRA",  0.0154, 0.720, 0.081, 0.516, 0.00, 0.00, 0.010),
        ("1%_seed0",   "STRIDE", 0.1281, 0.003, 0.138, 0.634, 0.30, 0.27, 0.333),
        ("1%_seed1",   "AirRep", 0.0956, 0.026, 0.117, 0.600, 0.00, 0.31, 0.067),
        ("1%_seed1",   "LoGRA",  0.0111, 0.796, 0.081, 0.512, 0.00, 0.00, 0.007),
        ("1%_seed1",   "STRIDE", 0.0604, 0.159, 0.109, 0.563, 0.20, 0.20, 0.333),
        ("1%_seed2",   "AirRep", 0.1241, 0.004, 0.181, 0.630, 0.30, 0.40, 0.333),
        ("1%_seed2",   "LoGRA", -0.1111, 0.009, 0.065, 0.383, 0.00, 0.00, 0.009),
        ("1%_seed2",   "STRIDE", 0.0537, 0.210, 0.088, 0.556, 0.00, 0.13, 0.059),
    ],
    columns=["Model", "Method", "Spearman ρ", "p", "AUPRC",
             "ROC-AUC", "R@10", "R@100", "MRR"],
)
|
|
# Spearman ρ between each pair of methods' score rankings (method vs
# method, NOT vs ground truth), per model, plus a MEAN row.
PAIRWISE = pd.DataFrame({
    "Model": ["0.5%_seed0", "0.5%_seed1", "1%_seed0", "1%_seed1", "1%_seed2", "MEAN"],
    "AirRep↔STRIDE": [-0.1209, -0.2116, 0.1615, -0.1355, -0.0041, -0.0621],
    "AirRep↔LoGRA": [0.0635, -0.0779, 0.0093, 0.0179, 0.0442, 0.0114],
    "STRIDE↔LoGRA": [0.0268, 0.0946, 0.0626, -0.0328, 0.0369, 0.0376],
})
|
|
# Markdown narrative rendered verbatim in the "Findings & Discussion" tab.
# NOTE: this is a runtime string (user-facing content), so its text is the
# app's behavior — edit with the same care as code. The `\\dfrac` escape
# yields a single backslash so the LaTeX renders correctly in markdown.
FINDINGS = """
## Setup

We fine-tune Qwen2.5-0.5B on MATH training data at contamination rates of **0.5% and 1%** (5 models, 2–3 seeds per rate).
A subset of MATH *test* examples is injected as exact replicas into the training pool.

**Task:** given a MATH test query, predict whether it appeared in training data.

**Query set per model:** 500 clean validation examples + leaked examples (~22–45 depending on rate).

---

## Key findings

### 1. AirRep ≈ STRIDE >> LoGRA on detection; LoGRA near-random

All Spearman ρ values are versus the binary leaked/not-leaked label.

**LoGRA's ρ averages −0.006** — statistically indistinguishable from zero, and slightly negative
on one model. Its gradient influence scores have enormous variance (σ ≈ 2.9 × 10⁶) relative to
the mean gap between leaked and clean queries (~250K), making the signal undetectable.

**AirRep and STRIDE are comparable** (ρ ≈ 0.115–0.117) but reach that level via different mechanisms
(see finding 3).

### 2. LoGRA's p-values are all non-significant

Every LoGRA Spearman p-value is > 0.37. AirRep and STRIDE are mostly significant at α = 0.05,
especially at the 1% contamination rate.

### 3. AirRep and STRIDE rankings are essentially uncorrelated with each other

Pairwise Spearman ρ between method rankings (not vs ground truth):

| Pair | Mean ρ |
|------|--------|
| AirRep ↔ STRIDE | −0.06 |
| AirRep ↔ LoGRA | +0.01 |
| STRIDE ↔ LoGRA | +0.04 |

AirRep and STRIDE are nearly **anti-correlated** — they flag almost entirely different queries.
Both have signal against the ground truth, but via independent mechanisms. This suggests
an ensemble could outperform either method alone.

### 4. STRIDE false positives are "easy" problems

At 1% contamination, STRIDE's top-20 contains only 4 true positives. False positives are
short, concrete, textbook problems:

> *"Evaluate $\\dfrac{7}{45^2 - 38^2}$"*

> *"How many times does the digit 8 appear from 1 to 1000?"*

> *"If $f(x) = ax + b$ and $f(f(f(x))) = 8x + 21$, find $a + b$."*

STRIDE conflates *"training data has large gradient influence on this query"* with *"this query
was in training."* AirRep avoids this because representation similarity is far more sensitive
to exact/near-duplicate content.

### 5. None of the methods distinguish memorized from merely-leaked

STRIDE score means: memorized = 7.43, leaked-not-memorized = 7.48, clean = 7.07.
The leaked groups are ~0.4 above clean mean, but indistinguishable from each other.

---

## Open questions

1. AirRep and STRIDE flag almost non-overlapping query sets — does an ensemble help?
2. Would STRIDE with more subsets (1000) recover stronger signal, or is gradient noise fundamental?
3. Is LoGRA's failure specific to small models / short sequences, or generic for this task?
4. How do results change at 1.5% contamination (3 more models running now)?
"""
|
|
| |
|
|
# Assemble the Gradio dashboard: a markdown header plus four tabs, each
# rendering one of the module-level DataFrames / the FINDINGS markdown.
with gr.Blocks(title="Attribution Method Comparison — MATH Contamination") as demo:
    # Page header: experiment framing shown above the tabs.
    gr.Markdown(
        "# Attribution Method Comparison: AirRep vs STRIDE vs LoGRA\n"
        "**Task:** detect benchmark contamination in Qwen2.5-0.5B fine-tuned on MATH. "
        "5 models evaluated (0.5%×2 seeds, 1%×3 seeds). "
        "Spearman ρ is against the binary leaked/not-leaked label. "
        "See the **Community** tab to discuss."
    )

    with gr.Tabs():
        # Tab 1: mean metrics across the 5 models (SUMMARY, rounded to 4 dp).
        with gr.Tab("Summary"):
            gr.Markdown("### Mean metrics over 5 models")
            gr.Dataframe(SUMMARY.round(4), label="Summary")
            gr.Markdown(
                "> **AirRep ≈ STRIDE on Spearman ρ (0.117 vs 0.115). LoGRA ≈ 0.** "
                "STRIDE edges AirRep on ROC-AUC; AirRep leads on AUPRC and MRR."
            )

        # Tab 2: per-model breakdown with p-values (PER_MODEL table).
        with gr.Tab("Per-model (with p-values)"):
            gr.Markdown("### Spearman ρ vs leaked label, per model\n"
                        "LoGRA p-values are all > 0.37 (non-significant). "
                        "AirRep and STRIDE are mostly significant at α=0.05.")
            gr.Dataframe(PER_MODEL.round(4), label="Per-model results")

        # Tab 3: inter-method rank agreement (PAIRWISE table).
        with gr.Tab("Pairwise ranking agreement"):
            gr.Markdown(
                "### Do the methods agree on which queries look contaminated?\n"
                "Spearman ρ between each pair of methods' score rankings (not vs ground truth). "
                "Near-zero means the methods are flagging almost entirely different queries."
            )
            gr.Dataframe(PAIRWISE.round(4), label="Pairwise Spearman ρ between method rankings")
            gr.Markdown(
                "> **AirRep↔STRIDE mean ρ = −0.06** — nearly anti-correlated. "
                "Both have independent signal, suggesting an ensemble could outperform either alone."
            )

        # Tab 4: long-form write-up (FINDINGS markdown defined above).
        with gr.Tab("Findings & Discussion"):
            gr.Markdown(FINDINGS)

# Serve the UI (blocking call; uses Gradio's default host/port settings).
demo.launch()
|
|