File size: 9,010 Bytes
2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b e831e4b 2e8969b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | import gradio as gr
import pandas as pd
# ── Data ─────────────────────────────────────────────────────────────────────
# Headline table shown in the "Summary" tab: one row per attribution method,
# one column per detection metric (the tab labels these as means over 5 models).
SUMMARY = pd.DataFrame(
    data=[
        ("AirRep (pretrained)", 0.1172, 0.132, 0.638, 0.16, 0.35, 0.344, 0.223),
        ("STRIDE (300 subsets)", 0.1152, 0.109, 0.647, 0.14, 0.32, 0.210, 0.211),
        ("LoGRA", -0.0064, 0.064, 0.497, 0.00, 0.00, 0.007, 0.192),
    ],
    columns=[
        "Method",
        "Spearman ρ",
        "AUPRC",
        "ROC-AUC",
        "R@10",
        "R@100",
        "MRR",
        "best-F1",
    ],
)
# Per-model breakdown shown in the "Per-model (with p-values)" tab: three rows
# (AirRep, LoGRA, STRIDE) for each of the five fine-tuned models. "p" is the
# p-value that accompanies the Spearman ρ column.
PER_MODEL = pd.DataFrame(
    data=[
        # 0.5% contamination, seed 0
        ("0.5%_seed0", "AirRep", 0.0856, 0.051, 0.066, 0.623, 0.00, 0.32, 0.071),
        ("0.5%_seed0", "LoGRA", 0.0133, 0.762, 0.044, 0.519, 0.00, 0.00, 0.005),
        ("0.5%_seed0", "STRIDE", 0.2158, 0.000, 0.131, 0.810, 0.10, 0.59, 0.200),
        # 0.5% contamination, seed 1
        ("0.5%_seed1", "AirRep", 0.1098, 0.012, 0.114, 0.658, 0.20, 0.36, 0.250),
        ("0.5%_seed1", "LoGRA", 0.0392, 0.372, 0.047, 0.556, 0.00, 0.00, 0.006),
        ("0.5%_seed1", "STRIDE", 0.1178, 0.007, 0.081, 0.669, 0.10, 0.41, 0.125),
        # 1% contamination, seed 0
        ("1%_seed0", "AirRep", 0.1709, 0.000, 0.182, 0.679, 0.30, 0.36, 1.000),
        ("1%_seed0", "LoGRA", 0.0154, 0.720, 0.081, 0.516, 0.00, 0.00, 0.010),
        ("1%_seed0", "STRIDE", 0.1281, 0.003, 0.138, 0.634, 0.30, 0.27, 0.333),
        # 1% contamination, seed 1
        ("1%_seed1", "AirRep", 0.0956, 0.026, 0.117, 0.600, 0.00, 0.31, 0.067),
        ("1%_seed1", "LoGRA", 0.0111, 0.796, 0.081, 0.512, 0.00, 0.00, 0.007),
        ("1%_seed1", "STRIDE", 0.0604, 0.159, 0.109, 0.563, 0.20, 0.20, 0.333),
        # 1% contamination, seed 2
        ("1%_seed2", "AirRep", 0.1241, 0.004, 0.181, 0.630, 0.30, 0.40, 0.333),
        ("1%_seed2", "LoGRA", -0.1111, 0.009, 0.065, 0.383, 0.00, 0.00, 0.009),
        ("1%_seed2", "STRIDE", 0.0537, 0.210, 0.088, 0.556, 0.00, 0.13, 0.059),
    ],
    columns=[
        "Model",
        "Method",
        "Spearman ρ",
        "p",
        "AUPRC",
        "ROC-AUC",
        "R@10",
        "R@100",
        "MRR",
    ],
)
# Agreement between methods, shown in the "Pairwise ranking agreement" tab:
# Spearman ρ between each pair of methods' rankings for every model, followed
# by a final "MEAN" row holding the column averages.
PAIRWISE = pd.DataFrame(
    data=[
        ("0.5%_seed0", -0.1209, 0.0635, 0.0268),
        ("0.5%_seed1", -0.2116, -0.0779, 0.0946),
        ("1%_seed0", 0.1615, 0.0093, 0.0626),
        ("1%_seed1", -0.1355, 0.0179, -0.0328),
        ("1%_seed2", -0.0041, 0.0442, 0.0369),
        ("MEAN", -0.0621, 0.0114, 0.0376),
    ],
    columns=["Model", "AirRep↔STRIDE", "AirRep↔LoGRA", "STRIDE↔LoGRA"],
)
# Long-form Markdown write-up rendered in the "Findings & Discussion" tab.
#
# Formatting notes:
#   * Block elements (headings, rules, tables, blockquotes, paragraphs) are
#     separated by blank lines. Without them a `---` directly under text is
#     parsed as a setext heading underline, and text directly after a `>` line
#     is absorbed into the blockquote.
#   * This is a regular (non-raw) string, so the LaTeX backslash is written
#     as `\\dfrac`.
#
# Content notes (kept in sync with the per-model and pairwise tables):
#   * LoGRA is non-significant on 4 of 5 models; on 1%_seed2 it has p = 0.009
#     with a negative ρ, so "all p-values > 0.37" would be false.
#   * A mean pairwise ρ of −0.06 is "uncorrelated", not "anti-correlated".
FINDINGS = """
## Setup

We fine-tune Qwen2.5-0.5B on MATH training data at contamination rates of **0.5% and 1%** (5 models, 2–3 seeds per rate).
A subset of MATH *test* examples is injected as exact replicas into the training pool.

**Task:** given a MATH test query, predict whether it appeared in training data.

**Query set per model:** 500 clean validation examples + leaked examples (~22–45 depending on rate).

---

## Key findings

### 1. AirRep ≈ STRIDE >> LoGRA on detection; LoGRA near-random

All Spearman ρ values are versus the binary leaked/not-leaked label.

**LoGRA's ρ averages −0.006** — statistically indistinguishable from zero, and negative
(ρ = −0.111) on one model. Its gradient influence scores have enormous variance (σ ≈ 2.9 × 10⁶) relative to
the mean gap between leaked and clean queries (~250K), making the signal undetectable.

**AirRep and STRIDE are comparable** (ρ ≈ 0.115–0.117) but reach that level via different mechanisms
(see finding 3).

### 2. LoGRA's p-values are non-significant on 4 of 5 models

LoGRA's Spearman p-value is > 0.37 on four of the five models. The one exception (1%_seed2,
p = 0.009) is a *negative* correlation (ρ = −0.111), so it is not evidence of detection either.
AirRep and STRIDE are mostly significant at α = 0.05 (AirRep on 4 of 5 models, STRIDE on 3 of 5).

### 3. AirRep and STRIDE rankings are essentially uncorrelated with each other

Pairwise Spearman ρ between method rankings (not vs ground truth):

| Pair | Mean ρ |
|------|--------|
| AirRep ↔ STRIDE | −0.06 |
| AirRep ↔ LoGRA | +0.01 |
| STRIDE ↔ LoGRA | +0.04 |

AirRep and STRIDE rankings are essentially **uncorrelated** (mean ρ slightly negative) — they flag
almost entirely different queries.
Both have signal against the ground truth, but via independent mechanisms. This suggests
an ensemble could outperform either method alone.

### 4. STRIDE false positives are "easy" problems

At 1% contamination, STRIDE's top-20 contains only 4 true positives. False positives are
short, concrete, textbook problems:

> *"Evaluate $\\dfrac{7}{45^2 - 38^2}$"*

> *"How many times does the digit 8 appear from 1 to 1000?"*

> *"If $f(x) = ax + b$ and $f(f(f(x))) = 8x + 21$, find $a + b$."*

STRIDE conflates *"training data has large gradient influence on this query"* with *"this query
was in training."* AirRep avoids this because representation similarity is far more sensitive
to exact/near-duplicate content.

### 5. None of the methods distinguish memorized from merely-leaked

STRIDE score means: memorized = 7.43, leaked-not-memorized = 7.48, clean = 7.07.
The leaked groups are ~0.4 above clean mean, but indistinguishable from each other.

---

## Open questions

1. AirRep and STRIDE flag almost non-overlapping query sets — does an ensemble help?
2. Would STRIDE with more subsets (1000) recover stronger signal, or is gradient noise fundamental?
3. Is LoGRA's failure specific to small models / short sequences, or generic for this task?
4. How do results change at 1.5% contamination (3 more models running now)?
"""
# ── UI ────────────────────────────────────────────────────────────────────────
# Dashboard layout: a header, then one tab per results table plus a final tab
# for the long-form write-up. Tables are rounded to 4 decimals for display only;
# the underlying DataFrames are not modified (`round` returns a new frame).
with gr.Blocks(title="Attribution Method Comparison — MATH Contamination") as demo:
    gr.Markdown(
        "# Attribution Method Comparison: AirRep vs STRIDE vs LoGRA\n"
        "**Task:** detect benchmark contamination in Qwen2.5-0.5B fine-tuned on MATH. "
        "5 models evaluated (0.5%×2 seeds, 1%×3 seeds). "
        "Spearman ρ is against the binary leaked/not-leaked label. "
        "See the **Community** tab to discuss."
    )

    with gr.Tabs():
        with gr.Tab("Summary"):
            gr.Markdown("### Mean metrics over 5 models")
            gr.Dataframe(SUMMARY.round(4), label="Summary")
            gr.Markdown(
                "> **AirRep ≈ STRIDE on Spearman ρ (0.117 vs 0.115). LoGRA ≈ 0.** "
                "STRIDE edges AirRep on ROC-AUC; AirRep leads on AUPRC and MRR."
            )

        with gr.Tab("Per-model (with p-values)"):
            # The LoGRA row for 1%_seed2 has p = 0.009 with a negative ρ, so the
            # caption states the exception instead of claiming every p > 0.37.
            gr.Markdown(
                "### Spearman ρ vs leaked label, per model\n"
                "LoGRA p-values are > 0.37 (non-significant) on 4 of 5 models; "
                "the exception, 1%_seed2 (p = 0.009), is a negative correlation. "
                "AirRep and STRIDE are mostly significant at α=0.05."
            )
            gr.Dataframe(PER_MODEL.round(4), label="Per-model results")

        with gr.Tab("Pairwise ranking agreement"):
            gr.Markdown(
                "### Do the methods agree on which queries look contaminated?\n"
                "Spearman ρ between each pair of methods' score rankings (not vs ground truth). "
                "Near-zero means the methods are flagging almost entirely different queries."
            )
            gr.Dataframe(PAIRWISE.round(4), label="Pairwise Spearman ρ between method rankings")
            # ρ = −0.06 is near zero, i.e. uncorrelated rather than anti-correlated.
            gr.Markdown(
                "> **AirRep↔STRIDE mean ρ = −0.06** — essentially uncorrelated. "
                "Both have independent signal, suggesting an ensemble could outperform either alone."
            )

        with gr.Tab("Findings & Discussion"):
            # FINDINGS quotes problems using single-`$` inline math. gr.Markdown
            # only renders `$$ ... $$` unless told otherwise, so both delimiter
            # styles are registered here (`$$` first so it is matched before `$`).
            gr.Markdown(
                FINDINGS,
                latex_delimiters=[
                    {"left": "$$", "right": "$$", "display": True},
                    {"left": "$", "right": "$", "display": False},
                ],
            )

# Launched at import time, as before, so running this file starts the app.
demo.launch()
|