"""Gradio dashboard: attribution-method comparison for MATH benchmark contamination.

Compares three training-data attribution methods (AirRep, STRIDE, LoGRA) on the
task of detecting which MATH test queries were leaked into the fine-tuning data
of Qwen2.5-0.5B models. Run directly (``python app.py``) to launch the server;
importing this module only builds ``demo`` without launching it.
"""

import gradio as gr
import pandas as pd

# ── Data ─────────────────────────────────────────────────────────────────────

# Mean detection metrics over the 5 fine-tuned models, one row per method.
# "Spearman ρ" is correlation of the method's score with the binary
# leaked/not-leaked label; R@k = recall among the top-k ranked queries.
SUMMARY = pd.DataFrame([
    {"Method": "AirRep (pretrained)", "Spearman ρ": 0.1172, "AUPRC": 0.132,
     "ROC-AUC": 0.638, "R@10": 0.16, "R@100": 0.35, "MRR": 0.344, "best-F1": 0.223},
    {"Method": "STRIDE (300 subsets)", "Spearman ρ": 0.1152, "AUPRC": 0.109,
     "ROC-AUC": 0.647, "R@10": 0.14, "R@100": 0.32, "MRR": 0.210, "best-F1": 0.211},
    {"Method": "LoGRA", "Spearman ρ": -0.0064, "AUPRC": 0.064,
     "ROC-AUC": 0.497, "R@10": 0.00, "R@100": 0.00, "MRR": 0.007, "best-F1": 0.192},
])

# Per-model breakdown ("Model" is contamination-rate_seed) including the
# Spearman p-value ("p") for each method's correlation with the leaked label.
PER_MODEL = pd.DataFrame([
    {"Model": "0.5%_seed0", "Method": "AirRep", "Spearman ρ": 0.0856, "p": 0.051,
     "AUPRC": 0.066, "ROC-AUC": 0.623, "R@10": 0.00, "R@100": 0.32, "MRR": 0.071},
    {"Model": "0.5%_seed0", "Method": "LoGRA", "Spearman ρ": 0.0133, "p": 0.762,
     "AUPRC": 0.044, "ROC-AUC": 0.519, "R@10": 0.00, "R@100": 0.00, "MRR": 0.005},
    {"Model": "0.5%_seed0", "Method": "STRIDE", "Spearman ρ": 0.2158, "p": 0.000,
     "AUPRC": 0.131, "ROC-AUC": 0.810, "R@10": 0.10, "R@100": 0.59, "MRR": 0.200},
    {"Model": "0.5%_seed1", "Method": "AirRep", "Spearman ρ": 0.1098, "p": 0.012,
     "AUPRC": 0.114, "ROC-AUC": 0.658, "R@10": 0.20, "R@100": 0.36, "MRR": 0.250},
    {"Model": "0.5%_seed1", "Method": "LoGRA", "Spearman ρ": 0.0392, "p": 0.372,
     "AUPRC": 0.047, "ROC-AUC": 0.556, "R@10": 0.00, "R@100": 0.00, "MRR": 0.006},
    {"Model": "0.5%_seed1", "Method": "STRIDE", "Spearman ρ": 0.1178, "p": 0.007,
     "AUPRC": 0.081, "ROC-AUC": 0.669, "R@10": 0.10, "R@100": 0.41, "MRR": 0.125},
    {"Model": "1%_seed0", "Method": "AirRep", "Spearman ρ": 0.1709, "p": 0.000,
     "AUPRC": 0.182, "ROC-AUC": 0.679, "R@10": 0.30, "R@100": 0.36, "MRR": 1.000},
    {"Model": "1%_seed0", "Method": "LoGRA", "Spearman ρ": 0.0154, "p": 0.720,
     "AUPRC": 0.081, "ROC-AUC": 0.516, "R@10": 0.00, "R@100": 0.00, "MRR": 0.010},
    {"Model": "1%_seed0", "Method": "STRIDE", "Spearman ρ": 0.1281, "p": 0.003,
     "AUPRC": 0.138, "ROC-AUC": 0.634, "R@10": 0.30, "R@100": 0.27, "MRR": 0.333},
    {"Model": "1%_seed1", "Method": "AirRep", "Spearman ρ": 0.0956, "p": 0.026,
     "AUPRC": 0.117, "ROC-AUC": 0.600, "R@10": 0.00, "R@100": 0.31, "MRR": 0.067},
    {"Model": "1%_seed1", "Method": "LoGRA", "Spearman ρ": 0.0111, "p": 0.796,
     "AUPRC": 0.081, "ROC-AUC": 0.512, "R@10": 0.00, "R@100": 0.00, "MRR": 0.007},
    {"Model": "1%_seed1", "Method": "STRIDE", "Spearman ρ": 0.0604, "p": 0.159,
     "AUPRC": 0.109, "ROC-AUC": 0.563, "R@10": 0.20, "R@100": 0.20, "MRR": 0.333},
    {"Model": "1%_seed2", "Method": "AirRep", "Spearman ρ": 0.1241, "p": 0.004,
     "AUPRC": 0.181, "ROC-AUC": 0.630, "R@10": 0.30, "R@100": 0.40, "MRR": 0.333},
    {"Model": "1%_seed2", "Method": "LoGRA", "Spearman ρ": -0.1111, "p": 0.009,
     "AUPRC": 0.065, "ROC-AUC": 0.383, "R@10": 0.00, "R@100": 0.00, "MRR": 0.009},
    {"Model": "1%_seed2", "Method": "STRIDE", "Spearman ρ": 0.0537, "p": 0.210,
     "AUPRC": 0.088, "ROC-AUC": 0.556, "R@10": 0.00, "R@100": 0.13, "MRR": 0.059},
])

# Spearman ρ between each pair of methods' score *rankings* (method vs method,
# not vs ground truth). Near-zero means the methods flag different queries.
PAIRWISE = pd.DataFrame([
    {"Model": "0.5%_seed0", "AirRep↔STRIDE": -0.1209, "AirRep↔LoGRA": 0.0635, "STRIDE↔LoGRA": 0.0268},
    {"Model": "0.5%_seed1", "AirRep↔STRIDE": -0.2116, "AirRep↔LoGRA": -0.0779, "STRIDE↔LoGRA": 0.0946},
    {"Model": "1%_seed0", "AirRep↔STRIDE": 0.1615, "AirRep↔LoGRA": 0.0093, "STRIDE↔LoGRA": 0.0626},
    {"Model": "1%_seed1", "AirRep↔STRIDE": -0.1355, "AirRep↔LoGRA": 0.0179, "STRIDE↔LoGRA": -0.0328},
    {"Model": "1%_seed2", "AirRep↔STRIDE": -0.0041, "AirRep↔LoGRA": 0.0442, "STRIDE↔LoGRA": 0.0369},
    {"Model": "MEAN", "AirRep↔STRIDE": -0.0621, "AirRep↔LoGRA": 0.0114, "STRIDE↔LoGRA": 0.0376},
])

# Long-form write-up rendered verbatim in the "Findings & Discussion" tab.
FINDINGS = """
## Setup

We fine-tune Qwen2.5-0.5B on MATH training data at contamination rates of
**0.5% and 1%** (5 models, 2–3 seeds per rate). A subset of MATH *test*
examples is injected as exact replicas into the training pool.

**Task:** given a MATH test query, predict whether it appeared in training data.

**Query set per model:** 500 clean validation examples + leaked examples
(~22–45 depending on rate).

---

## Key findings

### 1. AirRep ≈ STRIDE >> LoGRA on detection; LoGRA near-random

All Spearman ρ values are versus the binary leaked/not-leaked label.

**LoGRA's ρ averages −0.006** — statistically indistinguishable from zero, and
slightly negative on one model. Its gradient influence scores have enormous
variance (σ ≈ 2.9 × 10⁶) relative to the mean gap between leaked and clean
queries (~250K), making the signal undetectable.

**AirRep and STRIDE are comparable** (ρ ≈ 0.115–0.117) but reach that level via
different mechanisms (see finding 3).

### 2. LoGRA's p-values are all non-significant

Every LoGRA Spearman p-value is > 0.37. AirRep and STRIDE are mostly
significant at α = 0.05, especially at the 1% contamination rate.

### 3. AirRep and STRIDE rankings are essentially uncorrelated with each other

Pairwise Spearman ρ between method rankings (not vs ground truth):

| Pair | Mean ρ |
|------|--------|
| AirRep ↔ STRIDE | −0.06 |
| AirRep ↔ LoGRA | +0.01 |
| STRIDE ↔ LoGRA | +0.04 |

AirRep and STRIDE are nearly **anti-correlated** — they flag almost entirely
different queries. Both have signal against the ground truth, but via
independent mechanisms. This suggests an ensemble could outperform either
method alone.

### 4. STRIDE false positives are "easy" problems

At 1% contamination, STRIDE's top-20 contains only 4 true positives. False
positives are short, concrete, textbook problems:

> *"Evaluate $\\dfrac{7}{45^2 - 38^2}$"*
> *"How many times does the digit 8 appear from 1 to 1000?"*
> *"If $f(x) = ax + b$ and $f(f(f(x))) = 8x + 21$, find $a + b$."*

STRIDE conflates *"training data has large gradient influence on this query"*
with *"this query was in training."* AirRep avoids this because representation
similarity is far more sensitive to exact/near-duplicate content.

### 5. None of the methods distinguish memorized from merely-leaked

STRIDE score means: memorized = 7.43, leaked-not-memorized = 7.48, clean = 7.07.
The leaked groups are ~0.4 above clean mean, but indistinguishable from each
other.

---

## Open questions

1. AirRep and STRIDE flag almost non-overlapping query sets — does an ensemble help?
2. Would STRIDE with more subsets (1000) recover stronger signal, or is gradient noise fundamental?
3. Is LoGRA's failure specific to small models / short sequences, or generic for this task?
4. How do results change at 1.5% contamination (3 more models running now)?
"""

# ── UI ────────────────────────────────────────────────────────────────────────


def build_demo() -> gr.Blocks:
    """Construct and return the Gradio Blocks app (does not launch a server)."""
    with gr.Blocks(title="Attribution Method Comparison — MATH Contamination") as demo:
        gr.Markdown(
            "# Attribution Method Comparison: AirRep vs STRIDE vs LoGRA\n"
            "**Task:** detect benchmark contamination in Qwen2.5-0.5B fine-tuned on MATH. "
            "5 models evaluated (0.5%×2 seeds, 1%×3 seeds). "
            "Spearman ρ is against the binary leaked/not-leaked label. "
            "See the **Community** tab to discuss."
        )
        with gr.Tabs():
            with gr.Tab("Summary"):
                gr.Markdown("### Mean metrics over 5 models")
                gr.Dataframe(SUMMARY.round(4), label="Summary")
                gr.Markdown(
                    "> **AirRep ≈ STRIDE on Spearman ρ (0.117 vs 0.115). LoGRA ≈ 0.** "
                    "STRIDE edges AirRep on ROC-AUC; AirRep leads on AUPRC and MRR."
                )
            with gr.Tab("Per-model (with p-values)"):
                gr.Markdown(
                    "### Spearman ρ vs leaked label, per model\n"
                    "LoGRA p-values are all > 0.37 (non-significant). "
                    "AirRep and STRIDE are mostly significant at α=0.05."
                )
                gr.Dataframe(PER_MODEL.round(4), label="Per-model results")
            with gr.Tab("Pairwise ranking agreement"):
                gr.Markdown(
                    "### Do the methods agree on which queries look contaminated?\n"
                    "Spearman ρ between each pair of methods' score rankings (not vs ground truth). "
                    "Near-zero means the methods are flagging almost entirely different queries."
                )
                gr.Dataframe(PAIRWISE.round(4), label="Pairwise Spearman ρ between method rankings")
                gr.Markdown(
                    "> **AirRep↔STRIDE mean ρ = −0.06** — nearly anti-correlated. "
                    "Both have independent signal, suggesting an ensemble could outperform either alone."
                )
            with gr.Tab("Findings & Discussion"):
                gr.Markdown(FINDINGS)
    return demo


# Module-level handle so hosting platforms (and importers) can find the app.
demo = build_demo()

# Guarded launch: importing this module must not start a web server.
if __name__ == "__main__":
    demo.launch()