import gradio as gr
import pandas as pd
# ── Data ─────────────────────────────────────────────────────────────────────
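# Mean detection metrics per method, averaged over the 5 fine-tuned models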
SUMMARY = pd.DataFrame([
{"Method": "AirRep (pretrained)", "Spearman ρ": 0.1172, "AUPRC": 0.132, "ROC-AUC": 0.638, "R@10": 0.16, "R@100": 0.35, "MRR": 0.344, "best-F1": 0.223},
{"Method": "STRIDE (300 subsets)", "Spearman ρ": 0.1152, "AUPRC": 0.109, "ROC-AUC": 0.647, "R@10": 0.14, "R@100": 0.32, "MRR": 0.210, "best-F1": 0.211},
{"Method": "LoGRA", "Spearman ρ":-0.0064, "AUPRC": 0.064, "ROC-AUC": 0.497, "R@10": 0.00, "R@100": 0.00, "MRR": 0.007, "best-F1": 0.192},
])
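# Per-model breakdown: Spearman ρ vs the leaked label (with p-values) plus retrieval metrics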
PER_MODEL = pd.DataFrame([
{"Model": "0.5%_seed0", "Method": "AirRep", "Spearman ρ": 0.0856, "p": 0.051, "AUPRC": 0.066, "ROC-AUC": 0.623, "R@10": 0.00, "R@100": 0.32, "MRR": 0.071},
{"Model": "0.5%_seed0", "Method": "LoGRA", "Spearman ρ": 0.0133, "p": 0.762, "AUPRC": 0.044, "ROC-AUC": 0.519, "R@10": 0.00, "R@100": 0.00, "MRR": 0.005},
{"Model": "0.5%_seed0", "Method": "STRIDE", "Spearman ρ": 0.2158, "p": 0.000, "AUPRC": 0.131, "ROC-AUC": 0.810, "R@10": 0.10, "R@100": 0.59, "MRR": 0.200},
{"Model": "0.5%_seed1", "Method": "AirRep", "Spearman ρ": 0.1098, "p": 0.012, "AUPRC": 0.114, "ROC-AUC": 0.658, "R@10": 0.20, "R@100": 0.36, "MRR": 0.250},
{"Model": "0.5%_seed1", "Method": "LoGRA", "Spearman ρ": 0.0392, "p": 0.372, "AUPRC": 0.047, "ROC-AUC": 0.556, "R@10": 0.00, "R@100": 0.00, "MRR": 0.006},
{"Model": "0.5%_seed1", "Method": "STRIDE", "Spearman ρ": 0.1178, "p": 0.007, "AUPRC": 0.081, "ROC-AUC": 0.669, "R@10": 0.10, "R@100": 0.41, "MRR": 0.125},
{"Model": "1%_seed0", "Method": "AirRep", "Spearman ρ": 0.1709, "p": 0.000, "AUPRC": 0.182, "ROC-AUC": 0.679, "R@10": 0.30, "R@100": 0.36, "MRR": 1.000},
{"Model": "1%_seed0", "Method": "LoGRA", "Spearman ρ": 0.0154, "p": 0.720, "AUPRC": 0.081, "ROC-AUC": 0.516, "R@10": 0.00, "R@100": 0.00, "MRR": 0.010},
{"Model": "1%_seed0", "Method": "STRIDE", "Spearman ρ": 0.1281, "p": 0.003, "AUPRC": 0.138, "ROC-AUC": 0.634, "R@10": 0.30, "R@100": 0.27, "MRR": 0.333},
{"Model": "1%_seed1", "Method": "AirRep", "Spearman ρ": 0.0956, "p": 0.026, "AUPRC": 0.117, "ROC-AUC": 0.600, "R@10": 0.00, "R@100": 0.31, "MRR": 0.067},
{"Model": "1%_seed1", "Method": "LoGRA", "Spearman ρ": 0.0111, "p": 0.796, "AUPRC": 0.081, "ROC-AUC": 0.512, "R@10": 0.00, "R@100": 0.00, "MRR": 0.007},
{"Model": "1%_seed1", "Method": "STRIDE", "Spearman ρ": 0.0604, "p": 0.159, "AUPRC": 0.109, "ROC-AUC": 0.563, "R@10": 0.20, "R@100": 0.20, "MRR": 0.333},
{"Model": "1%_seed2", "Method": "AirRep", "Spearman ρ": 0.1241, "p": 0.004, "AUPRC": 0.181, "ROC-AUC": 0.630, "R@10": 0.30, "R@100": 0.40, "MRR": 0.333},
{"Model": "1%_seed2", "Method": "LoGRA", "Spearman ρ":-0.1111, "p": 0.009, "AUPRC": 0.065, "ROC-AUC": 0.383, "R@10": 0.00, "R@100": 0.00, "MRR": 0.009},
{"Model": "1%_seed2", "Method": "STRIDE", "Spearman ρ": 0.0537, "p": 0.210, "AUPRC": 0.088, "ROC-AUC": 0.556, "R@10": 0.00, "R@100": 0.13, "MRR": 0.059},
])
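# Spearman ρ between each pair of methods' score rankings (not vs ground truth)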
PAIRWISE = pd.DataFrame([
{"Model": "0.5%_seed0", "AirRep↔STRIDE": -0.1209, "AirRep↔LoGRA": 0.0635, "STRIDE↔LoGRA": 0.0268},
{"Model": "0.5%_seed1", "AirRep↔STRIDE": -0.2116, "AirRep↔LoGRA": -0.0779, "STRIDE↔LoGRA": 0.0946},
{"Model": "1%_seed0", "AirRep↔STRIDE": 0.1615, "AirRep↔LoGRA": 0.0093, "STRIDE↔LoGRA": 0.0626},
{"Model": "1%_seed1", "AirRep↔STRIDE": -0.1355, "AirRep↔LoGRA": 0.0179, "STRIDE↔LoGRA": -0.0328},
{"Model": "1%_seed2", "AirRep↔STRIDE": -0.0041, "AirRep↔LoGRA": 0.0442, "STRIDE↔LoGRA": 0.0369},
{"Model": "MEAN", "AirRep↔STRIDE": -0.0621, "AirRep↔LoGRA": 0.0114, "STRIDE↔LoGRA": 0.0376},
])
FINDINGS = """
## Setup
We fine-tune Qwen2.5-0.5B on MATH training data at contamination rates of **0.5% and 1%** (5 models: 2 seeds at 0.5%, 3 seeds at 1%).
A subset of MATH *test* examples is injected as exact replicas into the training pool.
**Task:** given a MATH test query, predict whether it appeared in training data.
**Query set per model:** 500 clean validation examples + leaked examples (~22–45 depending on rate).
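
For reference, a minimal sketch of how the reported metrics can be computed per model from a vector of per-query contamination scores and the binary leaked labels. The `scores`/`leaked` names and the exact R@k / MRR conventions shown are assumptions rather than the project's evaluation code; best-F1 (presumably the maximum F1 over score thresholds) is omitted for brevity.

```python
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics import average_precision_score, roc_auc_score

def detection_metrics(scores, leaked, ks=(10, 100)):
    # scores: per-query contamination scores (higher = more suspicious)
    # leaked: 0/1 labels, 1 = the query was injected into the training pool
    scores, leaked = np.asarray(scores, float), np.asarray(leaked, int)
    rho, p = spearmanr(scores, leaked)                 # Spearman ρ vs the binary label
    auprc = average_precision_score(leaked, scores)
    auc = roc_auc_score(leaked, scores)
    ranked = leaked[np.argsort(-scores)]               # labels ordered most-suspicious first
    recall_at = {f"R@{k}": ranked[:k].sum() / leaked.sum() for k in ks}
    mrr = 1.0 / (np.flatnonzero(ranked)[0] + 1)        # reciprocal rank of the first true leak
    return {"Spearman ρ": rho, "p": p, "AUPRC": auprc, "ROC-AUC": auc, **recall_at, "MRR": mrr}
```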
---
## Key findings
### 1. AirRep ≈ STRIDE >> LoGRA on detection; LoGRA near-random
All Spearman ρ values are versus the binary leaked/not-leaked label.
**LoGRA's ρ averages −0.006**, statistically indistinguishable from zero, and on one model
(1%_seed2) it is significantly *negative* (ρ = −0.11, p = 0.009). Its gradient-influence scores have
enormous variance (σ ≈ 2.9 × 10⁶) relative to the mean gap between leaked and clean queries
(~250K, i.e. only about 0.09 σ), which buries the signal in noise.

**AirRep and STRIDE are comparable** (ρ ≈ 0.115–0.117) but reach that level via different mechanisms
(see finding 3).
### 2. LoGRA never reaches significance in the right direction
Four of LoGRA's five Spearman p-values exceed 0.37; its only p < 0.05 belongs to a *negative*
correlation (1%_seed2). AirRep and STRIDE are mostly significant at α = 0.05, especially at the
1% contamination rate.
### 3. AirRep and STRIDE rankings are essentially uncorrelated with each other
Pairwise Spearman ρ between method rankings (not vs ground truth):

| Pair | Mean ρ |
|------|--------|
| AirRep ↔ STRIDE | −0.06 |
| AirRep ↔ LoGRA | +0.01 |
| STRIDE ↔ LoGRA | +0.04 |

The AirRep and STRIDE rankings are essentially **uncorrelated** (mean ρ = −0.06): the two methods
flag almost entirely different queries. Both carry signal against the ground truth, but via
independent mechanisms, which suggests an ensemble could outperform either method alone (a minimal
rank-average sketch follows below).
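
As an illustration only, here is what such a rank-average ensemble could look like, run on synthetic placeholder scores (the real per-query AirRep/STRIDE outputs are not bundled with this Space):

```python
import numpy as np
from scipy.stats import rankdata, spearmanr

rng = np.random.default_rng(0)
n_clean, n_leaked = 500, 30                      # roughly the per-model query-set sizes
leaked = np.concatenate([np.ones(n_leaked), np.zeros(n_clean)])

# Placeholder scores standing in for per-query AirRep / STRIDE outputs
airrep_scores = rng.normal(size=leaked.size) + 0.5 * leaked
stride_scores = rng.normal(size=leaked.size) + 0.5 * leaked

# Pairwise agreement between the two methods (the table above)
rho_pair, _ = spearmanr(airrep_scores, stride_scores)

# Rank-average ensemble: convert each method's scores to ranks, then average per query
ensemble = (rankdata(airrep_scores) + rankdata(stride_scores)) / 2

for name, s in [("AirRep", airrep_scores), ("STRIDE", stride_scores), ("ensemble", ensemble)]:
    rho, p = spearmanr(s, leaked)
    print(f"{name:8s}  ρ vs leaked = {rho:+.3f}  (p = {p:.4f})")
```

Rank-averaging puts both methods on a common scale before combining them, which sidesteps their very different raw-score ranges.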
### 4. STRIDE false positives are "easy" problems
At 1% contamination, STRIDE's top-20 contains only 4 true positives. False positives are
short, concrete, textbook problems:
> *"Evaluate $\\dfrac{7}{45^2 - 38^2}$"*
> *"How many times does the digit 8 appear from 1 to 1000?"*
> *"If $f(x) = ax + b$ and $f(f(f(x))) = 8x + 21$, find $a + b$."*

STRIDE conflates *"training data has large gradient influence on this query"* with *"this query
was in training."* AirRep avoids this because representation similarity is far more sensitive
to exact/near-duplicate content.
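
A small helper of the kind used to pull out such lists could look like the sketch below (hypothetical; it assumes a per-query DataFrame with a `query` text column, a score column, and a 0/1 `leaked` column):

```python
import pandas as pd

def top_k_report(df: pd.DataFrame, score_col: str, k: int = 20) -> pd.DataFrame:
    # df: one row per test query, with columns 'query' (text), score_col, 'leaked' (0/1)
    top = df.nlargest(k, score_col)
    precision = top["leaked"].mean()             # e.g. 4/20 = 0.20 for STRIDE at 1%
    print(f"precision@{k} for {score_col}: {precision:.2f}")
    # Return the false positives: high-scoring queries that were never in training
    return top.loc[top["leaked"] == 0, ["query", score_col]]
```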
### 5. None of the methods distinguish memorized from merely-leaked
STRIDE score means: memorized = 7.43, leaked-not-memorized = 7.48, clean = 7.07.
Both leaked groups sit ~0.4 above the clean mean but are indistinguishable from each other.

---
## Open questions
1. AirRep and STRIDE flag almost non-overlapping query sets — does an ensemble help?
2. Would STRIDE with more subsets (1000) recover stronger signal, or is gradient noise fundamental?
3. Is LoGRA's failure specific to small models / short sequences, or generic for this task?
4. How do results change at 1.5% contamination (3 more models running now)?
"""
# ── UI ────────────────────────────────────────────────────────────────────────
with gr.Blocks(title="Attribution Method Comparison — MATH Contamination") as demo:
    gr.Markdown(
        "# Attribution Method Comparison: AirRep vs STRIDE vs LoGRA\n"
        "**Task:** detect benchmark contamination in Qwen2.5-0.5B fine-tuned on MATH. "
        "5 models evaluated (0.5%×2 seeds, 1%×3 seeds). "
        "Spearman ρ is against the binary leaked/not-leaked label. "
        "See the **Community** tab to discuss."
    )
    with gr.Tabs():
        with gr.Tab("Summary"):
            gr.Markdown("### Mean metrics over 5 models")
            gr.Dataframe(SUMMARY.round(4), label="Summary")
            gr.Markdown(
                "> **AirRep ≈ STRIDE on Spearman ρ (0.117 vs 0.115). LoGRA ≈ 0.** "
                "STRIDE edges AirRep on ROC-AUC; AirRep leads on AUPRC and MRR."
            )
        with gr.Tab("Per-model (with p-values)"):
            gr.Markdown(
                "### Spearman ρ vs leaked label, per model\n"
                "LoGRA never reaches significance in the positive direction: four of five "
                "p-values exceed 0.37, and its only p < 0.05 comes with a negative ρ (1%_seed2). "
                "AirRep and STRIDE are mostly significant at α = 0.05."
            )
            gr.Dataframe(PER_MODEL.round(4), label="Per-model results")
        with gr.Tab("Pairwise ranking agreement"):
            gr.Markdown(
                "### Do the methods agree on which queries look contaminated?\n"
                "Spearman ρ between each pair of methods' score rankings (not vs ground truth). "
                "Near-zero means the methods are flagging almost entirely different queries."
            )
            gr.Dataframe(PAIRWISE.round(4), label="Pairwise Spearman ρ between method rankings")
            gr.Markdown(
                "> **AirRep↔STRIDE mean ρ = −0.06**: the two rankings are essentially uncorrelated. "
                "Both carry independent signal, suggesting an ensemble could outperform either alone."
            )
        with gr.Tab("Findings & Discussion"):
            gr.Markdown(FINDINGS)

demo.launch()