import gradio as gr
import pandas as pd
# ── Data ─────────────────────────────────────────────────────────────────────
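# Mean detection metrics per method, averaged over the 5 fine-tuned models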
SUMMARY = pd.DataFrame([
{"Method": "AirRep (pretrained)", "Spearman ρ": 0.1172, "AUPRC": 0.132, "ROC-AUC": 0.638, "R@10": 0.16, "R@100": 0.35, "MRR": 0.344, "best-F1": 0.223},
{"Method": "STRIDE (300 subsets)", "Spearman ρ": 0.1152, "AUPRC": 0.109, "ROC-AUC": 0.647, "R@10": 0.14, "R@100": 0.32, "MRR": 0.210, "best-F1": 0.211},
{"Method": "LoGRA", "Spearman ρ":-0.0064, "AUPRC": 0.064, "ROC-AUC": 0.497, "R@10": 0.00, "R@100": 0.00, "MRR": 0.007, "best-F1": 0.192},
])
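# Per-model breakdown: Spearman ρ vs the leaked label (with p-values) plus retrieval metrics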
PER_MODEL = pd.DataFrame([
{"Model": "0.5%_seed0", "Method": "AirRep", "Spearman ρ": 0.0856, "p": 0.051, "AUPRC": 0.066, "ROC-AUC": 0.623, "R@10": 0.00, "R@100": 0.32, "MRR": 0.071},
{"Model": "0.5%_seed0", "Method": "LoGRA", "Spearman ρ": 0.0133, "p": 0.762, "AUPRC": 0.044, "ROC-AUC": 0.519, "R@10": 0.00, "R@100": 0.00, "MRR": 0.005},
{"Model": "0.5%_seed0", "Method": "STRIDE", "Spearman ρ": 0.2158, "p": 0.000, "AUPRC": 0.131, "ROC-AUC": 0.810, "R@10": 0.10, "R@100": 0.59, "MRR": 0.200},
{"Model": "0.5%_seed1", "Method": "AirRep", "Spearman ρ": 0.1098, "p": 0.012, "AUPRC": 0.114, "ROC-AUC": 0.658, "R@10": 0.20, "R@100": 0.36, "MRR": 0.250},
{"Model": "0.5%_seed1", "Method": "LoGRA", "Spearman ρ": 0.0392, "p": 0.372, "AUPRC": 0.047, "ROC-AUC": 0.556, "R@10": 0.00, "R@100": 0.00, "MRR": 0.006},
{"Model": "0.5%_seed1", "Method": "STRIDE", "Spearman ρ": 0.1178, "p": 0.007, "AUPRC": 0.081, "ROC-AUC": 0.669, "R@10": 0.10, "R@100": 0.41, "MRR": 0.125},
{"Model": "1%_seed0", "Method": "AirRep", "Spearman ρ": 0.1709, "p": 0.000, "AUPRC": 0.182, "ROC-AUC": 0.679, "R@10": 0.30, "R@100": 0.36, "MRR": 1.000},
{"Model": "1%_seed0", "Method": "LoGRA", "Spearman ρ": 0.0154, "p": 0.720, "AUPRC": 0.081, "ROC-AUC": 0.516, "R@10": 0.00, "R@100": 0.00, "MRR": 0.010},
{"Model": "1%_seed0", "Method": "STRIDE", "Spearman ρ": 0.1281, "p": 0.003, "AUPRC": 0.138, "ROC-AUC": 0.634, "R@10": 0.30, "R@100": 0.27, "MRR": 0.333},
{"Model": "1%_seed1", "Method": "AirRep", "Spearman ρ": 0.0956, "p": 0.026, "AUPRC": 0.117, "ROC-AUC": 0.600, "R@10": 0.00, "R@100": 0.31, "MRR": 0.067},
{"Model": "1%_seed1", "Method": "LoGRA", "Spearman ρ": 0.0111, "p": 0.796, "AUPRC": 0.081, "ROC-AUC": 0.512, "R@10": 0.00, "R@100": 0.00, "MRR": 0.007},
{"Model": "1%_seed1", "Method": "STRIDE", "Spearman ρ": 0.0604, "p": 0.159, "AUPRC": 0.109, "ROC-AUC": 0.563, "R@10": 0.20, "R@100": 0.20, "MRR": 0.333},
{"Model": "1%_seed2", "Method": "AirRep", "Spearman ρ": 0.1241, "p": 0.004, "AUPRC": 0.181, "ROC-AUC": 0.630, "R@10": 0.30, "R@100": 0.40, "MRR": 0.333},
{"Model": "1%_seed2", "Method": "LoGRA", "Spearman ρ":-0.1111, "p": 0.009, "AUPRC": 0.065, "ROC-AUC": 0.383, "R@10": 0.00, "R@100": 0.00, "MRR": 0.009},
{"Model": "1%_seed2", "Method": "STRIDE", "Spearman ρ": 0.0537, "p": 0.210, "AUPRC": 0.088, "ROC-AUC": 0.556, "R@10": 0.00, "R@100": 0.13, "MRR": 0.059},
])
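# Spearman ρ between each pair of methods' score rankings (not vs ground truth)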
PAIRWISE = pd.DataFrame([
{"Model": "0.5%_seed0", "AirRep↔STRIDE": -0.1209, "AirRep↔LoGRA": 0.0635, "STRIDE↔LoGRA": 0.0268},
{"Model": "0.5%_seed1", "AirRep↔STRIDE": -0.2116, "AirRep↔LoGRA": -0.0779, "STRIDE↔LoGRA": 0.0946},
{"Model": "1%_seed0", "AirRep↔STRIDE": 0.1615, "AirRep↔LoGRA": 0.0093, "STRIDE↔LoGRA": 0.0626},
{"Model": "1%_seed1", "AirRep↔STRIDE": -0.1355, "AirRep↔LoGRA": 0.0179, "STRIDE↔LoGRA": -0.0328},
{"Model": "1%_seed2", "AirRep↔STRIDE": -0.0041, "AirRep↔LoGRA": 0.0442, "STRIDE↔LoGRA": 0.0369},
{"Model": "MEAN", "AirRep↔STRIDE": -0.0621, "AirRep↔LoGRA": 0.0114, "STRIDE↔LoGRA": 0.0376},
])
FINDINGS = """
## Setup
We fine-tune Qwen2.5-0.5B on MATH training data at contamination rates of **0.5% and 1%** (5 models: 2 seeds at 0.5%, 3 seeds at 1%).
A subset of MATH *test* examples is injected as exact replicas into the training pool.
**Task:** given a MATH test query, predict whether it appeared in training data.
**Query set per model:** 500 clean validation examples + leaked examples (~22–45 depending on rate).
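
For reference, a minimal sketch of how the reported metrics can be computed per model from a vector of per-query contamination scores and the binary leaked labels. The `scores`/`leaked` names and the exact R@k / MRR conventions shown are assumptions rather than the project's evaluation code; best-F1 (presumably the maximum F1 over score thresholds) is omitted for brevity.

```python
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics import average_precision_score, roc_auc_score

def detection_metrics(scores, leaked, ks=(10, 100)):
    # scores: per-query contamination scores (higher = more suspicious)
    # leaked: 0/1 labels, 1 = the query was injected into the training pool
    scores, leaked = np.asarray(scores, float), np.asarray(leaked, int)
    rho, p = spearmanr(scores, leaked)                 # Spearman ρ vs the binary label
    auprc = average_precision_score(leaked, scores)
    auc = roc_auc_score(leaked, scores)
    ranked = leaked[np.argsort(-scores)]               # labels ordered most-suspicious first
    recall_at = {f"R@{k}": ranked[:k].sum() / leaked.sum() for k in ks}
    mrr = 1.0 / (np.flatnonzero(ranked)[0] + 1)        # reciprocal rank of the first true leak
    return {"Spearman ρ": rho, "p": p, "AUPRC": auprc, "ROC-AUC": auc, **recall_at, "MRR": mrr}
```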
---
## Key findings
### 1. AirRep ≈ STRIDE >> LoGRA on detection; LoGRA near-random
All Spearman ρ values are versus the binary leaked/not-leaked label.
**LoGRA's ρ averages −0.006**, statistically indistinguishable from zero, and on one model
(1%_seed2) it is significantly *negative* (ρ = −0.11, p = 0.009). Its gradient-influence scores have
enormous variance (σ ≈ 2.9 × 10⁶) relative to the mean gap between leaked and clean queries
(~250K, i.e. only about 0.09 σ), which buries the signal in noise.

**AirRep and STRIDE are comparable** (ρ ≈ 0.115–0.117) but reach that level via different mechanisms
(see finding 3).
### 2. LoGRA never reaches significance in the right direction
Four of LoGRA's five Spearman p-values exceed 0.37; its only p < 0.05 belongs to a *negative*
correlation (1%_seed2). AirRep and STRIDE are mostly significant at α = 0.05, especially at the
1% contamination rate.
### 3. AirRep and STRIDE rankings are essentially uncorrelated with each other
Pairwise Spearman ρ between method rankings (not vs ground truth):

| Pair | Mean ρ |
|------|--------|
| AirRep ↔ STRIDE | −0.06 |
| AirRep ↔ LoGRA | +0.01 |
| STRIDE ↔ LoGRA | +0.04 |

The AirRep and STRIDE rankings are essentially **uncorrelated** (mean ρ = −0.06): the two methods
flag almost entirely different queries. Both carry signal against the ground truth, but via
independent mechanisms, which suggests an ensemble could outperform either method alone (a minimal
rank-average sketch follows below).
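
As an illustration only, here is what such a rank-average ensemble could look like, run on synthetic placeholder scores (the real per-query AirRep/STRIDE outputs are not bundled with this Space):

```python
import numpy as np
from scipy.stats import rankdata, spearmanr

rng = np.random.default_rng(0)
n_clean, n_leaked = 500, 30                      # roughly the per-model query-set sizes
leaked = np.concatenate([np.ones(n_leaked), np.zeros(n_clean)])

# Placeholder scores standing in for per-query AirRep / STRIDE outputs
airrep_scores = rng.normal(size=leaked.size) + 0.5 * leaked
stride_scores = rng.normal(size=leaked.size) + 0.5 * leaked

# Pairwise agreement between the two methods (the table above)
rho_pair, _ = spearmanr(airrep_scores, stride_scores)

# Rank-average ensemble: convert each method's scores to ranks, then average per query
ensemble = (rankdata(airrep_scores) + rankdata(stride_scores)) / 2

for name, s in [("AirRep", airrep_scores), ("STRIDE", stride_scores), ("ensemble", ensemble)]:
    rho, p = spearmanr(s, leaked)
    print(f"{name:8s}  ρ vs leaked = {rho:+.3f}  (p = {p:.4f})")
```

Rank-averaging puts both methods on a common scale before combining them, which sidesteps their very different raw-score ranges.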
### 4. STRIDE false positives are "easy" problems
At 1% contamination, STRIDE's top-20 contains only 4 true positives. False positives are
short, concrete, textbook problems:
> *"Evaluate $\\dfrac{7}{45^2 - 38^2}$"*
> *"How many times does the digit 8 appear from 1 to 1000?"*
> *"If $f(x) = ax + b$ and $f(f(f(x))) = 8x + 21$, find $a + b$."*

STRIDE conflates *"training data has large gradient influence on this query"* with *"this query
was in training."* AirRep avoids this because representation similarity is far more sensitive
to exact/near-duplicate content.
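
A small helper of the kind used to pull out such lists could look like the sketch below (hypothetical; it assumes a per-query DataFrame with a `query` text column, a score column, and a 0/1 `leaked` column):

```python
import pandas as pd

def top_k_report(df: pd.DataFrame, score_col: str, k: int = 20) -> pd.DataFrame:
    # df: one row per test query, with columns 'query' (text), score_col, 'leaked' (0/1)
    top = df.nlargest(k, score_col)
    precision = top["leaked"].mean()             # e.g. 4/20 = 0.20 for STRIDE at 1%
    print(f"precision@{k} for {score_col}: {precision:.2f}")
    # Return the false positives: high-scoring queries that were never in training
    return top.loc[top["leaked"] == 0, ["query", score_col]]
```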
### 5. None of the methods distinguish memorized from merely-leaked
STRIDE score means: memorized = 7.43, leaked-not-memorized = 7.48, clean = 7.07.
Both leaked groups sit ~0.4 above the clean mean but are indistinguishable from each other.

---
## Open questions
1. AirRep and STRIDE flag almost non-overlapping query sets — does an ensemble help?
2. Would STRIDE with more subsets (1000) recover stronger signal, or is gradient noise fundamental?
3. Is LoGRA's failure specific to small models / short sequences, or generic for this task?
4. How do results change at 1.5% contamination (3 more models running now)?
"""
# ── UI ────────────────────────────────────────────────────────────────────────
with gr.Blocks(title="Attribution Method Comparison — MATH Contamination") as demo:
    gr.Markdown(
        "# Attribution Method Comparison: AirRep vs STRIDE vs LoGRA\n"
        "**Task:** detect benchmark contamination in Qwen2.5-0.5B fine-tuned on MATH. "
        "5 models evaluated (0.5%×2 seeds, 1%×3 seeds). "
        "Spearman ρ is against the binary leaked/not-leaked label. "
        "See the **Community** tab to discuss."
    )
    with gr.Tabs():
        with gr.Tab("Summary"):
            gr.Markdown("### Mean metrics over 5 models")
            gr.Dataframe(SUMMARY.round(4), label="Summary")
            gr.Markdown(
                "> **AirRep ≈ STRIDE on Spearman ρ (0.117 vs 0.115). LoGRA ≈ 0.** "
                "STRIDE edges AirRep on ROC-AUC; AirRep leads on AUPRC and MRR."
            )
        with gr.Tab("Per-model (with p-values)"):
            gr.Markdown(
                "### Spearman ρ vs leaked label, per model\n"
                "LoGRA never reaches significance in the positive direction: four of five "
                "p-values exceed 0.37, and its only p < 0.05 comes with a negative ρ (1%_seed2). "
                "AirRep and STRIDE are mostly significant at α = 0.05."
            )
            gr.Dataframe(PER_MODEL.round(4), label="Per-model results")
        with gr.Tab("Pairwise ranking agreement"):
            gr.Markdown(
                "### Do the methods agree on which queries look contaminated?\n"
                "Spearman ρ between each pair of methods' score rankings (not vs ground truth). "
                "Near-zero means the methods are flagging almost entirely different queries."
            )
            gr.Dataframe(PAIRWISE.round(4), label="Pairwise Spearman ρ between method rankings")
            gr.Markdown(
                "> **AirRep↔STRIDE mean ρ = −0.06**: the two rankings are essentially uncorrelated. "
                "Both carry independent signal, suggesting an ensemble could outperform either alone."
            )
        with gr.Tab("Findings & Discussion"):
            gr.Markdown(FINDINGS)

demo.launch()