Spaces:

stride-influence
/

attribution-comparison

Sleeping

App Files Files Community

amirali1985 committed 21 days ago

Commit

2e8969b

verified ·

1 Parent(s): 459cb57

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +126 -0

app.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import gradio as gr
+import pandas as pd
+# ── Data ─────────────────────────────────────────────────────────────────────
+SUMMARY = pd.DataFrame([
+    {"Method": "AirRep (pretrained)",  "AUPRC": 0.132, "ROC-AUC": 0.638, "R@10": 0.16, "R@50": 0.23, "R@100": 0.35, "MRR": 0.344, "best-F1": 0.223, "hit@10": 0.93},
+    {"Method": "STRIDE (300 subsets)", "AUPRC": 0.109, "ROC-AUC": 0.647, "R@10": 0.14, "R@50": 0.20, "R@100": 0.32, "MRR": 0.210, "best-F1": 0.211, "hit@10": 0.02},
+    {"Method": "LoGRA",                "AUPRC": 0.064, "ROC-AUC": 0.497, "R@10": 0.00, "R@50": 0.00, "R@100": 0.00, "MRR": 0.007, "best-F1": 0.192, "hit@10": 0.07},
+])
+PER_MODEL = pd.DataFrame([
+    {"Model": "0.5%_seed0", "Method": "AirRep",  "AUPRC": 0.066, "ROC-AUC": 0.623, "R@10": 0.00, "R@100": 0.32, "MRR": 0.071},
+    {"Model": "0.5%_seed0", "Method": "LoGRA",   "AUPRC": 0.044, "ROC-AUC": 0.519, "R@10": 0.00, "R@100": 0.00, "MRR": 0.005},
+    {"Model": "0.5%_seed0", "Method": "STRIDE",  "AUPRC": 0.131, "ROC-AUC": 0.810, "R@10": 0.10, "R@100": 0.59, "MRR": 0.200},
+    {"Model": "0.5%_seed1", "Method": "AirRep",  "AUPRC": 0.114, "ROC-AUC": 0.658, "R@10": 0.20, "R@100": 0.36, "MRR": 0.250},
+    {"Model": "0.5%_seed1", "Method": "LoGRA",   "AUPRC": 0.047, "ROC-AUC": 0.556, "R@10": 0.00, "R@100": 0.00, "MRR": 0.006},
+    {"Model": "0.5%_seed1", "Method": "STRIDE",  "AUPRC": 0.081, "ROC-AUC": 0.669, "R@10": 0.10, "R@100": 0.41, "MRR": 0.125},
+    {"Model": "1%_seed0",   "Method": "AirRep",  "AUPRC": 0.182, "ROC-AUC": 0.679, "R@10": 0.30, "R@100": 0.36, "MRR": 1.000},
+    {"Model": "1%_seed0",   "Method": "LoGRA",   "AUPRC": 0.081, "ROC-AUC": 0.516, "R@10": 0.00, "R@100": 0.00, "MRR": 0.010},
+    {"Model": "1%_seed0",   "Method": "STRIDE",  "AUPRC": 0.138, "ROC-AUC": 0.634, "R@10": 0.30, "R@100": 0.27, "MRR": 0.333},
+    {"Model": "1%_seed1",   "Method": "AirRep",  "AUPRC": 0.117, "ROC-AUC": 0.600, "R@10": 0.00, "R@100": 0.31, "MRR": 0.067},
+    {"Model": "1%_seed1",   "Method": "LoGRA",   "AUPRC": 0.081, "ROC-AUC": 0.512, "R@10": 0.00, "R@100": 0.00, "MRR": 0.007},
+    {"Model": "1%_seed1",   "Method": "STRIDE",  "AUPRC": 0.109, "ROC-AUC": 0.563, "R@10": 0.20, "R@100": 0.20, "MRR": 0.333},
+    {"Model": "1%_seed2",   "Method": "AirRep",  "AUPRC": 0.181, "ROC-AUC": 0.630, "R@10": 0.30, "R@100": 0.40, "MRR": 0.333},
+    {"Model": "1%_seed2",   "Method": "LoGRA",   "AUPRC": 0.065, "ROC-AUC": 0.383, "R@10": 0.00, "R@100": 0.00, "MRR": 0.009},
+    {"Model": "1%_seed2",   "Method": "STRIDE",  "AUPRC": 0.088, "ROC-AUC": 0.556, "R@10": 0.00, "R@100": 0.13, "MRR": 0.059},
+])
+RETRIEVAL = pd.DataFrame([
+    {"Model": "0.5%_seed0", "Method": "AirRep",  "hit@1": 1.00, "hit@5": 1.00, "hit@10": 1.00, "hit@20": 1.00, "hit@100": 1.00, "random@10": 0.048},
+    {"Model": "0.5%_seed0", "Method": "LoGRA",   "hit@1": 0.00, "hit@5": 0.05, "hit@10": 0.09, "hit@20": 0.14, "hit@100": 0.41, "random@10": 0.048},
+    {"Model": "0.5%_seed0", "Method": "STRIDE",  "hit@1": 0.00, "hit@5": 0.00, "hit@10": 0.05, "hit@20": 0.14, "hit@100": 0.23, "random@10": 0.048},
+    {"Model": "1%_seed0",   "Method": "AirRep",  "hit@1": 0.91, "hit@5": 0.91, "hit@10": 0.91, "hit@20": 0.91, "hit@100": 0.91, "random@10": 0.043},
+    {"Model": "1%_seed0",   "Method": "LoGRA",   "hit@1": 0.00, "hit@5": 0.02, "hit@10": 0.04, "hit@20": 0.11, "hit@100": 0.40, "random@10": 0.043},
+    {"Model": "1%_seed0",   "Method": "STRIDE",  "hit@1": 0.00, "hit@5": 0.00, "hit@10": 0.00, "hit@20": 0.02, "hit@100": 0.22, "random@10": 0.043},
+    {"Model": "1%_seed1",   "Method": "AirRep",  "hit@1": 0.87, "hit@5": 0.89, "hit@10": 0.89, "hit@20": 0.89, "hit@100": 0.89, "random@10": 0.043},
+    {"Model": "1%_seed1",   "Method": "LoGRA",   "hit@1": 0.02, "hit@5": 0.04, "hit@10": 0.07, "hit@20": 0.13, "hit@100": 0.36, "random@10": 0.043},
+    {"Model": "1%_seed1",   "Method": "STRIDE",  "hit@1": 0.00, "hit@5": 0.00, "hit@10": 0.00, "hit@20": 0.04, "hit@100": 0.09, "random@10": 0.043},
+])
+FINDINGS = """
+## Setup
+We fine-tune Qwen2.5-0.5B on MATH training data at contamination rates of **0.5%, 1%, 1.5%** (3 seeds each). The results reported here cover 5 of these models (0.5% × 2 seeds, 1% × 3 seeds).
+A subset of MATH *test* examples is injected as exact replicas into the training pool.
+**Task:** given a MATH test query, predict whether it appeared in training data.
+**Query set:** 500 clean validation examples + all leaked examples for that model (~22–45 depending on rate).
+---
+## Key findings
+### 1. AirRep >> STRIDE > LoGRA
+AirRep and STRIDE detect contamination above chance, while LoGRA is at chance level; the three methods have very different profiles:
+- **AirRep** (representation similarity, no GPU training needed): best at both *query-level detection*
+  (AUPRC 0.132, ROC-AUC 0.638) and *training-item retrieval* (hit@1 ≈ 0.93). It finds the exact
+  training replica at rank 1 for 87–100% of leaked queries.
+- **STRIDE** (gradient steering operator + Lasso): competitive with AirRep on ROC-AUC (0.647 vs 0.638),
+  meaningful AUPRC (0.109). But training-item retrieval is essentially random — it cannot point to
+  *which* training example caused contamination, only that the query is likely contaminated.
+- **LoGRA** (full gradient influence via LogIX): near-random contamination detection (ROC-AUC 0.497,
+  R@100 = 0.00 on every model). The raw gradient dot-product scores have enormous variance
+  (std ~ 2.9M vs mean gap ~ 250K between leaked/nonleaked). Training-item retrieval is marginally
+  above random at hit@100 and near random at hit@10 (0.04–0.09), though still above STRIDE (0.00–0.05).
+### 2. None of the gradient methods can distinguish memorized from leaked-not-memorized
+Score distributions are nearly identical between leaked+memorized and leaked-but-not-memorized
+(e.g. STRIDE means: memorized=7.43, leaked-not-mem=7.48, clean=7.07).
+### 3. STRIDE false positives are "easy" problems
+The top STRIDE false positives are short, concrete, textbook-style problems
+(e.g. "Evaluate 7/(45²-38²)", "How many times does 8 appear from 1–1000?").
+STRIDE appears to score highly any query where the training data *strongly determines model behavior*,
+not just queries that are exact replicas in training.
+### 4. Why does LoGRA fail?
+Gradient influence (LoGRA / LogIX) measures how much each training example shifts model loss
+on a given query. For a query that appears exactly in training, its own replica has very high
+influence — but so do other training examples for *unrelated* reasons (random gradient alignment).
+The max-over-pool signal is buried in noise. Representation similarity (AirRep) is a much more
+direct signal for exact/near-duplicate detection.
+"""
+# ── UI ────────────────────────────────────────────────────────────────────────
+with gr.Blocks(title="Attribution Method Comparison — MATH Contamination") as demo:
+    gr.Markdown("# Attribution Method Comparison: AirRep vs STRIDE vs LoGRA\n"
+                "**Task:** detect benchmark contamination in Qwen2.5-0.5B fine-tuned on MATH. "
+                "5 models evaluated (0.5%×2 seeds, 1%×3 seeds). "
+                "See the **Community** tab to discuss.")
+    with gr.Tabs():
+        with gr.Tab("Summary"):
+            gr.Markdown("### Mean metrics over 5 models\n"
+                        "*Query-level detection*: can the method rank leaked queries above clean ones?\n"
+                        "*hit@10*: fraction of leaked queries whose exact training replica appears in the method's top-10 retrieved training items.")
+            gr.Dataframe(SUMMARY.round(3), label="Summary (averaged over 5 models)")
+            gr.Markdown("> **Ranking: AirRep >> STRIDE > LoGRA**\n"
+                        "> STRIDE beats LoGRA on every query-level detection metric (LoGRA is slightly ahead only on hit@10) despite being cheaper (~1-2h vs ~4h/model). "
+                        "AirRep (no gradient training, just embedding similarity) beats both.")
+        with gr.Tab("Per-model breakdown"):
+            gr.Markdown("### Detection metrics per model and method")
+            gr.Dataframe(PER_MODEL.round(3), label="Per-model results")
+        with gr.Tab("Training-item retrieval"):
+            gr.Markdown("### Can the method retrieve the exact training replica for a leaked query?\n"
+                        "`random@10` is the expected hit@10 under uniform random retrieval.")
+            gr.Dataframe(RETRIEVAL.round(3), label="Retrieval hit@k")
+            gr.Markdown("> AirRep finds the exact replica at **rank 1** for 87–100% of leaked queries.\n"
+                        "> STRIDE and LoGRA are both near or below random at hit@10.")
+        with gr.Tab("Findings & Discussion"):
+            gr.Markdown(FINDINGS)
+demo.launch()