Upload app.py with huggingface_hub
Browse files
app.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
# ── Data ─────────────────────────────────────────────────────────────────────
|
| 5 |
+
|
| 6 |
+
# Per-method metrics shown on the "Summary" tab, where the table is labelled
# "averaged over 5 models". One row per attribution method.
# `hit@10` is described in the UI as the fraction of leaked queries whose exact
# training replica appears in the method's top-10 retrieved training items.
# NOTE(review): R@k is presumably recall@k over the ranked query set — confirm.
SUMMARY = pd.DataFrame(
    [
        ("AirRep (pretrained)", 0.132, 0.638, 0.16, 0.23, 0.35, 0.344, 0.223, 0.93),
        ("STRIDE (300 subsets)", 0.109, 0.647, 0.14, 0.20, 0.32, 0.210, 0.211, 0.02),
        ("LoGRA", 0.064, 0.497, 0.00, 0.00, 0.00, 0.007, 0.192, 0.07),
    ],
    columns=[
        "Method", "AUPRC", "ROC-AUC", "R@10", "R@50", "R@100", "MRR", "best-F1", "hit@10",
    ],
)
|
| 11 |
+
|
| 12 |
+
# Query-level detection metrics for every (model, method) pair, rendered on the
# "Per-model breakdown" tab. Model ids follow "<contamination rate>_seed<k>".
# NOTE(review): R@k is presumably recall@k over the ranked query set — confirm.
PER_MODEL = pd.DataFrame(
    [
        ("0.5%_seed0", "AirRep", 0.066, 0.623, 0.00, 0.32, 0.071),
        ("0.5%_seed0", "LoGRA", 0.044, 0.519, 0.00, 0.00, 0.005),
        ("0.5%_seed0", "STRIDE", 0.131, 0.810, 0.10, 0.59, 0.200),
        ("0.5%_seed1", "AirRep", 0.114, 0.658, 0.20, 0.36, 0.250),
        ("0.5%_seed1", "LoGRA", 0.047, 0.556, 0.00, 0.00, 0.006),
        ("0.5%_seed1", "STRIDE", 0.081, 0.669, 0.10, 0.41, 0.125),
        ("1%_seed0", "AirRep", 0.182, 0.679, 0.30, 0.36, 1.000),
        ("1%_seed0", "LoGRA", 0.081, 0.516, 0.00, 0.00, 0.010),
        ("1%_seed0", "STRIDE", 0.138, 0.634, 0.30, 0.27, 0.333),
        ("1%_seed1", "AirRep", 0.117, 0.600, 0.00, 0.31, 0.067),
        ("1%_seed1", "LoGRA", 0.081, 0.512, 0.00, 0.00, 0.007),
        ("1%_seed1", "STRIDE", 0.109, 0.563, 0.20, 0.20, 0.333),
        ("1%_seed2", "AirRep", 0.181, 0.630, 0.30, 0.40, 0.333),
        ("1%_seed2", "LoGRA", 0.065, 0.383, 0.00, 0.00, 0.009),
        ("1%_seed2", "STRIDE", 0.088, 0.556, 0.00, 0.13, 0.059),
    ],
    columns=["Model", "Method", "AUPRC", "ROC-AUC", "R@10", "R@100", "MRR"],
)
|
| 29 |
+
|
| 30 |
+
# Training-item retrieval results per (model, method), rendered on the
# "Training-item retrieval" tab. `random@10` is described in the UI as the
# expected hit@10 under uniform random retrieval.
# NOTE(review): hit@k is presumably the fraction of leaked queries whose exact
# training replica is ranked within the top k — the UI only defines hit@10.
RETRIEVAL = pd.DataFrame(
    [
        ("0.5%_seed0", "AirRep", 1.00, 1.00, 1.00, 1.00, 1.00, 0.048),
        ("0.5%_seed0", "LoGRA", 0.00, 0.05, 0.09, 0.14, 0.41, 0.048),
        ("0.5%_seed0", "STRIDE", 0.00, 0.00, 0.05, 0.14, 0.23, 0.048),
        ("1%_seed0", "AirRep", 0.91, 0.91, 0.91, 0.91, 0.91, 0.043),
        ("1%_seed0", "LoGRA", 0.00, 0.02, 0.04, 0.11, 0.40, 0.043),
        ("1%_seed0", "STRIDE", 0.00, 0.00, 0.00, 0.02, 0.22, 0.043),
        ("1%_seed1", "AirRep", 0.87, 0.89, 0.89, 0.89, 0.89, 0.043),
        ("1%_seed1", "LoGRA", 0.02, 0.04, 0.07, 0.13, 0.36, 0.043),
        ("1%_seed1", "STRIDE", 0.00, 0.00, 0.00, 0.04, 0.09, 0.043),
    ],
    columns=[
        "Model", "Method", "hit@1", "hit@5", "hit@10", "hit@20", "hit@100", "random@10",
    ],
)
|
| 41 |
+
|
| 42 |
+
# Markdown body of the "Findings & Discussion" tab.
# NOTE(review): two statements below look inconsistent with the tables in this
# file and should be confirmed by the author before publishing:
#   * the LoGRA bullet says retrieval is "well below STRIDE at hit@10", but
#     RETRIEVAL lists LoGRA hit@10 (0.09 / 0.04 / 0.07) above STRIDE
#     (0.05 / 0.00 / 0.00);
#   * Setup mentions three contamination rates with 3 seeds each, while the
#     page header reports 5 evaluated models (0.5% x 2 seeds, 1% x 3 seeds).
FINDINGS = """
## Setup

We fine-tune Qwen2.5-0.5B on MATH training data at contamination rates of **0.5%, 1%, 1.5%** (3 seeds each).
A subset of MATH *test* examples is injected as exact replicas into the training pool.

**Task:** given a MATH test query, predict whether it appeared in training data.

**Query set:** 500 clean validation examples + all leaked examples for that model (~22–45 depending on rate).

---

## Key findings

### 1. AirRep >> STRIDE > LoGRA

All three methods detect contamination above chance, but with very different profiles:

- **AirRep** (representation similarity, no GPU training needed): best at both *query-level detection*
  (AUPRC 0.132, ROC-AUC 0.638) and *training-item retrieval* (hit@1 ≈ 0.93). It finds the exact
  training replica at rank 1 for 87–100% of leaked queries.

- **STRIDE** (gradient steering operator + Lasso): competitive with AirRep on ROC-AUC (0.647 vs 0.638),
  meaningful AUPRC (0.109). But training-item retrieval is essentially random — it cannot point to
  *which* training example caused contamination, only that the query is likely contaminated.

- **LoGRA** (full gradient influence via LogIX): near-random contamination detection (ROC-AUC 0.497,
  R@100 = 0.00 on every model). The raw gradient dot-product scores have enormous variance
  (std ~ 2.9M vs mean gap ~ 250K between leaked/nonleaked). Training-item retrieval is marginally
  above random at hit@100 but well below STRIDE at hit@10.

### 2. None of the gradient methods can distinguish memorized from leaked-not-memorized

Score distributions are nearly identical between leaked+memorized and leaked-but-not-memorized
(e.g. STRIDE means: memorized=7.43, leaked-not-mem=7.48, clean=7.07).

### 3. STRIDE false positives are "easy" problems

The top STRIDE false positives are short, concrete, textbook-style problems
(e.g. "Evaluate 7/(45²-38²)", "How many times does 8 appear from 1–1000?").
STRIDE appears to score highly any query where the training data *strongly determines model behavior*,
not just queries that are exact replicas in training.

### 4. Why does LoGRA fail?

Gradient influence (LoGRA / LogIX) measures how much each training example shifts model loss
on a given query. For a query that appears exactly in training, its own replica has very high
influence — but so do other training examples for *unrelated* reasons (random gradient alignment).
The max-over-pool signal is buried in noise. Representation similarity (AirRep) is a much more
direct signal for exact/near-duplicate detection.
"""
|
| 93 |
+
|
| 94 |
+
# ── UI ───────────────────────────────────────────────────────────────────────
|
| 95 |
+
|
| 96 |
+
# Page layout: a header followed by four tabs (summary table, per-model table,
# retrieval table, written findings). Each table is rounded to 3 decimals for
# display only; the module-level frames are not modified.
# NOTE(review): the Summary blurb claims "STRIDE beats LoGRA on every metric",
# but SUMMARY lists hit@10 as 0.02 for STRIDE vs 0.07 for LoGRA — confirm the
# intended wording with the author.
with gr.Blocks(title="Attribution Method Comparison — MATH Contamination") as demo:
    gr.Markdown(
        "# Attribution Method Comparison: AirRep vs STRIDE vs LoGRA\n"
        "**Task:** detect benchmark contamination in Qwen2.5-0.5B fine-tuned on MATH. "
        "5 models evaluated (0.5%×2 seeds, 1%×3 seeds). "
        "See the **Community** tab to discuss."
    )

    with gr.Tabs():
        with gr.Tab("Summary"):
            gr.Markdown(
                "### Mean metrics over 5 models\n"
                "*Query-level detection*: can the method rank leaked queries above clean ones?\n"
                "*hit@10*: fraction of leaked queries whose exact training replica appears in the method's top-10 retrieved training items."
            )
            gr.Dataframe(SUMMARY.round(3), label="Summary (averaged over 5 models)")
            gr.Markdown(
                "> **Ranking: AirRep >> STRIDE > LoGRA**\n"
                "> STRIDE beats LoGRA on every metric despite being cheaper (~1-2h vs ~4h/model). "
                "AirRep (no gradient training, just embedding similarity) beats both."
            )

        with gr.Tab("Per-model breakdown"):
            gr.Markdown("### Detection metrics per model and method")
            gr.Dataframe(PER_MODEL.round(3), label="Per-model results")

        with gr.Tab("Training-item retrieval"):
            gr.Markdown(
                "### Can the method retrieve the exact training replica for a leaked query?\n"
                "`random@10` is the expected hit@10 under uniform random retrieval."
            )
            gr.Dataframe(RETRIEVAL.round(3), label="Retrieval hit@k")
            gr.Markdown(
                "> AirRep finds the exact replica at **rank 1** for 87–100% of leaked queries.\n"
                "> STRIDE and LoGRA are both near or below random at hit@10."
            )

        with gr.Tab("Findings & Discussion"):
            gr.Markdown(FINDINGS)
|
| 125 |
+
|
| 126 |
+
# Start the web server only when this file is executed as a script, so that
# importing the module (tests, tooling, reload helpers) does not block on a
# running server. Running `python app.py` behaves exactly as before.
if __name__ == "__main__":
    demo.launch()
|