amirali1985 commited on
Commit
2e8969b
·
verified ·
1 Parent(s): 459cb57

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +126 -0
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+
4
+ # ── Data ─────────────────────────────────────────────────────────────────────
5
+
6
+ SUMMARY = pd.DataFrame([
7
+ {"Method": "AirRep (pretrained)", "AUPRC": 0.132, "ROC-AUC": 0.638, "R@10": 0.16, "R@50": 0.23, "R@100": 0.35, "MRR": 0.344, "best-F1": 0.223, "hit@10": 0.93},
8
+ {"Method": "STRIDE (300 subsets)", "AUPRC": 0.109, "ROC-AUC": 0.647, "R@10": 0.14, "R@50": 0.20, "R@100": 0.32, "MRR": 0.210, "best-F1": 0.211, "hit@10": 0.02},
9
+ {"Method": "LoGRA", "AUPRC": 0.064, "ROC-AUC": 0.497, "R@10": 0.00, "R@50": 0.00, "R@100": 0.00, "MRR": 0.007, "best-F1": 0.192, "hit@10": 0.07},
10
+ ])
11
+
12
+ PER_MODEL = pd.DataFrame([
13
+ {"Model": "0.5%_seed0", "Method": "AirRep", "AUPRC": 0.066, "ROC-AUC": 0.623, "R@10": 0.00, "R@100": 0.32, "MRR": 0.071},
14
+ {"Model": "0.5%_seed0", "Method": "LoGRA", "AUPRC": 0.044, "ROC-AUC": 0.519, "R@10": 0.00, "R@100": 0.00, "MRR": 0.005},
15
+ {"Model": "0.5%_seed0", "Method": "STRIDE", "AUPRC": 0.131, "ROC-AUC": 0.810, "R@10": 0.10, "R@100": 0.59, "MRR": 0.200},
16
+ {"Model": "0.5%_seed1", "Method": "AirRep", "AUPRC": 0.114, "ROC-AUC": 0.658, "R@10": 0.20, "R@100": 0.36, "MRR": 0.250},
17
+ {"Model": "0.5%_seed1", "Method": "LoGRA", "AUPRC": 0.047, "ROC-AUC": 0.556, "R@10": 0.00, "R@100": 0.00, "MRR": 0.006},
18
+ {"Model": "0.5%_seed1", "Method": "STRIDE", "AUPRC": 0.081, "ROC-AUC": 0.669, "R@10": 0.10, "R@100": 0.41, "MRR": 0.125},
19
+ {"Model": "1%_seed0", "Method": "AirRep", "AUPRC": 0.182, "ROC-AUC": 0.679, "R@10": 0.30, "R@100": 0.36, "MRR": 1.000},
20
+ {"Model": "1%_seed0", "Method": "LoGRA", "AUPRC": 0.081, "ROC-AUC": 0.516, "R@10": 0.00, "R@100": 0.00, "MRR": 0.010},
21
+ {"Model": "1%_seed0", "Method": "STRIDE", "AUPRC": 0.138, "ROC-AUC": 0.634, "R@10": 0.30, "R@100": 0.27, "MRR": 0.333},
22
+ {"Model": "1%_seed1", "Method": "AirRep", "AUPRC": 0.117, "ROC-AUC": 0.600, "R@10": 0.00, "R@100": 0.31, "MRR": 0.067},
23
+ {"Model": "1%_seed1", "Method": "LoGRA", "AUPRC": 0.081, "ROC-AUC": 0.512, "R@10": 0.00, "R@100": 0.00, "MRR": 0.007},
24
+ {"Model": "1%_seed1", "Method": "STRIDE", "AUPRC": 0.109, "ROC-AUC": 0.563, "R@10": 0.20, "R@100": 0.20, "MRR": 0.333},
25
+ {"Model": "1%_seed2", "Method": "AirRep", "AUPRC": 0.181, "ROC-AUC": 0.630, "R@10": 0.30, "R@100": 0.40, "MRR": 0.333},
26
+ {"Model": "1%_seed2", "Method": "LoGRA", "AUPRC": 0.065, "ROC-AUC": 0.383, "R@10": 0.00, "R@100": 0.00, "MRR": 0.009},
27
+ {"Model": "1%_seed2", "Method": "STRIDE", "AUPRC": 0.088, "ROC-AUC": 0.556, "R@10": 0.00, "R@100": 0.13, "MRR": 0.059},
28
+ ])
29
+
30
+ RETRIEVAL = pd.DataFrame([
31
+ {"Model": "0.5%_seed0", "Method": "AirRep", "hit@1": 1.00, "hit@5": 1.00, "hit@10": 1.00, "hit@20": 1.00, "hit@100": 1.00, "random@10": 0.048},
32
+ {"Model": "0.5%_seed0", "Method": "LoGRA", "hit@1": 0.00, "hit@5": 0.05, "hit@10": 0.09, "hit@20": 0.14, "hit@100": 0.41, "random@10": 0.048},
33
+ {"Model": "0.5%_seed0", "Method": "STRIDE", "hit@1": 0.00, "hit@5": 0.00, "hit@10": 0.05, "hit@20": 0.14, "hit@100": 0.23, "random@10": 0.048},
34
+ {"Model": "1%_seed0", "Method": "AirRep", "hit@1": 0.91, "hit@5": 0.91, "hit@10": 0.91, "hit@20": 0.91, "hit@100": 0.91, "random@10": 0.043},
35
+ {"Model": "1%_seed0", "Method": "LoGRA", "hit@1": 0.00, "hit@5": 0.02, "hit@10": 0.04, "hit@20": 0.11, "hit@100": 0.40, "random@10": 0.043},
36
+ {"Model": "1%_seed0", "Method": "STRIDE", "hit@1": 0.00, "hit@5": 0.00, "hit@10": 0.00, "hit@20": 0.02, "hit@100": 0.22, "random@10": 0.043},
37
+ {"Model": "1%_seed1", "Method": "AirRep", "hit@1": 0.87, "hit@5": 0.89, "hit@10": 0.89, "hit@20": 0.89, "hit@100": 0.89, "random@10": 0.043},
38
+ {"Model": "1%_seed1", "Method": "LoGRA", "hit@1": 0.02, "hit@5": 0.04, "hit@10": 0.07, "hit@20": 0.13, "hit@100": 0.36, "random@10": 0.043},
39
+ {"Model": "1%_seed1", "Method": "STRIDE", "hit@1": 0.00, "hit@5": 0.00, "hit@10": 0.00, "hit@20": 0.04, "hit@100": 0.09, "random@10": 0.043},
40
+ ])
41
+
42
+ FINDINGS = """
43
+ ## Setup
44
+
45
+ We fine-tune Qwen2.5-0.5B on MATH training data at contamination rates of **0.5%, 1%, 1.5%** (3 seeds each).
46
+ A subset of MATH *test* examples is injected as exact replicas into the training pool.
47
+
48
+ **Task:** given a MATH test query, predict whether it appeared in training data.
49
+
50
+ **Query set:** 500 clean validation examples + all leaked examples for that model (~22–45 depending on rate).
51
+
52
+ ---
53
+
54
+ ## Key findings
55
+
56
+ ### 1. AirRep >> STRIDE > LoGRA
57
+
58
+ All three methods detect contamination above chance, but with very different profiles:
59
+
60
+ - **AirRep** (representation similarity, no GPU training needed): best at both *query-level detection*
61
+ (AUPRC 0.132, ROC-AUC 0.638) and *training-item retrieval* (hit@1 ≈ 0.93). It finds the exact
62
+ training replica at rank 1 for 87–100% of leaked queries.
63
+
64
+ - **STRIDE** (gradient steering operator + Lasso): competitive with AirRep on ROC-AUC (0.647 vs 0.638),
65
+ meaningful AUPRC (0.109). But training-item retrieval is essentially random — it cannot point to
66
+ *which* training example caused contamination, only that the query is likely contaminated.
67
+
68
+ - **LoGRA** (full gradient influence via LogIX): near-random contamination detection (ROC-AUC 0.497,
69
+ R@100 = 0.00 on every model). The raw gradient dot-product scores have enormous variance
70
+ (std ~ 2.9M vs mean gap ~ 250K between leaked/nonleaked). Training-item retrieval is marginally
71
+ above random at hit@100 and slightly above STRIDE at hit@10 (0.07 vs 0.02), but far below AirRep.
72
+
73
+ ### 2. None of the gradient methods can distinguish memorized from leaked-not-memorized
74
+
75
+ Score distributions are nearly identical between leaked+memorized and leaked-but-not-memorized
76
+ (e.g. STRIDE means: memorized=7.43, leaked-not-mem=7.48, clean=7.07).
77
+
78
+ ### 3. STRIDE false positives are "easy" problems
79
+
80
+ The top STRIDE false positives are short, concrete, textbook-style problems
81
+ (e.g. "Evaluate 7/(45²-38²)", "How many times does 8 appear from 1–1000?").
82
+ STRIDE appears to score highly any query where the training data *strongly determines model behavior*,
83
+ not just queries that are exact replicas in training.
84
+
85
+ ### 4. Why does LoGRA fail?
86
+
87
+ Gradient influence (LoGRA / LogIX) measures how much each training example shifts model loss
88
+ on a given query. For a query that appears exactly in training, its own replica has very high
89
+ influence — but so do other training examples for *unrelated* reasons (random gradient alignment).
90
+ The max-over-pool signal is buried in noise. Representation similarity (AirRep) is a much more
91
+ direct signal for exact/near-duplicate detection.
92
+ """
93
+
94
+ # ── UI ────────────────────────────────────────────────────────────────────────
95
+
96
+ with gr.Blocks(title="Attribution Method Comparison — MATH Contamination") as demo:
97
+ gr.Markdown("# Attribution Method Comparison: AirRep vs STRIDE vs LoGRA\n"
98
+ "**Task:** detect benchmark contamination in Qwen2.5-0.5B fine-tuned on MATH. "
99
+ "5 models evaluated (0.5%×2 seeds, 1%×3 seeds). "
100
+ "See the **Community** tab to discuss.")
101
+
102
+ with gr.Tabs():
103
+ with gr.Tab("Summary"):
104
+ gr.Markdown("### Mean metrics over 5 models\n"
105
+ "*Query-level detection*: can the method rank leaked queries above clean ones?\n"
106
+ "*hit@10*: fraction of leaked queries whose exact training replica appears in the method's top-10 retrieved training items.")
107
+ gr.Dataframe(SUMMARY.round(3), label="Summary (averaged over 5 models)")
108
+ gr.Markdown("> **Ranking: AirRep >> STRIDE > LoGRA**\n"
109
+ "> STRIDE beats LoGRA on every metric except hit@10 despite being cheaper (~1-2h vs ~4h/model). "
110
+ "AirRep (no gradient training, just embedding similarity) beats both.")
111
+
112
+ with gr.Tab("Per-model breakdown"):
113
+ gr.Markdown("### Detection metrics per model and method")
114
+ gr.Dataframe(PER_MODEL.round(3), label="Per-model results")
115
+
116
+ with gr.Tab("Training-item retrieval"):
117
+ gr.Markdown("### Can the method retrieve the exact training replica for a leaked query?\n"
118
+ "`random@10` is the expected hit@10 under uniform random retrieval.")
119
+ gr.Dataframe(RETRIEVAL.round(3), label="Retrieval hit@k")
120
+ gr.Markdown("> AirRep finds the exact replica at **rank 1** for 87–100% of leaked queries.\n"
121
+ "> STRIDE and LoGRA are both near or below random at hit@10.")
122
+
123
+ with gr.Tab("Findings & Discussion"):
124
+ gr.Markdown(FINDINGS)
125
+
126
+ demo.launch()