File size: 9,010 Bytes
2e8969b
 
 
 
 
 
e831e4b
 
 
2e8969b
 
 
e831e4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e8969b
 
e831e4b
 
 
 
 
 
 
2e8969b
 
 
 
 
e831e4b
2e8969b
 
 
 
e831e4b
2e8969b
 
 
 
 
e831e4b
2e8969b
e831e4b
2e8969b
e831e4b
 
 
2e8969b
e831e4b
 
2e8969b
e831e4b
2e8969b
e831e4b
 
2e8969b
e831e4b
2e8969b
e831e4b
2e8969b
e831e4b
 
 
 
 
2e8969b
e831e4b
 
 
2e8969b
e831e4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e8969b
 
 
 
 
e831e4b
 
 
 
 
 
 
2e8969b
 
 
e831e4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e8969b
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import gradio as gr
import pandas as pd

# ── Data ─────────────────────────────────────────────────────────────────────

# Per-method headline metrics, shown in the "Summary" tab under
# "Mean metrics over 5 models". One row per attribution method.
SUMMARY = pd.DataFrame(
    data=[
        ("AirRep (pretrained)", 0.1172, 0.132, 0.638, 0.16, 0.35, 0.344, 0.223),
        ("STRIDE (300 subsets)", 0.1152, 0.109, 0.647, 0.14, 0.32, 0.210, 0.211),
        ("LoGRA", -0.0064, 0.064, 0.497, 0.00, 0.00, 0.007, 0.192),
    ],
    columns=[
        "Method",
        "Spearman ρ",
        "AUPRC",
        "ROC-AUC",
        "R@10",
        "R@100",
        "MRR",
        "best-F1",
    ],
)

# Per-model results: one row per (fine-tuned model, attribution method) pair.
# "p" is the p-value that accompanies the "Spearman ρ" column.
# NOTE(review): STRIDE on 1%_seed0 has R@100 (0.27) below R@10 (0.30), which a
# recall-at-k metric cannot produce -- confirm how the "R@10" / "R@100" columns
# are actually defined before relabelling or comparing them.
PER_MODEL = pd.DataFrame(
    data=[
        # 0.5% contamination
        ("0.5%_seed0", "AirRep", 0.0856, 0.051, 0.066, 0.623, 0.00, 0.32, 0.071),
        ("0.5%_seed0", "LoGRA", 0.0133, 0.762, 0.044, 0.519, 0.00, 0.00, 0.005),
        ("0.5%_seed0", "STRIDE", 0.2158, 0.000, 0.131, 0.810, 0.10, 0.59, 0.200),
        ("0.5%_seed1", "AirRep", 0.1098, 0.012, 0.114, 0.658, 0.20, 0.36, 0.250),
        ("0.5%_seed1", "LoGRA", 0.0392, 0.372, 0.047, 0.556, 0.00, 0.00, 0.006),
        ("0.5%_seed1", "STRIDE", 0.1178, 0.007, 0.081, 0.669, 0.10, 0.41, 0.125),
        # 1% contamination
        ("1%_seed0", "AirRep", 0.1709, 0.000, 0.182, 0.679, 0.30, 0.36, 1.000),
        ("1%_seed0", "LoGRA", 0.0154, 0.720, 0.081, 0.516, 0.00, 0.00, 0.010),
        ("1%_seed0", "STRIDE", 0.1281, 0.003, 0.138, 0.634, 0.30, 0.27, 0.333),
        ("1%_seed1", "AirRep", 0.0956, 0.026, 0.117, 0.600, 0.00, 0.31, 0.067),
        ("1%_seed1", "LoGRA", 0.0111, 0.796, 0.081, 0.512, 0.00, 0.00, 0.007),
        ("1%_seed1", "STRIDE", 0.0604, 0.159, 0.109, 0.563, 0.20, 0.20, 0.333),
        ("1%_seed2", "AirRep", 0.1241, 0.004, 0.181, 0.630, 0.30, 0.40, 0.333),
        ("1%_seed2", "LoGRA", -0.1111, 0.009, 0.065, 0.383, 0.00, 0.00, 0.009),
        ("1%_seed2", "STRIDE", 0.0537, 0.210, 0.088, 0.556, 0.00, 0.13, 0.059),
    ],
    columns=[
        "Model",
        "Method",
        "Spearman ρ",
        "p",
        "AUPRC",
        "ROC-AUC",
        "R@10",
        "R@100",
        "MRR",
    ],
)

# Spearman ρ between each pair of methods' score rankings (method vs method,
# not vs ground truth). One row per model, plus a final hard-coded "MEAN" row.
PAIRWISE = pd.DataFrame(
    {
        "Model": ["0.5%_seed0", "0.5%_seed1", "1%_seed0", "1%_seed1", "1%_seed2", "MEAN"],
        "AirRep↔STRIDE": [-0.1209, -0.2116, 0.1615, -0.1355, -0.0041, -0.0621],
        "AirRep↔LoGRA": [0.0635, -0.0779, 0.0093, 0.0179, 0.0442, 0.0114],
        "STRIDE↔LoGRA": [0.0268, 0.0946, 0.0626, -0.0328, 0.0369, 0.0376],
    }
)

# Markdown body rendered in the "Findings & Discussion" tab.
# The numeric claims below are kept in agreement with the PER_MODEL and PAIRWISE
# tables defined in this file; update both together when the data changes.
# Deliberately a non-raw string: "\\dfrac" below yields a single backslash.
FINDINGS = """
## Setup

We fine-tune Qwen2.5-0.5B on MATH training data at contamination rates of **0.5% and 1%** (5 models, 2–3 seeds per rate).
A subset of MATH *test* examples is injected as exact replicas into the training pool.

**Task:** given a MATH test query, predict whether it appeared in training data.

**Query set per model:** 500 clean validation examples + leaked examples (~22–45 depending on rate).

---

## Key findings

### 1. AirRep ≈ STRIDE >> LoGRA on detection; LoGRA near-random

All Spearman ρ values are versus the binary leaked/not-leaked label.

**LoGRA's ρ averages −0.006** — indistinguishable from zero on average, and clearly negative
on one model (1%_seed2, ρ = −0.111). Its gradient influence scores have enormous variance
(σ ≈ 2.9 × 10⁶) relative to the mean gap between leaked and clean queries (~250K), making the
signal undetectable.

**AirRep and STRIDE are comparable** (ρ ≈ 0.115–0.117) but reach that level via different mechanisms
(see finding 3).

### 2. LoGRA never shows a significant positive correlation

Four of the five LoGRA Spearman p-values are > 0.37. The exception is 1%_seed2 (p = 0.009),
where the correlation is *negative* (ρ = −0.111) — significant, but in the wrong direction.
AirRep is significant at α = 0.05 on 4 of 5 models (all three at 1%); STRIDE on 3 of 5
(both at 0.5%, one at 1%).

### 3. AirRep and STRIDE rankings are essentially uncorrelated with each other

Pairwise Spearman ρ between method rankings (not vs ground truth):

| Pair | Mean ρ |
|------|--------|
| AirRep ↔ STRIDE | −0.06 |
| AirRep ↔ LoGRA  | +0.01 |
| STRIDE ↔ LoGRA  | +0.04 |

AirRep and STRIDE rankings are essentially **uncorrelated** (per-model ρ ranges from −0.21 to
+0.16) — they flag almost entirely different queries.
Both have signal against the ground truth, but via independent mechanisms. This suggests
an ensemble could outperform either method alone.

### 4. STRIDE false positives are "easy" problems

At 1% contamination, STRIDE's top-20 contains only 4 true positives. False positives are
short, concrete, textbook problems:

> *"Evaluate $\\dfrac{7}{45^2 - 38^2}$"*

> *"How many times does the digit 8 appear from 1 to 1000?"*

> *"If $f(x) = ax + b$ and $f(f(f(x))) = 8x + 21$, find $a + b$."*

STRIDE conflates *"training data has large gradient influence on this query"* with *"this query
was in training."* AirRep avoids this because representation similarity is far more sensitive
to exact/near-duplicate content.

### 5. None of the methods distinguish memorized from merely-leaked

STRIDE score means: memorized = 7.43, leaked-not-memorized = 7.48, clean = 7.07.
The leaked groups are ~0.4 above clean mean, but indistinguishable from each other.

---

## Open questions

1. AirRep and STRIDE flag almost non-overlapping query sets — does an ensemble help?
2. Would STRIDE with more subsets (1000) recover stronger signal, or is gradient noise fundamental?
3. Is LoGRA's failure specific to small models / short sequences, or generic for this task?
4. How do results change at 1.5% contamination (3 more models running now)?
"""

# ── UI ────────────────────────────────────────────────────────────────────────

# Build the Gradio app: a header blurb followed by four tabs, one per table
# (SUMMARY, PER_MODEL, PAIRWISE) plus the long-form FINDINGS write-up.
with gr.Blocks(title="Attribution Method Comparison — MATH Contamination") as demo:
    gr.Markdown(
        "# Attribution Method Comparison: AirRep vs STRIDE vs LoGRA\n"
        "**Task:** detect benchmark contamination in Qwen2.5-0.5B fine-tuned on MATH. "
        "5 models evaluated (0.5%×2 seeds, 1%×3 seeds). "
        "Spearman ρ is against the binary leaked/not-leaked label. "
        "See the **Community** tab to discuss."
    )

    with gr.Tabs():
        with gr.Tab("Summary"):
            gr.Markdown("### Mean metrics over 5 models")
            gr.Dataframe(SUMMARY.round(4), label="Summary")
            gr.Markdown(
                "> **AirRep ≈ STRIDE on Spearman ρ (0.117 vs 0.115). LoGRA ≈ 0.** "
                "STRIDE edges AirRep on ROC-AUC; AirRep leads on AUPRC and MRR."
            )

        with gr.Tab("Per-model (with p-values)"):
            # The blurb must match PER_MODEL: LoGRA on 1%_seed2 has p = 0.009
            # with a negative ρ, so "all LoGRA p-values > 0.37" would be false.
            gr.Markdown("### Spearman ρ vs leaked label, per model\n"
                        "LoGRA is non-significant on 4 of 5 models (p > 0.37); on 1%_seed2 it is "
                        "significant (p = 0.009) but negatively correlated (ρ = −0.111). "
                        "AirRep is significant at α=0.05 on 4 of 5 models, STRIDE on 3 of 5.")
            gr.Dataframe(PER_MODEL.round(4), label="Per-model results")

        with gr.Tab("Pairwise ranking agreement"):
            gr.Markdown(
                "### Do the methods agree on which queries look contaminated?\n"
                "Spearman ρ between each pair of methods' score rankings (not vs ground truth). "
                "Near-zero means the methods are flagging almost entirely different queries."
            )
            gr.Dataframe(PAIRWISE.round(4), label="Pairwise Spearman ρ between method rankings")
            # A mean ρ of −0.06 is near zero, i.e. uncorrelated rather than
            # anti-correlated (which would require ρ close to −1).
            gr.Markdown(
                "> **AirRep↔STRIDE mean ρ = −0.06** — essentially uncorrelated. "
                "Both have independent signal, suggesting an ensemble could outperform either alone."
            )

        with gr.Tab("Findings & Discussion"):
            gr.Markdown(FINDINGS)

# Only start the server when executed as a script, so importing this module
# (e.g. to inspect `demo` or the tables) does not launch a web server.
if __name__ == "__main__":
    demo.launch()