File size: 4,346 Bytes
6371d28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""Generate MIB-style sample data for the dataset visualizer."""
import pandas as pd
import numpy as np
from pathlib import Path

# Seed NumPy's global RNG so every run of this script emits the same samples.
np.random.seed(42)

# Grid of k values for the curves; per the original note, MIB uses
# k in {.001, .002, .005, .01, .02, .05, .1, .2, .5, 1}.
K_VALUES = [
    0.001, 0.002, 0.005,
    0.01, 0.02, 0.05,
    0.1, 0.2, 0.5,
    1.0,
]

# Task and model labels written into the generated tables.
TASKS = [
    "IOI",
    "MCQA",
    "Arithmetic",
    "ARC-E",
    "ARC-C",
]
MODELS = [
    "GPT-2",
    "Qwen-2.5",
    "Gemma-2",
    "Llama-3.1",
]

# Method labels; "Random" is handled as a special low-scoring case downstream.
METHODS = ["EAP-IG (CF)", "EAP (CF)", "NAP-IG (CF)", "UGS", "IFR", "Random"]

# Synthetic faithfulness curves: non-random methods share one rising curve in k
# that is damped per method, while "Random" follows a separate, low line.
def sample_f(k: float, method: str, task: str) -> float:
    """Draw one noisy synthetic faithfulness value, clipped to [0, 1].

    Args:
        k: Position on the k grid; the shared curve grows as ``k ** 0.6``.
        method: Method label. ``"Random"`` uses its own linear baseline;
            ``"IFR"``, ``"NAP-IG (CF)"``, ``"UGS"`` and ``"EAP (CF)"`` scale
            the shared curve down; any other label leaves it unscaled.
        task: Accepted for call-site symmetry; it does not affect the result.

    Returns:
        The sampled value after clipping to the closed interval [0, 1].

    Note:
        Exactly two draws are taken from NumPy's global RNG on every call,
        so seeded runs stay reproducible.
    """
    # This draw happens before the "Random" check on purpose: it keeps the
    # RNG stream consumed in the same order for every method.
    shared_noise = np.random.uniform(-0.05, 0.05)
    curve = np.clip(0.1 + 0.85 * (k ** 0.6) + shared_noise, 0, 1)

    if method == "Random":
        baseline_noise = np.random.uniform(-0.08, 0.08)
        return np.clip(0.2 + 0.3 * k + baseline_noise, 0, 1)

    # Per-method damping of the shared curve; unlisted methods keep factor 1.
    damping = {
        "IFR": 0.75,
        "NAP-IG (CF)": 0.85,
        "EAP (CF)": 0.92,
        "UGS": 0.88,
    }
    damped = curve * damping.get(method, 1.0)
    return np.clip(damped + np.random.uniform(-0.03, 0.03), 0, 1)


def main():
    """Write the three sample CSV files for the dataset visualizer.

    Files are placed in a ``data`` directory one level above the directory
    containing this script; the directory is created when missing.

    * ``faithfulness_curves.csv`` -- columns method, task, model, k, f
    * ``dataset_overview.csv`` -- columns dataset, split, count
    * ``metrics_table.csv`` -- columns method, task, model, CPR, CMD
    """
    data_dir = Path(__file__).resolve().parents[1] / "data"
    data_dir.mkdir(parents=True, exist_ok=True)

    curves_path = data_dir / "faithfulness_curves.csv"
    overview_path = data_dir / "dataset_overview.csv"
    metrics_path = data_dir / "metrics_table.csv"

    # 1. Faithfulness curves, restricted to the first three tasks and first
    #    two models to keep the file small. The iteration order (method, task,
    #    model, k) fixes the order in which random draws are consumed.
    curve_rows = [
        {
            "method": method,
            "task": task,
            "model": model,
            "k": k,
            "f": round(sample_f(k, method, task), 4),
        }
        for method in METHODS
        for task in TASKS[:3]
        for model in MODELS[:2]
        for k in K_VALUES
    ]
    pd.DataFrame(curve_rows).to_csv(curves_path, index=False)

    # 2. Dataset overview (MIB Table 4 style): one count per dataset and split.
    split_names = ("Train", "Validation", "Test (Public)", "Test (Private)")
    split_counts = {
        "IOI": (10000, 10000, 10000, 10000),
        "MCQA": (110, 50, 50, 50),
        "Arithmetic (+)": (34400, 4920, 4920, 4920),
        "Arithmetic (-)": (17400, 2484, 2484, 2484),
        "ARC (Easy)": (2251, 570, 1188, 1188),
        "ARC (Challenge)": (1119, 299, 586, 586),
    }
    overview_rows = [
        {"dataset": dataset, "split": split, "count": count}
        for dataset, counts in split_counts.items()
        for split, count in zip(split_names, counts)
    ]
    pd.DataFrame(overview_rows).to_csv(overview_path, index=False)

    # 3. Aggregate metrics table (CPR, CMD) — MIB Table 2 style. "Random" is
    #    sampled from its own narrow ranges; all other methods share wide ones.
    metric_rows = []
    for method in METHODS:
        if method != "Random":
            cpr_range = (0.2, 1.0)
            cmd_range = (0.01, 0.4)
        else:
            cpr_range = (0.2, 0.35)
            cmd_range = (0.68, 0.78)
        for task in TASKS:
            for model in MODELS:
                # CPR is drawn before CMD for each row.
                cpr_value = round(np.random.uniform(*cpr_range), 3)
                cmd_value = round(np.random.uniform(*cmd_range), 3)
                metric_rows.append(
                    {
                        "method": method,
                        "task": task,
                        "model": model,
                        "CPR": cpr_value,
                        "CMD": cmd_value,
                    }
                )
    pd.DataFrame(metric_rows).to_csv(metrics_path, index=False)

    print("Wrote:", curves_path, overview_path, metrics_path)


# Generate the sample files only when this module is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()