File size: 4,763 Bytes
0f4326e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Analyze preference study results for base vs. fine-tuned checkpoint.

Persuasion direction: seller argues for Product A.
Scale: 1 = Definitely prefer A, 7 = Definitely prefer B.
Success = rating DECREASES (shifts toward A).
"""
import json
import os
from collections import defaultdict
from huggingface_hub import HfApi
import numpy as np


# Hugging Face dataset repos holding the collected study submissions,
# keyed by the condition label used in the printed report.
REPOS = {
    "base": "ehejin/user_study-preference-base_REAL",
    "checkpoint": "ehejin/user_study-preference-base_DETAILED_checkpoint",
}


def fetch_submissions(repo_id: str, token: str) -> list:
    """Download and parse every submission file in a Hugging Face dataset repo.

    Only files whose path starts with ``json/`` and ends with ``.json`` are
    considered; each one is downloaded and parsed as JSON.

    Args:
        repo_id: Identifier of the dataset repository to read from.
        token: Hugging Face access token used for listing and downloading.

    Returns:
        A list with the parsed content of each matching file, in the order
        the repository listing returned them.
    """
    api = HfApi(token=token)
    json_files = [
        path
        for path in api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        if path.startswith("json/") and path.endswith(".json")
    ]

    submissions = []
    for filepath in json_files:
        local = api.hf_hub_download(
            repo_id=repo_id, filename=filepath,
            repo_type="dataset", token=token,
        )
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale encoding (which is not UTF-8 on every system).
        with open(local, encoding="utf-8") as f:
            submissions.append(json.load(f))
    return submissions


def extract_pairs(submissions: list) -> list:
    """Flatten submissions into one record per fully rated item.

    Every entry under a submission's ``"items"`` key that has both a
    ``pre_rating`` and a ``post_rating`` becomes a dict with the keys
    ``pre``, ``post``, ``delta`` (post minus pre) and ``pair_id`` (empty
    string when the item has none). Items missing either rating are skipped,
    as are submissions without an ``"items"`` key.
    """

    def _to_record(entry):
        # Returns None for items that lack one of the two ratings.
        before = entry.get("pre_rating")
        after = entry.get("post_rating")
        if before is None or after is None:
            return None
        return {
            "pre": before,
            "post": after,
            "delta": after - before,
            "pair_id": entry.get("pair_id", ""),
        }

    candidates = (
        _to_record(entry)
        for submission in submissions
        for entry in submission.get("items", [])
    )
    return [record for record in candidates if record is not None]


def summarize(label: str, rows: list) -> None:
    """Print descriptive statistics for one study condition.

    Ratings are on the 1 (prefer A) to 7 (prefer B) scale, so a negative
    delta (post minus pre) means the rating moved toward A, which counts as
    a success for the seller.

    Args:
        label: Name of the condition, used in the section header.
        rows: Per-item records with ``pre``, ``post`` and ``delta`` keys,
            as produced by ``extract_pairs``.

    Prints a "no data" header and returns early when ``rows`` is empty.
    """
    if not rows:
        print(f"\n── {label}: no data ──")
        return

    total = len(rows)
    deltas = np.array([r["delta"] for r in rows])
    pres = np.array([r["pre"] for r in rows])
    posts = np.array([r["post"] for r in rows])

    # Negative delta = shifted toward A (success for the seller)
    shifted_toward_a = int(np.sum(deltas < 0))
    shifted_toward_b = int(np.sum(deltas > 0))
    no_change = int(np.sum(deltas == 0))

    print(f"\n── {label} ──")
    print(f"  N item-reviews:            {total}")
    print(f"  Mean pre-rating:           {pres.mean():.2f}  (1=A, 7=B)")
    print(f"  Mean post-rating:          {posts.mean():.2f}")
    print(f"  Mean delta:                {deltas.mean():+.2f}  (negative = shifted toward A ✓)")
    print(f"  Median delta:              {np.median(deltas):+.1f}")
    print(f"  Shifted toward A (✓ win):  {shifted_toward_a}  ({100*shifted_toward_a/total:.1f}%)")
    print(f"  No change:                 {no_change}  ({100*no_change/total:.1f}%)")
    print(f"  Shifted toward B (✗):      {shifted_toward_b}  ({100*shifted_toward_b/total:.1f}%)")

    # Conditional on the starting lean: among reviewers who started leaning
    # toward B (pre > 4) or neutral (pre == 4), how often did the rating
    # move toward A?
    breakdowns = (
        ("pre>4", [r for r in rows if r["pre"] > 4]),
        ("pre=4", [r for r in rows if r["pre"] == 4]),
    )
    for tag, subset in breakdowns:
        if not subset:
            continue
        shifted = sum(1 for r in subset if r["delta"] < 0)
        print(f"  Among {tag} (N={len(subset)}): {shifted} shifted toward A "
              f"({100*shifted/len(subset):.1f}%)")


def compare(base_rows: list, ckpt_rows: list) -> None:
    """Print a comparison of rating shifts between the two conditions.

    Reports each condition's mean delta, their difference, and a Welch's
    t-test (unequal variances) on the per-item deltas. A more negative delta
    means a stronger shift toward A, so the condition with the lower mean is
    reported as the better one when the test is significant at p < 0.05.

    Args:
        base_rows: Per-item records (with a ``delta`` key) for the base
            condition.
        ckpt_rows: Per-item records (with a ``delta`` key) for the
            checkpoint condition.

    When either condition has fewer than two records the test cannot be
    computed; a short notice is printed instead of NaN statistics.
    """
    from scipy import stats

    base_deltas = np.array([r["delta"] for r in base_rows])
    ckpt_deltas = np.array([r["delta"] for r in ckpt_rows])

    print("\n── Comparison ──")

    # Welch's t-test needs a variance estimate for each group, i.e. at least
    # two observations per group; otherwise every statistic would be NaN and
    # the "no significant difference" verdict would be misleading.
    if min(base_deltas.size, ckpt_deltas.size) < 2:
        print(f"  Not enough data to compare "
              f"(base N={base_deltas.size}, checkpoint N={ckpt_deltas.size}; "
              f"need at least 2 per condition)")
        return

    base_mean = base_deltas.mean()
    ckpt_mean = ckpt_deltas.mean()
    print(f"  Base mean delta:       {base_mean:+.2f}")
    print(f"  Checkpoint mean delta: {ckpt_mean:+.2f}")
    print(f"  Difference:            {(ckpt_mean - base_mean):+.2f} "
          f"(more negative = checkpoint better at pushing toward A)")

    t_stat, p_val = stats.ttest_ind(ckpt_deltas, base_deltas, equal_var=False)
    print(f"  Welch's t-test:        t={t_stat:.2f}, p={p_val:.4f}")
    if np.isnan(p_val):
        # Happens e.g. when both groups have zero variance and equal means.
        print("  → Test is undefined for this data; cannot assess significance")
    elif p_val < 0.05:
        winner = "checkpoint" if ckpt_mean < base_mean else "base"
        print(f"  → {winner} is significantly better at pushing toward A (p<0.05)")
    else:
        print("  → No significant difference")


if __name__ == "__main__":
    # Script entry point: fetch, flatten and summarize each condition listed
    # in REPOS, then compare the base condition against the checkpoint.
    token = os.getenv("HF_TOKEN")
    if not token:
        # An explicit exit instead of `assert`, which is stripped when Python
        # runs with -O and would let a missing token through.
        raise SystemExit("set HF_TOKEN")

    all_rows = {}
    for label, repo in REPOS.items():
        rows = extract_pairs(fetch_submissions(repo, token))
        all_rows[label] = rows
        summarize(label, rows)

    compare(all_rows["base"], all_rows["checkpoint"])