""" Analyze preference study results for base vs. fine-tuned checkpoint. Persuasion direction: seller argues for Product A. Scale: 1 = Definitely prefer A, 7 = Definitely prefer B. Success = rating DECREASES (shifts toward A). """ import json import os from collections import defaultdict from huggingface_hub import HfApi import numpy as np REPOS = { "base": "ehejin/user_study-preference-base_REAL", "checkpoint": "ehejin/user_study-preference-base_DETAILED_checkpoint", } def fetch_submissions(repo_id: str, token: str) -> list: api = HfApi(token=token) files = list(api.list_repo_files(repo_id=repo_id, repo_type="dataset")) json_files = [f for f in files if f.startswith("json/") and f.endswith(".json")] submissions = [] for filepath in json_files: local = api.hf_hub_download( repo_id=repo_id, filename=filepath, repo_type="dataset", token=token, ) with open(local) as f: submissions.append(json.load(f)) return submissions def extract_pairs(submissions: list) -> list: """Flatten to per-item: (pre_rating, post_rating, delta, pair_id).""" rows = [] for sub in submissions: for item in sub.get("items", []): pre = item.get("pre_rating") post = item.get("post_rating") if pre is None or post is None: continue rows.append({ "pre": pre, "post": post, "delta": post - pre, "pair_id": item.get("pair_id", ""), }) return rows def summarize(label: str, rows: list) -> None: if not rows: print(f"\n── {label}: no data ──") return deltas = np.array([r["delta"] for r in rows]) pres = np.array([r["pre"] for r in rows]) posts = np.array([r["post"] for r in rows]) # Negative delta = shifted toward A (success for the seller) shifted_toward_a = np.sum(deltas < 0) shifted_toward_b = np.sum(deltas > 0) no_change = np.sum(deltas == 0) print(f"\n── {label} ──") print(f" N item-reviews: {len(rows)}") print(f" Mean pre-rating: {pres.mean():.2f} (1=A, 7=B)") print(f" Mean post-rating: {posts.mean():.2f}") print(f" Mean delta: {deltas.mean():+.2f} (negative = shifted toward A ✓)") print(f" Median delta: {np.median(deltas):+.1f}") print(f" Shifted toward A (✓ win): {shifted_toward_a} ({100*shifted_toward_a/len(rows):.1f}%)") print(f" No change: {no_change} ({100*no_change/len(rows):.1f}%)") print(f" Shifted toward B (✗): {shifted_toward_b} ({100*shifted_toward_b/len(rows):.1f}%)") # Conditional on lean: among those who started leaning B (pre > 4), how often did we shift them? started_toward_b = [r for r in rows if r["pre"] > 4] if started_toward_b: shifted = sum(1 for r in started_toward_b if r["delta"] < 0) print(f" Among pre>4 (N={len(started_toward_b)}): {shifted} shifted toward A " f"({100*shifted/len(started_toward_b):.1f}%)") started_neutral = [r for r in rows if r["pre"] == 4] if started_neutral: shifted = sum(1 for r in started_neutral if r["delta"] < 0) print(f" Among pre=4 (N={len(started_neutral)}): {shifted} shifted toward A " f"({100*shifted/len(started_neutral):.1f}%)") def compare(base_rows: list, ckpt_rows: list) -> None: from scipy import stats base_deltas = np.array([r["delta"] for r in base_rows]) ckpt_deltas = np.array([r["delta"] for r in ckpt_rows]) print(f"\n── Comparison ──") print(f" Base mean delta: {base_deltas.mean():+.2f}") print(f" Checkpoint mean delta: {ckpt_deltas.mean():+.2f}") print(f" Difference: {(ckpt_deltas.mean() - base_deltas.mean()):+.2f} " f"(more negative = checkpoint better at pushing toward A)") t_stat, p_val = stats.ttest_ind(ckpt_deltas, base_deltas, equal_var=False) print(f" Welch's t-test: t={t_stat:.2f}, p={p_val:.4f}") if p_val < 0.05: winner = "checkpoint" if ckpt_deltas.mean() < base_deltas.mean() else "base" print(f" → {winner} is significantly better at pushing toward A (p<0.05)") else: print(f" → No significant difference") if __name__ == "__main__": token = os.getenv("HF_TOKEN") assert token, "set HF_TOKEN" all_rows = {} for label, repo in REPOS.items(): subs = fetch_submissions(repo, token) rows = extract_pairs(subs) all_rows[label] = rows summarize(label, rows) compare(all_rows["base"], all_rows["checkpoint"])