Spaces:
Running
Running
"""
Analyze preference-study results for the base model vs. the fine-tuned checkpoint.

Persuasion direction: the seller argues for Product A.
Scale: 1 = Definitely prefer A, 4 = neutral, 7 = Definitely prefer B.
Success = the rating DECREASES after persuasion (delta = post - pre < 0, a shift toward A).
"""
| import json | |
| import os | |
| from collections import defaultdict | |
| from huggingface_hub import HfApi | |
| import numpy as np | |
# Dataset repos that hold the study submissions, keyed by the label under
# which each one is summarized and compared.
REPOS = dict(
    base="ehejin/user_study-preference-base_REAL",
    checkpoint="ehejin/user_study-preference-base_DETAILED_checkpoint",
)
def fetch_submissions(repo_id: str, token: str) -> list:
    """Download and parse every ``json/*.json`` file of a dataset repo.

    Args:
        repo_id: Identifier of the dataset repo to read.
        token: Access token passed to ``HfApi`` and to each download.

    Returns:
        A list with one parsed JSON document per matching file, in the
        order the repo listing returned them.
    """
    api = HfApi(token=token)
    repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    json_files = [
        f for f in repo_files if f.startswith("json/") and f.endswith(".json")
    ]
    submissions = []
    for filepath in json_files:
        local = api.hf_hub_download(
            repo_id=repo_id,
            filename=filepath,
            repo_type="dataset",
            token=token,
        )
        # JSON text is UTF-8; do not depend on the platform default encoding
        # (which is not UTF-8 on every system).
        with open(local, encoding="utf-8") as f:
            submissions.append(json.load(f))
    return submissions
def extract_pairs(submissions: list) -> list:
    """Flatten submissions into one row per fully rated item.

    Args:
        submissions: Parsed submission dicts. Each may carry an ``"items"``
            list whose entries hold ``pre_rating``, ``post_rating`` and,
            optionally, ``pair_id``.

    Returns:
        A list of dicts with the keys ``pre``, ``post``, ``delta``
        (``post - pre``) and ``pair_id`` (``""`` when the item has none).
        Items missing either rating are skipped.
    """
    rows = []
    for sub in submissions:
        # "items" can be absent or an explicit JSON null; both mean "no items".
        for item in sub.get("items") or []:
            pre = item.get("pre_rating")
            post = item.get("post_rating")
            if pre is None or post is None:
                continue
            rows.append({
                "pre": pre,
                "post": post,
                "delta": post - pre,
                "pair_id": item.get("pair_id", ""),
            })
    return rows
def summarize(label: str, rows: list) -> None:
    """Print descriptive statistics for one set of item-review rows.

    Args:
        label: Name shown in the section header.
        rows: Dicts carrying at least the ``pre``, ``post`` and ``delta`` keys.

    When ``rows`` is empty only a "no data" header is printed.
    """
    # NOTE(review): the "β" glyphs in the messages below look like mis-decoded
    # Unicode symbols; they are reproduced unchanged — confirm the intended
    # characters against the original file.
    if not rows:
        print(f"\nββ {label}: no data ββ")
        return

    total = len(rows)
    pres, posts, deltas = (
        np.array([row[key] for row in rows]) for key in ("pre", "post", "delta")
    )

    # A negative delta means the rating moved toward A (the seller's goal).
    toward_a = np.sum(deltas < 0)
    toward_b = np.sum(deltas > 0)
    unchanged = np.sum(deltas == 0)

    print(f"\nββ {label} ββ")
    print(f" N item-reviews: {total}")
    print(f" Mean pre-rating: {pres.mean():.2f} (1=A, 7=B)")
    print(f" Mean post-rating: {posts.mean():.2f}")
    print(f" Mean delta: {deltas.mean():+.2f} (negative = shifted toward A β)")
    print(f" Median delta: {np.median(deltas):+.1f}")
    for caption, count in (
        (" Shifted toward A (β win)", toward_a),
        (" No change", unchanged),
        (" Shifted toward B (β)", toward_b),
    ):
        print(f"{caption}: {count} ({100*count/total:.1f}%)")

    def report_shift_rate(tag, subgroup):
        # Share of a starting-position subgroup that ended up closer to A.
        if not subgroup:
            return
        moved = len([row for row in subgroup if row["delta"] < 0])
        print(f" Among {tag} (N={len(subgroup)}): {moved} shifted toward A "
              f"({100*moved/len(subgroup):.1f}%)")

    # Conditional on the starting lean: leaning B (pre > 4), then neutral (4).
    report_shift_rate("pre>4", [row for row in rows if row["pre"] > 4])
    report_shift_rate("pre=4", [row for row in rows if row["pre"] == 4])
def compare(base_rows: list, ckpt_rows: list) -> None:
    """Print the base-vs-checkpoint difference in rating shift.

    Reports each group's mean delta, their difference, and a Welch's t-test
    on the per-row deltas. When a group is too small for the statistic to be
    defined, that is reported explicitly rather than printing NaN values
    followed by "No significant difference".

    Args:
        base_rows: Row dicts (with a ``delta`` key) for the base model.
        ckpt_rows: Row dicts (with a ``delta`` key) for the checkpoint.
    """
    base_deltas = np.array([r["delta"] for r in base_rows])
    ckpt_deltas = np.array([r["delta"] for r in ckpt_rows])
    print("\nββ Comparison ββ")

    # The mean of an empty array is NaN, so there is nothing to compare.
    if base_deltas.size == 0 or ckpt_deltas.size == 0:
        print(" Not enough data to compare (a group has no item-reviews)")
        return

    base_mean = base_deltas.mean()
    ckpt_mean = ckpt_deltas.mean()
    print(f" Base mean delta: {base_mean:+.2f}")
    print(f" Checkpoint mean delta: {ckpt_mean:+.2f}")
    print(f" Difference: {(ckpt_mean - base_mean):+.2f} "
          f"(more negative = checkpoint better at pushing toward A)")

    # Welch's test needs a variance estimate, i.e. >= 2 rows in each group.
    if min(base_deltas.size, ckpt_deltas.size) < 2:
        print(" Welch's t-test: skipped (needs at least 2 item-reviews per group)")
        return

    # Imported lazily so the guards above work even without scipy installed.
    from scipy import stats

    # NOTE(review): rows are item-reviews and one submission can contribute
    # several of them, so the test's independence assumption may not hold —
    # confirm whether a per-participant analysis is needed.
    t_stat, p_val = stats.ttest_ind(ckpt_deltas, base_deltas, equal_var=False)
    if np.isnan(p_val):
        # A NaN p-value would otherwise fall into the "not significant" branch.
        print(" Welch's t-test: undefined (p-value is NaN, e.g. no variance in the deltas)")
        return

    print(f" Welch's t-test: t={t_stat:.2f}, p={p_val:.4f}")
    if p_val < 0.05:
        winner = "checkpoint" if ckpt_mean < base_mean else "base"
        print(f" β {winner} is significantly better at pushing toward A (p<0.05)")
    else:
        print(" β No significant difference")
def main() -> None:
    """Fetch every repo in ``REPOS``, print its summary, then the comparison.

    Raises:
        SystemExit: If the ``HF_TOKEN`` environment variable is unset or empty.
    """
    token = os.getenv("HF_TOKEN")
    if not token:
        # An ``assert`` is stripped under ``python -O``; fail explicitly instead.
        raise SystemExit("set HF_TOKEN")

    all_rows = {}
    for label, repo in REPOS.items():
        rows = extract_pairs(fetch_submissions(repo, token))
        all_rows[label] = rows
        summarize(label, rows)
    compare(all_rows["base"], all_rows["checkpoint"])


if __name__ == "__main__":
    main()