# prolific_preferences / analysis.py
# ehejin's picture
# sync w/ detailed repo
# 0f4326e
"""
Analyze preference study results for base vs. fine-tuned checkpoint.
Persuasion direction: seller argues for Product A.
Scale: 1 = Definitely prefer A, 7 = Definitely prefer B.
Success = rating DECREASES (shifts toward A).
"""
import json
import os
from collections import defaultdict
from huggingface_hub import HfApi
import numpy as np
# Dataset repositories to analyze, keyed by the condition label used in the
# printed report. Insertion order is the order in which they are summarized.
REPOS = dict(
    base="ehejin/user_study-preference-base_REAL",
    checkpoint="ehejin/user_study-preference-base_DETAILED_checkpoint",
)
def fetch_submissions(repo_id: str, token: str) -> list:
    """Download and parse the JSON submissions stored in a Hub dataset repo.

    Only files whose path starts with ``json/`` and ends with ``.json`` are
    considered.

    Args:
        repo_id: Identifier of the dataset repository to read.
        token: Access token used both for listing and for downloading files.

    Returns:
        One parsed JSON document per matching file, in the order the files
        are listed by the Hub.

    Raises:
        json.JSONDecodeError: If a downloaded file is not valid JSON.
    """
    api = HfApi(token=token)
    json_files = [
        path
        for path in api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        if path.startswith("json/") and path.endswith(".json")
    ]

    submissions = []
    for filepath in json_files:
        local = api.hf_hub_download(
            repo_id=repo_id, filename=filepath,
            repo_type="dataset", token=token,
        )
        # JSON is UTF-8 by specification; do not rely on the platform's
        # default text encoding (which is not UTF-8 everywhere).
        with open(local, encoding="utf-8") as f:
            submissions.append(json.load(f))
    return submissions
def extract_pairs(submissions: list) -> list:
    """Flatten submissions into one row per rated item.

    Args:
        submissions: Parsed submission dicts. Each may carry an ``"items"``
            list whose entries hold ``"pre_rating"``, ``"post_rating"`` and
            optionally ``"pair_id"``.

    Returns:
        A list of dicts with keys ``"pre"``, ``"post"``, ``"delta"``
        (``post - pre``) and ``"pair_id"`` (``""`` when absent). Items that
        lack either rating are skipped.
    """
    rows = []
    for sub in submissions:
        # `or []` also covers an explicit `"items": null`, which a plain
        # `.get("items", [])` would hand back as None and crash the loop.
        for item in sub.get("items") or []:
            pre = item.get("pre_rating")
            post = item.get("post_rating")
            if pre is None or post is None:
                continue
            rows.append({
                "pre": pre,
                "post": post,
                "delta": post - pre,
                "pair_id": item.get("pair_id", ""),
            })
    return rows
def summarize(label: str, rows: list) -> None:
    """Print descriptive statistics for one condition.

    Args:
        label: Name of the condition, shown in the section header.
        rows: Per-item dicts with at least ``"pre"``, ``"post"`` and
            ``"delta"`` keys (as produced by ``extract_pairs``).

    Prints the mean ratings, mean/median delta, the share of items that
    shifted toward A / stayed / shifted toward B, and the shift rate among
    items that started leaning B (pre > 4) or neutral (pre == 4). Prints a
    "no data" line and returns when ``rows`` is empty.
    """
    if not rows:
        print(f"\n── {label}: no data ──")
        return

    n = len(rows)
    deltas = np.array([r["delta"] for r in rows])
    pres = np.array([r["pre"] for r in rows])
    posts = np.array([r["post"] for r in rows])

    # Negative delta = shifted toward A (success for the seller)
    shifted_toward_a = np.sum(deltas < 0)
    shifted_toward_b = np.sum(deltas > 0)
    no_change = np.sum(deltas == 0)

    # The check/cross marks below were previously stored as mis-decoded
    # UTF-8 (mojibake); they are written as the intended glyphs here.
    print(f"\n── {label} ──")
    print(f" N item-reviews: {n}")
    print(f" Mean pre-rating: {pres.mean():.2f} (1=A, 7=B)")
    print(f" Mean post-rating: {posts.mean():.2f}")
    print(f" Mean delta: {deltas.mean():+.2f} (negative = shifted toward A ✓)")
    print(f" Median delta: {np.median(deltas):+.1f}")
    print(f" Shifted toward A (✓ win): {shifted_toward_a} ({100*shifted_toward_a/n:.1f}%)")
    print(f" No change: {no_change} ({100*no_change/n:.1f}%)")
    print(f" Shifted toward B (✗): {shifted_toward_b} ({100*shifted_toward_b/n:.1f}%)")

    # Conditional on lean: among those who started leaning B (pre > 4),
    # how often did we shift them?
    started_toward_b = [r for r in rows if r["pre"] > 4]
    if started_toward_b:
        shifted = sum(1 for r in started_toward_b if r["delta"] < 0)
        print(f" Among pre>4 (N={len(started_toward_b)}): {shifted} shifted toward A "
              f"({100*shifted/len(started_toward_b):.1f}%)")

    # Same question for respondents who started exactly at the midpoint.
    started_neutral = [r for r in rows if r["pre"] == 4]
    if started_neutral:
        shifted = sum(1 for r in started_neutral if r["delta"] < 0)
        print(f" Among pre=4 (N={len(started_neutral)}): {shifted} shifted toward A "
              f"({100*shifted/len(started_neutral):.1f}%)")
def compare(base_rows: list, ckpt_rows: list) -> None:
    """Print a between-condition comparison of rating deltas.

    Args:
        base_rows: Per-item dicts for the base condition; only ``"delta"``
            is read.
        ckpt_rows: Per-item dicts for the checkpoint condition; only
            ``"delta"`` is read.

    Prints both mean deltas, their difference, and the result of a Welch
    (unequal-variance) two-sample t-test. If either group has fewer than two
    observations the test is skipped and a short notice is printed instead.
    """
    base_deltas = np.array([r["delta"] for r in base_rows])
    ckpt_deltas = np.array([r["delta"] for r in ckpt_rows])

    print("\n── Comparison ──")

    # A sample variance needs at least two observations per group; without
    # this guard the report would show nan means/statistics and then claim
    # "No significant difference".
    if len(base_deltas) < 2 or len(ckpt_deltas) < 2:
        print(f" Not enough data to compare "
              f"(base N={len(base_deltas)}, checkpoint N={len(ckpt_deltas)}; need at least 2 each)")
        return

    # Imported here so scipy is only required when a comparison is run.
    from scipy import stats

    print(f" Base mean delta: {base_deltas.mean():+.2f}")
    print(f" Checkpoint mean delta: {ckpt_deltas.mean():+.2f}")
    print(f" Difference: {(ckpt_deltas.mean() - base_deltas.mean()):+.2f} "
          f"(more negative = checkpoint better at pushing toward A)")

    t_stat, p_val = stats.ttest_ind(ckpt_deltas, base_deltas, equal_var=False)
    print(f" Welch's t-test: t={t_stat:.2f}, p={p_val:.4f}")

    # The arrow below was previously stored as mis-decoded UTF-8 (mojibake).
    if p_val < 0.05:
        # Lower (more negative) mean delta means a stronger push toward A.
        winner = "checkpoint" if ckpt_deltas.mean() < base_deltas.mean() else "base"
        print(f" → {winner} is significantly better at pushing toward A (p<0.05)")
    else:
        print(" → No significant difference")
if __name__ == "__main__":
    token = os.getenv("HF_TOKEN")
    # Explicit check rather than `assert`, which is removed under `python -O`
    # and would let a missing token slip through to the first Hub call.
    if not token:
        raise SystemExit("set HF_TOKEN")

    # Summarize each condition as it is fetched, keeping the rows so the two
    # conditions can be compared afterwards.
    all_rows = {}
    for label, repo in REPOS.items():
        subs = fetch_submissions(repo, token)
        rows = extract_pairs(subs)
        all_rows[label] = rows
        summarize(label, rows)

    compare(all_rows["base"], all_rows["checkpoint"])