# Spaces: lspcloud / prolific_preferences (Sleeping)
# File size: 4,763 Bytes
# 0f4326e

"""
Analyze preference study results for base vs. fine-tuned checkpoint.

Persuasion direction: seller argues for Product A.
Scale: 1 = Definitely prefer A, 7 = Definitely prefer B.
Success = rating DECREASES (shifts toward A).
"""
import json
import os
from collections import defaultdict
from huggingface_hub import HfApi
import numpy as np


# Dataset repos that hold the study submissions, keyed by the condition
# label used when printing each summary.
REPOS = dict(
    base="ehejin/user_study-preference-base_REAL",
    checkpoint="ehejin/user_study-preference-base_DETAILED_checkpoint",
)


def fetch_submissions(repo_id: str, token: str) -> list:
    """Download and parse every submission JSON file in a dataset repo.

    Lists the files of the dataset repo ``repo_id``, keeps the paths that
    start with ``json/`` and end with ``.json``, downloads each of them and
    parses it as JSON.

    Args:
        repo_id: Identifier of the dataset repo to read.
        token: Access token used for both listing and downloading.

    Returns:
        One parsed JSON document per matching file, in the order the repo
        listing returned the paths.
    """
    api = HfApi(token=token)
    files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    json_files = [f for f in files if f.startswith("json/") and f.endswith(".json")]
    submissions = []
    for filepath in json_files:
        local = api.hf_hub_download(
            repo_id=repo_id, filename=filepath,
            repo_type="dataset", token=token,
        )
        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default locale encoding.
        with open(local, encoding="utf-8") as f:
            submissions.append(json.load(f))
    return submissions


def extract_pairs(submissions: list) -> list:
    """Flatten submissions into one row per rated item.

    Args:
        submissions: Parsed submission documents (dicts). Each may carry an
            ``items`` list whose entries hold ``pre_rating``,
            ``post_rating`` and, optionally, ``pair_id``.

    Returns:
        A list of dicts with the keys ``pre``, ``post``, ``delta``
        (``post - pre``) and ``pair_id`` (``""`` when the item has none).
        Items missing either rating are skipped.
    """
    rows = []
    for sub in submissions:
        # ``items`` may be absent or explicitly null; both mean "no items".
        for item in sub.get("items") or []:
            pre = item.get("pre_rating")
            post = item.get("post_rating")
            if pre is None or post is None:
                continue
            rows.append({
                "pre":     pre,
                "post":    post,
                "delta":   post - pre,
                "pair_id": item.get("pair_id", ""),
            })
    return rows


def summarize(label: str, rows: list) -> None:
    """Print descriptive statistics of the rating shifts for one condition.

    Args:
        label: Name of the condition, shown in the section header.
        rows: Dicts with ``pre``, ``post`` and ``delta`` keys. When empty, a
            single "no data" line is printed instead of the report.
    """
    if not rows:
        print(f"\n── {label}: no data ──")
        return

    total = len(rows)
    shifts = np.array([row["delta"] for row in rows])
    before = np.array([row["pre"] for row in rows])
    after = np.array([row["post"] for row in rows])

    # A negative delta means the rating moved toward A, the seller's goal.
    toward_a = np.sum(shifts < 0)
    toward_b = np.sum(shifts > 0)
    unchanged = np.sum(shifts == 0)

    report = [
        f"\n── {label} ──",
        f"  N item-reviews:            {total}",
        f"  Mean pre-rating:           {before.mean():.2f}  (1=A, 7=B)",
        f"  Mean post-rating:          {after.mean():.2f}",
        f"  Mean delta:                {shifts.mean():+.2f}  (negative = shifted toward A ✓)",
        f"  Median delta:              {np.median(shifts):+.1f}",
        f"  Shifted toward A (✓ win):  {toward_a}  ({100*toward_a/total:.1f}%)",
        f"  No change:                 {unchanged}  ({100*unchanged/total:.1f}%)",
        f"  Shifted toward B (✗):      {toward_b}  ({100*toward_b/total:.1f}%)",
    ]
    print("\n".join(report))

    # Conditional view: of the items that started on the B side of the
    # midpoint (pre > 4), how many moved toward A?
    leaning_b = [row["delta"] for row in rows if row["pre"] > 4]
    if leaning_b:
        moved = sum(d < 0 for d in leaning_b)
        print(f"  Among pre>4 (N={len(leaning_b)}): {moved} shifted toward A "
              f"({100*moved/len(leaning_b):.1f}%)")

    # Same question for items that started exactly at the midpoint.
    at_midpoint = [row["delta"] for row in rows if row["pre"] == 4]
    if at_midpoint:
        moved = sum(d < 0 for d in at_midpoint)
        print(f"  Among pre=4 (N={len(at_midpoint)}): {moved} shifted toward A "
              f"({100*moved/len(at_midpoint):.1f}%)")


def compare(base_rows: list, ckpt_rows: list) -> None:
    """Print a base-vs-checkpoint comparison of the rating shifts.

    Reports each condition's mean delta, the difference between them, and a
    two-sided Welch's t-test (unequal variances) on the per-item deltas. If
    either condition has fewer than two rows, the test cannot be computed;
    a short notice is printed instead of NaN statistics.

    Args:
        base_rows: Rows (dicts with a ``delta`` key) for the base condition.
        ckpt_rows: Rows (dicts with a ``delta`` key) for the checkpoint
            condition.
    """
    from scipy import stats

    base_deltas = np.array([r["delta"] for r in base_rows])
    ckpt_deltas = np.array([r["delta"] for r in ckpt_rows])

    print("\n── Comparison ──")

    # Welch's t-test needs a variance estimate per group, i.e. at least two
    # observations each. Without this guard the means/test come out as NaN
    # and the report would wrongly read "No significant difference".
    if len(base_deltas) < 2 or len(ckpt_deltas) < 2:
        print(f"  → Not enough data to compare "
              f"(base N={len(base_deltas)}, checkpoint N={len(ckpt_deltas)}; "
              f"need at least 2 each)")
        return

    base_mean = base_deltas.mean()
    ckpt_mean = ckpt_deltas.mean()
    print(f"  Base mean delta:       {base_mean:+.2f}")
    print(f"  Checkpoint mean delta: {ckpt_mean:+.2f}")
    print(f"  Difference:            {(ckpt_mean - base_mean):+.2f} "
          f"(more negative = checkpoint better at pushing toward A)")

    # NOTE(review): every item-review is treated as an independent sample;
    # if one participant contributes several items this may overstate
    # significance — confirm against the study design.
    t_stat, p_val = stats.ttest_ind(ckpt_deltas, base_deltas, equal_var=False)
    print(f"  Welch's t-test:        t={t_stat:.2f}, p={p_val:.4f}")
    if p_val < 0.05:
        winner = "checkpoint" if ckpt_mean < base_mean else "base"
        print(f"  → {winner} is significantly better at pushing toward A (p<0.05)")
    else:
        print("  → No significant difference")


if __name__ == "__main__":
    # Script entry point: summarize each condition in REPOS, then compare
    # the base condition against the checkpoint condition.
    token = os.getenv("HF_TOKEN")
    # Explicit check rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would let a missing token slip through.
    if not token:
        raise SystemExit("set HF_TOKEN")

    all_rows = {}
    for label, repo in REPOS.items():
        subs = fetch_submissions(repo, token)
        rows = extract_pairs(subs)
        all_rows[label] = rows
        summarize(label, rows)

    compare(all_rows["base"], all_rows["checkpoint"])