Spaces:

lspcloud
/

prolific_preferences

Running

App Files Files Community

prolific_preferences / analysis.py

ehejin

sync w/ detailed repo

0f4326e 21 days ago

raw

history blame contribute delete

4.76 kB

	"""
	Analyze preference study results for base vs. fine-tuned checkpoint.

	Persuasion direction: seller argues for Product A.
	Scale: 1 = Definitely prefer A, 7 = Definitely prefer B.
	Success = rating DECREASES (shifts toward A).
	"""
	import json
	import os
	from collections import defaultdict
	from huggingface_hub import HfApi
	import numpy as np


	# Study condition label -> Hugging Face Hub repo id. The entry point passes
	# each id to fetch_submissions(), which reads it as a *dataset* repo, and
	# looks the results up again under the "base" / "checkpoint" labels.
	REPOS = {
	"base": "ehejin/user_study-preference-base_REAL",
	"checkpoint": "ehejin/user_study-preference-base_DETAILED_checkpoint",
	}


	def fetch_submissions(repo_id: str, token: str) -> list:
	    """Download and parse every JSON submission stored in a Hub dataset repo.

	    Lists the files of the dataset repo ``repo_id``, keeps the paths that
	    start with ``json/`` and end with ``.json``, downloads each one and
	    parses it with ``json.load``.

	    Args:
	        repo_id: Id of the dataset repository on the Hugging Face Hub.
	        token: Access token used both for listing and for downloading.

	    Returns:
	        One parsed JSON document per matching file, in listing order.
	    """
	    api = HfApi(token=token)
	    json_files = [
	        path
	        for path in api.list_repo_files(repo_id=repo_id, repo_type="dataset")
	        if path.startswith("json/") and path.endswith(".json")
	    ]

	    submissions = []
	    for filepath in json_files:
	        local = api.hf_hub_download(
	            repo_id=repo_id,
	            filename=filepath,
	            repo_type="dataset",
	            token=token,
	        )
	        # Read as UTF-8 explicitly (the JSON interchange encoding) instead of
	        # whatever the platform default happens to be.
	        with open(local, encoding="utf-8") as f:
	            submissions.append(json.load(f))
	    return submissions


	def extract_pairs(submissions: list) -> list:
	    """Flatten submissions into one row per rated item.

	    Args:
	        submissions: Parsed submission dicts; each may carry an ``"items"``
	            list of dicts with ``pre_rating``, ``post_rating`` and
	            (optionally) ``pair_id``.

	    Returns:
	        A list of dicts with keys ``pre``, ``post``, ``delta``
	        (``post - pre``) and ``pair_id`` (``""`` when the item has none).
	        Items missing either rating are skipped; a submission whose
	        ``"items"`` is absent or null contributes no rows.
	    """
	    rows = []
	    for sub in submissions:
	        # `or []` covers an explicit null as well as a missing key;
	        # `.get("items", [])` would hand back None and fail on iteration.
	        for item in sub.get("items") or []:
	            pre = item.get("pre_rating")
	            post = item.get("post_rating")
	            if pre is None or post is None:
	                continue
	            rows.append({
	                "pre": pre,
	                "post": post,
	                "delta": post - pre,
	                "pair_id": item.get("pair_id", ""),
	            })
	    return rows


	def summarize(label: str, rows: list) -> None:
	    """Print descriptive statistics for one condition's item-level rows.

	    Args:
	        label: Name shown in the section header.
	        rows: Dicts with numeric ``pre``, ``post`` and ``delta`` entries.

	    Prints counts, mean ratings, mean/median delta, the share of rows that
	    moved toward A / stayed / moved toward B, and the toward-A share among
	    rows that started at pre > 4 and at pre == 4. Prints a "no data" header
	    and returns when ``rows`` is empty.
	    """
	    if not rows:
	        print(f"\n── {label}: no data ──")
	        return

	    total = len(rows)
	    delta_vals = np.array([row["delta"] for row in rows])
	    pre_vals = np.array([row["pre"] for row in rows])
	    post_vals = np.array([row["post"] for row in rows])

	    def share(count):
	        # "<count> (<percent of all rows>%)"
	        return f"{count} ({100*count/total:.1f}%)"

	    # A negative delta means the rating moved toward A (seller success).
	    toward_a = np.sum(delta_vals < 0)
	    toward_b = np.sum(delta_vals > 0)
	    unchanged = np.sum(delta_vals == 0)

	    print(f"\n── {label} ──")
	    print(f" N item-reviews: {total}")
	    print(f" Mean pre-rating: {pre_vals.mean():.2f} (1=A, 7=B)")
	    print(f" Mean post-rating: {post_vals.mean():.2f}")
	    print(f" Mean delta: {delta_vals.mean():+.2f} (negative = shifted toward A ✓)")
	    print(f" Median delta: {np.median(delta_vals):+.1f}")
	    print(f" Shifted toward A (✓ win): {share(toward_a)}")
	    print(f" No change: {share(unchanged)}")
	    print(f" Shifted toward B (✗): {share(toward_b)}")

	    # Toward-A rate conditional on the starting lean: first the rows that
	    # began leaning B (pre > 4), then the rows that began neutral (pre == 4).
	    subgroups = (
	        ("pre>4", lambda rating: rating > 4),
	        ("pre=4", lambda rating: rating == 4),
	    )
	    for tag, in_group in subgroups:
	        members = [row for row in rows if in_group(row["pre"])]
	        if not members:
	            continue
	        moved = len([row for row in members if row["delta"] < 0])
	        print(f" Among {tag} (N={len(members)}): {moved} shifted toward A "
	              f"({100*moved/len(members):.1f}%)")


	def compare(base_rows: list, ckpt_rows: list) -> None:
	    """Print a base-vs-checkpoint comparison of per-item rating deltas.

	    Reports each group's mean delta, their difference (checkpoint minus
	    base), and a Welch's (unequal-variance) two-sample t-test on the deltas.
	    A lower mean delta is treated as better at pushing toward A.

	    Args:
	        base_rows: Dicts with a numeric ``delta`` for the base condition.
	        ckpt_rows: Dicts with a numeric ``delta`` for the checkpoint condition.

	    When a group is empty nothing can be compared, and when a group has
	    fewer than two rows the t-test is skipped; in both cases a message says
	    so instead of printing NaN statistics and a "no significant difference"
	    verdict.
	    """
	    base_deltas = np.array([r["delta"] for r in base_rows])
	    ckpt_deltas = np.array([r["delta"] for r in ckpt_rows])
	    n_base, n_ckpt = len(base_deltas), len(ckpt_deltas)

	    print("\n── Comparison ──")
	    if n_base == 0 or n_ckpt == 0:
	        # The mean of an empty array is NaN; report the gap explicitly.
	        print(f" Nothing to compare: base N={n_base}, checkpoint N={n_ckpt}")
	        return

	    base_mean = base_deltas.mean()
	    ckpt_mean = ckpt_deltas.mean()
	    print(f" Base mean delta: {base_mean:+.2f}")
	    print(f" Checkpoint mean delta: {ckpt_mean:+.2f}")
	    print(f" Difference: {(ckpt_mean - base_mean):+.2f} "
	          f"(more negative = checkpoint better at pushing toward A)")

	    if n_base < 2 or n_ckpt < 2:
	        # A sample variance needs at least two observations per group.
	        print(f" Welch's t-test: skipped (base N={n_base}, checkpoint N={n_ckpt}; "
	              f"need at least 2 per group)")
	        return

	    # Imported lazily so the early-exit paths above do not require scipy.
	    from scipy import stats

	    # NOTE(review): rows are item-level; if several rows come from the same
	    # participant they are not independent, which this test assumes - confirm.
	    t_stat, p_val = stats.ttest_ind(ckpt_deltas, base_deltas, equal_var=False)
	    print(f" Welch's t-test: t={t_stat:.2f}, p={p_val:.4f}")
	    if np.isnan(p_val):
	        # e.g. both groups have zero variance; `nan < 0.05` is False, which
	        # would otherwise be reported as "no significant difference".
	        print(" → t-test undefined; cannot assess significance")
	    elif p_val < 0.05:
	        winner = "checkpoint" if ckpt_mean < base_mean else "base"
	        print(f" → {winner} is significantly better at pushing toward A (p<0.05)")
	    else:
	        print(" → No significant difference")


	if __name__ == "__main__":
	    # Script entry point: fetch every condition listed in REPOS, print its
	    # summary, then print the base-vs-checkpoint comparison.
	    token = os.getenv("HF_TOKEN")
	    if not token:
	        # Explicit exit rather than `assert`, which is stripped under
	        # `python -O` and would let a missing token through.
	        raise SystemExit("set HF_TOKEN")

	    all_rows = {}
	    for label, repo in REPOS.items():
	        rows = extract_pairs(fetch_submissions(repo, token))
	        all_rows[label] = rows
	        summarize(label, rows)

	    compare(all_rows["base"], all_rows["checkpoint"])