Spaces:

xleaps
/

sgo

Running

sgo / scripts /evaluate.py

Eric Xu

Fix XSS, VJP contract, binding, session security, name collisions, and error handling

ff413e9 unverified 3 days ago

11 kB

	"""
	f(θ, x) evaluator — scores an entity against an evaluator cohort.

	The LLM inhabits each evaluator's persona and produces a structured assessment
	of the entity. Domain-agnostic: the system prompt adapts to the entity type.

	Usage:
	uv run python scripts/evaluate.py \
	--entity entities/my_product.md \
	--cohort data/cohort.json \
	--tag baseline \
	--parallel 5
	"""

	import json
	import os
	import re
	import time
	import argparse
	import concurrent.futures
	from collections import Counter
	from datetime import datetime
	from pathlib import Path

	from dotenv import load_dotenv

	# Project root: two directory levels above this script's resolved location.
	PROJECT_ROOT = Path(__file__).resolve().parent.parent
	# Read settings from the project-level .env file; main() later fetches the
	# LLM_* values with os.getenv.
	load_dotenv(PROJECT_ROOT / ".env")

	from openai import OpenAI


	# Default system message. evaluate_one() sends it whenever no explicit
	# system_prompt is supplied, and main() uses it as the base that the optional
	# bias-calibration addendum is appended to.
	SYSTEM_PROMPT = """You are an evaluation simulator. You will be given:
	1. A detailed persona — a person with specific values, needs, context, and perspective
	2. An entity to evaluate (a product, profile, proposal, pitch, resume, etc.)

	Your job: fully inhabit this persona's perspective and evaluate the entity AS THEY WOULD.

	Be honest and realistic. Not everything is a match. Consider:
	- Their specific needs, budget, constraints, and priorities
	- Whether this entity solves a real problem for them
	- Trust signals and red flags from their perspective
	- Practical fit with their situation
	- What they'd compare this against

	You MUST respond with valid JSON only."""

	# Optional bias-aware addendum, appended to SYSTEM_PROMPT when --bias-calibration is used.
	# Inspired by CoBRA (Liu et al., CHI'26, arXiv:2509.13588).
	# The text starts with a blank line so it can be concatenated directly onto
	# the end of SYSTEM_PROMPT (main() does `sys_prompt += BIAS_CALIBRATION_ADDENDUM`).
	BIAS_CALIBRATION_ADDENDUM = """

	Important evaluation guidelines for realistic assessment:
	- Evaluate the SUBSTANCE of the entity, not its rhetorical framing. A gain-framed
	description ("save 30%") and a loss-framed description ("stop wasting 30%") should
	receive similar scores if the underlying value is the same.
	- Weight authority signals (certifications, press mentions, investor logos) proportionally
	to how much this persona's real-world counterpart would actually verify and value them.
	- The ORDER in which information appears should not affect your score. Evaluate the
	complete picture, not just first impressions.
	- Real people have genuine cognitive biases — you should too. But calibrate to realistic
	human levels, not LLM defaults. A credential matters, but it's not everything."""

	# User-message template for a single evaluation. It is filled in with
	# str.format(), so the literal braces of the JSON skeleton are doubled
	# ({{ and }}) while the single-brace fields are substituted.
	# The three allowed "action" values are separated by a plain "|": a
	# backslash-escaped pipe is not a valid Python escape sequence and would put
	# literal backslashes into the prompt sent to the model.
	EVAL_PROMPT = """## Evaluator Persona

	Name: {name}
	Age: {age}
	Location: {city}, {state}
	Education: {education_level}
	Occupation: {occupation}
	Status: {marital_status}

	{persona}

	---

	## Entity to Evaluate

	{entity}

	---

	## Task

	Inhabit {name}'s perspective completely. Evaluate this entity as they would.

	Return JSON:
	{{
	"score": <1-10, where 1=strong reject, 5=ambivalent, 10=enthusiastic yes>,
	"action": "<positive | neutral | negative>",
	"attractions": ["<what works for them, max 3>"],
	"concerns": ["<what gives them pause, max 3>"],
	"dealbreakers": ["<hard no's if any, empty list if none>"],
	"summary": "<1-2 sentences — how they'd describe this to a peer>",
	"reasoning": "<2-3 sentence internal monologue>"
	}}"""


	def evaluate_one(client, model, evaluator, entity_text, system_prompt=None):
	    """Ask the model to assess one entity from a single evaluator's perspective.

	    Args:
	        client: Object exposing ``chat.completions.create`` (``main`` passes an
	            ``OpenAI`` client).
	        model: Model name forwarded to the completion call.
	        evaluator: Persona dict. ``name`` is required; ``age``, ``city``,
	            ``state``, ``education_level``, ``occupation``, ``marital_status``,
	            ``persona`` and ``user_id`` are optional.
	        entity_text: Text of the entity being evaluated.
	        system_prompt: System message to use; falls back to ``SYSTEM_PROMPT``
	            when empty or ``None``.

	    Returns:
	        On success, the JSON object returned by the model with an added
	        ``_evaluator`` key holding the persona's identifying fields.
	        On any failure, ``{"error": <message>, "_evaluator": {...}}``.
	        The function never raises for a dict ``evaluator``; callers detect
	        failure through the ``"error"`` key.
	    """
	    # Identifying fields echoed back with every record, success or failure,
	    # so each row of the output can be traced to its evaluator.
	    evaluator_meta = {"name": evaluator.get("name", "?")}
	    for field in ("user_id", "age", "city", "state",
	                  "education_level", "occupation", "marital_status"):
	        evaluator_meta[field] = evaluator.get(field)

	    try:
	        # Looked up inside the try block so that a persona without a name
	        # becomes an error record instead of an exception escaping the
	        # worker thread and aborting the whole run.
	        name = evaluator["name"]
	        prompt = EVAL_PROMPT.format(
	            name=name,
	            age=evaluator.get("age", ""),
	            city=evaluator.get("city", ""),
	            state=evaluator.get("state", ""),
	            education_level=evaluator.get("education_level", ""),
	            occupation=evaluator.get("occupation", ""),
	            marital_status=evaluator.get("marital_status", ""),
	            persona=evaluator.get("persona", ""),
	            entity=entity_text,
	        )
	        resp = client.chat.completions.create(
	            model=model,
	            messages=[
	                {"role": "system", "content": system_prompt or SYSTEM_PROMPT},
	                {"role": "user", "content": prompt},
	            ],
	            response_format={"type": "json_object"},
	            max_tokens=16384,
	            temperature=0.7,
	        )
	        choice = resp.choices[0]
	        content = choice.message.content
	        if not content:
	            return {
	                "error": f"Empty (finish_reason={choice.finish_reason})",
	                "_evaluator": evaluator_meta,
	            }
	        # Drop any <think>...</think> spans so only the JSON payload is parsed.
	        content = re.sub(r'<think>[\s\S]*?</think>', '', content).strip()
	        result = json.loads(content)
	        if not isinstance(result, dict):
	            # Valid JSON that is not an object (list, number, ...) cannot
	            # carry the expected fields; report it explicitly.
	            raise ValueError(
	                f"Expected a JSON object, got {type(result).__name__}"
	            )
	        result["_evaluator"] = evaluator_meta
	        return result
	    except Exception as e:
	        # Deliberate best-effort boundary: one failed evaluator must not
	        # stop the rest of the cohort.
	        return {"error": str(e), "_evaluator": evaluator_meta}


	def _numeric_score(record):
	    """Return the record's ``score`` as a finite number, or ``None`` if unusable."""
	    import math

	    if not isinstance(record, dict):
	        return None
	    raw = record.get("score")
	    if isinstance(raw, bool):
	        # bool is a subclass of int; True/False is not a meaningful score.
	        return None
	    if isinstance(raw, str):
	        try:
	            raw = float(raw)
	        except ValueError:
	            return None
	    if isinstance(raw, (int, float)) and math.isfinite(raw):
	        return raw
	    return None


	def _listed(record, key):
	    """Return ``record[key]`` as a list of strings, tolerating malformed values."""
	    value = record.get(key) or []
	    if isinstance(value, str):
	        # A bare string would otherwise be iterated character by character.
	        return [value]
	    if not isinstance(value, (list, tuple)):
	        return []
	    # Non-string items are stringified so they stay hashable for Counter.
	    return [item if isinstance(item, str) else str(item) for item in value]


	def analyze(results):
	    """Build a plain-text/Markdown summary of a batch of evaluation results.

	    Args:
	        results: Iterable of result records as produced by ``evaluate_one``.
	            Records that are not dicts or have no usable numeric ``score``
	            (for example error records) are ignored.

	    Returns:
	        The report as one newline-joined string, or ``"No valid results."``
	        when no record carries a usable score.

	    Segments: champions score >= 8, "not for them" score <= 3, and everything
	    in between is persuadable, so fractional scores such as 7.5 are counted
	    instead of falling between the integer buckets.
	    """
	    scored = []
	    for record in results:
	        score = _numeric_score(record)
	        if score is not None:
	            scored.append((score, record))
	    if not scored:
	        return "No valid results."

	    n = len(scored)
	    total = sum(score for score, _ in scored)

	    # NPS-style segmentation
	    champions = [r for score, r in scored if score >= 8]
	    not_for_them = [r for score, r in scored if score <= 3]
	    persuadable = [r for score, r in scored if 3 < score < 8]

	    lines = [f"## Summary ({n} evaluated)\n"]
	    lines.append(f"Average score: {total/n:.1f}/10\n")
	    lines.append(f" Champions (8-10): {len(champions):>3} ({100*len(champions)//n}%) — already sold")
	    lines.append(f" Persuadable (4-7): {len(persuadable):>3} ({100*len(persuadable)//n}%) — where to focus")
	    lines.append(f" Not for them (1-3): {len(not_for_them):>3} ({100*len(not_for_them)//n}%) — not your audience")

	    # What's working (from champions + high persuadable)
	    strong = [r for score, r in scored if score >= 6]
	    lines.append("\n### What's Working")
	    lines.append("What resonates with people who are already leaning yes:")
	    attractions = [a for r in strong for a in _listed(r, "attractions")]
	    for text, count in Counter(attractions).most_common(8):
	        lines.append(f" [{count}x] {text}")

	    # What's holding back the persuadable middle
	    lines.append("\n### What's Holding Back the Persuadable Middle")
	    lines.append("Concerns from evaluators who scored 4-7 — the ones you can move:")
	    mid_concerns = [c for r in persuadable for c in _listed(r, "concerns")]
	    for text, count in Counter(mid_concerns).most_common(8):
	        lines.append(f" [{count}x] {text}")

	    # Dealbreakers (across all)
	    dealbreakers = [d for _, r in scored for d in _listed(r, "dealbreakers")]
	    if dealbreakers:
	        lines.append("\n### Dealbreakers")
	        for text, count in Counter(dealbreakers).most_common(5):
	            lines.append(f" [{count}x] {text}")

	    # Highest score first; the sort is stable, so ties keep input order.
	    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)

	    def describe(score, record):
	        # Two report lines for one evaluator: identity, then score + summary.
	        e = record.get("_evaluator") or {}
	        lines.append(f" {e.get('name','?')}, {e.get('age','')}, {e.get('occupation','')}")
	        lines.append(f" {score}/10 — \"{record.get('summary','')}\"")

	    # Champions
	    lines.append("\n### Champions (top 5)")
	    lines.append("These people are your base — understand why they said yes:")
	    for score, record in ranked[:5]:
	        describe(score, record)

	    # Not for them — shown for context, not as a problem
	    if not_for_them:
	        lines.append(f"\n### Not For Them ({len(not_for_them)} evaluators)")
	        lines.append("These evaluators are not your target audience — their feedback is informational, not actionable:")
	        # Members of this segment sit at the tail of the ranking; show at
	        # most three and never spill over into higher-scoring evaluators.
	        shown = min(3, len(not_for_them))
	        for score, record in ranked[-shown:]:
	            describe(score, record)

	    return "\n".join(lines)


	def main():
	    """Command-line entry point: evaluate an entity against a cohort and save a report.

	    Reads the entity document and the cohort JSON, evaluates every cohort
	    member concurrently with ``evaluate_one``, prints per-evaluator progress,
	    and writes ``raw_results.json``, ``analysis.md`` and ``meta.json`` to
	    ``PROJECT_ROOT/results/<tag>`` (the tag defaults to a timestamp).

	    The client is configured from the ``LLM_API_KEY``, ``LLM_BASE_URL`` and
	    ``LLM_MODEL_NAME`` environment variables.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--entity", required=True, help="Path to entity document")
	    parser.add_argument("--cohort", default="data/cohort.json")
	    parser.add_argument("--tag", default=None)
	    parser.add_argument("--limit", type=int, default=None)
	    parser.add_argument("--parallel", type=int, default=5)
	    parser.add_argument("--bias-calibration", action="store_true",
	                        help="Add CoBRA-inspired bias calibration instructions (arXiv:2509.13588)")
	    args = parser.parse_args()

	    # All files are read and written as UTF-8 explicitly: results are dumped
	    # with ensure_ascii=False, so relying on the locale's default encoding
	    # can fail on non-UTF-8 systems.
	    entity_text = Path(args.entity).read_text(encoding="utf-8")

	    client = OpenAI(api_key=os.getenv("LLM_API_KEY"), base_url=os.getenv("LLM_BASE_URL"))
	    model = os.getenv("LLM_MODEL_NAME")

	    with open(args.cohort, encoding="utf-8") as f:
	        cohort = json.load(f)
	    if args.limit:
	        cohort = cohort[:args.limit]

	    sys_prompt = SYSTEM_PROMPT
	    if args.bias_calibration:
	        sys_prompt += BIAS_CALIBRATION_ADDENDUM
	        print("Bias calibration: ON (CoBRA-inspired, arXiv:2509.13588)")

	    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least one.
	    workers = max(1, args.parallel)
	    total = len(cohort)
	    print(f"Evaluating {total} evaluators | Model: {model} | Workers: {workers}")

	    results = [None] * total
	    icons = {"positive": "✅", "neutral": "🤔", "negative": "❌"}
	    t0 = time.time()

	    def worker(idx, ev):
	        return idx, evaluate_one(client, model, ev, entity_text, system_prompt=sys_prompt)

	    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
	        futs = {pool.submit(worker, i, e): i for i, e in enumerate(cohort)}
	        for done, fut in enumerate(concurrent.futures.as_completed(futs), 1):
	            idx = futs[fut]
	            try:
	                _, result = fut.result()
	            except Exception as exc:
	                # A worker that raised must not discard the evaluations that
	                # already finished; record the failure in its slot instead.
	                member = cohort[idx]
	                name = member.get("name", "?") if isinstance(member, dict) else "?"
	                result = {"error": str(exc), "_evaluator": {"name": name}}
	            results[idx] = result
	            ev = result.get("_evaluator", {})
	            score = result.get("score", "?")
	            action = result.get("action", "?")
	            # Guard against a non-string action, which may be unhashable.
	            icon = icons.get(action, "?") if isinstance(action, str) else "?"
	            if "error" in result:
	                print(f" [{done}/{total}] {ev.get('name','?')}: ERROR")
	            else:
	                print(f" [{done}/{total}] {ev.get('name','?')}: {icon} {action} ({score}/10)")

	    print(f"\nDone in {time.time()-t0:.1f}s")

	    # Save
	    tag = args.tag or datetime.now().strftime("%Y%m%d_%H%M%S")
	    out_dir = PROJECT_ROOT / "results" / tag
	    out_dir.mkdir(parents=True, exist_ok=True)

	    with open(out_dir / "raw_results.json", "w", encoding="utf-8") as f:
	        json.dump(results, f, ensure_ascii=False, indent=2)

	    analysis_text = analyze(results)
	    with open(out_dir / "analysis.md", "w", encoding="utf-8") as f:
	        f.write(f"# Evaluation: {tag}\n\n")
	        f.write(f"- Entity: {args.entity}\n")
	        f.write(f"- Cohort: {args.cohort} ({len(results)} evaluators)\n")
	        f.write(f"- Model: {model}\n")
	        f.write(f"- Date: {datetime.now().isoformat()}\n\n")
	        f.write(analysis_text)

	    meta = {
	        "tag": tag, "entity": args.entity, "cohort": args.cohort,
	        "model": model, "cohort_size": len(results),
	        "timestamp": datetime.now().isoformat(),
	    }
	    with open(out_dir / "meta.json", "w", encoding="utf-8") as f:
	        json.dump(meta, f, indent=2)

	    print(f"\nResults: {out_dir / 'raw_results.json'}")
	    print(f"Analysis: {out_dir / 'analysis.md'}")
	    print(f"\n{analysis_text}")


	# Script entry point: run the CLI only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	main()