"""f(θ, x) evaluator — scores an entity against an evaluator cohort.

The LLM inhabits each evaluator's persona and produces a structured assessment
of the entity. Domain-agnostic: the system prompt adapts to the entity type.

Usage:
    uv run python scripts/evaluate.py \
        --entity entities/my_product.md \
        --cohort data/cohort.json \
        --tag baseline \
        --parallel 5
"""

import argparse
import concurrent.futures
import json
import math
import os
import re
import time
from collections import Counter
from datetime import datetime
from pathlib import Path

try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is optional: without it, configuration comes straight from
    # the process environment.
    load_dotenv = None

PROJECT_ROOT = Path(__file__).resolve().parent.parent
if load_dotenv is not None:
    load_dotenv(PROJECT_ROOT / ".env")

# NOTE: the OpenAI client is imported inside main(), after the environment is
# loaded, so the pure helpers in this module (e.g. analyze) can be imported
# without the API client installed.

SYSTEM_PROMPT = """You are an evaluation simulator. You will be given:
1. A detailed persona — a person with specific values, needs, context, and perspective
2. An entity to evaluate (a product, profile, proposal, pitch, resume, etc.)

Your job: fully inhabit this persona's perspective and evaluate the entity AS THEY WOULD.

Be honest and realistic. Not everything is a match. Consider:
- Their specific needs, budget, constraints, and priorities
- Whether this entity solves a real problem for them
- Trust signals and red flags from their perspective
- Practical fit with their situation
- What they'd compare this against

You MUST respond with valid JSON only."""

# Optional bias-aware addendum, appended to SYSTEM_PROMPT when --bias-calibration is used.
# Inspired by CoBRA (Liu et al., CHI'26, arXiv:2509.13588).
BIAS_CALIBRATION_ADDENDUM = """

Important evaluation guidelines for realistic assessment:
- Evaluate the SUBSTANCE of the entity, not its rhetorical framing. A gain-framed description ("save 30%") and a loss-framed description ("stop wasting 30%") should receive similar scores if the underlying value is the same.
- Weight authority signals (certifications, press mentions, investor logos) proportionally to how much this persona's real-world counterpart would actually verify and value them.
- The ORDER in which information appears should not affect your score. Evaluate the complete picture, not just first impressions.
- Real people have genuine cognitive biases — you should too. But calibrate to realistic human levels, not LLM defaults. A credential matters, but it's not everything."""

# The "action" vocabulary below must stay in sync with _ACTION_ICONS.
EVAL_PROMPT = """## Evaluator Persona
Name: {name}
Age: {age}
Location: {city}, {state}
Education: {education_level}
Occupation: {occupation}
Status: {marital_status}

{persona}

---

## Entity to Evaluate

{entity}

---

## Task

Inhabit {name}'s perspective completely. Evaluate this entity as they would.

Return JSON:
{{
    "score": <1-10, where 1=strong reject, 5=ambivalent, 10=enthusiastic yes>,
    "action": "<positive|neutral|negative>",
    "attractions": [""],
    "concerns": [""],
    "dealbreakers": [""],
    "summary": "<1-2 sentences — how they'd describe this to a peer>",
    "reasoning": "<2-3 sentence internal monologue>"
}}"""

# A <think>...</think> preamble in the reply would break json.loads, so it is
# removed first. (The previous pattern had no delimiters and matched only the
# empty string, i.e. it removed nothing.)
_THINK_BLOCK_RE = re.compile(r"<think>[\s\S]*?</think>")

# Progress icons keyed by the "action" value requested in EVAL_PROMPT.
_ACTION_ICONS = {"positive": "✅", "neutral": "🤔", "negative": "❌"}


def _parse_reply(content):
    """Decode the model's reply as JSON after removing any ``<think>`` block.

    Raises:
        json.JSONDecodeError: if no JSON value can be decoded from the reply.
    """
    text = _THINK_BLOCK_RE.sub("", content).strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fallback for replies that wrap the object in prose or a code fence:
        # retry on the outermost {...} span, if there is one.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(text[start:end + 1])


def evaluate_one(client, model, evaluator, entity_text, system_prompt=None):
    """Score the entity from one evaluator's point of view.

    Failures are reported in the return value rather than raised, so a bad
    cohort record or an API error cannot abort a parallel run.

    Args:
        client: Object exposing ``chat.completions.create`` (an OpenAI client).
        model: Model name passed through to the API call.
        evaluator: Cohort record. ``name`` is required; the remaining persona
            fields are optional and default to an empty string in the prompt.
        entity_text: Full text of the entity being evaluated.
        system_prompt: Replaces ``SYSTEM_PROMPT`` when given (and non-empty).

    Returns:
        On success, the parsed JSON object with an added ``_evaluator`` key
        holding the evaluator's identifying fields. On failure,
        ``{"error": <message>, "_evaluator": {"name": <name or "?">}}``.
    """
    try:
        name = evaluator["name"]
    except (KeyError, TypeError):
        return {
            "error": "Evaluator record has no 'name' field",
            "_evaluator": {"name": "?"},
        }

    try:
        prompt = EVAL_PROMPT.format(
            name=name,
            age=evaluator.get("age", ""),
            city=evaluator.get("city", ""),
            state=evaluator.get("state", ""),
            education_level=evaluator.get("education_level", ""),
            occupation=evaluator.get("occupation", ""),
            marital_status=evaluator.get("marital_status", ""),
            persona=evaluator.get("persona", ""),
            entity=entity_text,
        )
        resp = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt or SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
            max_tokens=16384,
            temperature=0.7,
        )
        choice = resp.choices[0]
        content = choice.message.content
        if not content:
            return {
                "error": f"Empty (finish_reason={choice.finish_reason})",
                "_evaluator": {"name": name},
            }

        result = _parse_reply(content)
        if not isinstance(result, dict):
            return {
                "error": f"Expected a JSON object, got {type(result).__name__}",
                "_evaluator": {"name": name},
            }

        result["_evaluator"] = {
            "name": name,
            "user_id": evaluator.get("user_id"),
            "age": evaluator.get("age"),
            "city": evaluator.get("city"),
            "state": evaluator.get("state"),
            "education_level": evaluator.get("education_level"),
            "occupation": evaluator.get("occupation"),
            "marital_status": evaluator.get("marital_status"),
        }
        return result
    except Exception as e:
        # Deliberately broad: this runs in a worker thread and any failure
        # (network, API, malformed reply) is recorded per evaluator.
        return {"error": str(e), "_evaluator": {"name": name}}


def _numeric_score(result):
    """Return ``result["score"]`` as a finite float, or None if unusable.

    Accepts ints, floats and numeric strings (e.g. ``"8"``); anything else,
    including booleans and non-dict results, yields None.
    """
    if not isinstance(result, dict):
        return None
    raw = result.get("score")
    if isinstance(raw, bool):
        return None
    if isinstance(raw, str):
        try:
            raw = float(raw.strip())
        except ValueError:
            return None
    if isinstance(raw, (int, float)) and math.isfinite(raw):
        return float(raw)
    return None


def _text_items(result, key):
    """Return the non-empty strings stored under ``key`` (a list or a single string)."""
    value = result.get(key)
    if isinstance(value, str):
        value = [value]
    elif not isinstance(value, (list, tuple)):
        return []
    return [item.strip() for item in value if isinstance(item, str) and item.strip()]


def _evaluator_lines(result):
    """Return the two report lines (identity, then score and summary) for one result."""
    e = result.get("_evaluator") or {}
    return [
        f"  {e.get('name', '?')}, {e.get('age', '')}, {e.get('occupation', '')}",
        f"    {result['score']}/10 — \"{result.get('summary', '')}\"",
    ]


def analyze(results):
    """Render a Markdown summary of a cohort's evaluation results.

    Results without a usable numeric ``score`` (errors, malformed replies) are
    ignored. Scores are segmented NPS-style: champions (>= 8), persuadable
    (>= 4 and < 8) and not-for-them (< 4), so fractional scores always land in
    exactly one segment.

    Args:
        results: Iterable of dicts as returned by ``evaluate_one``.

    Returns:
        The report text, or ``"No valid results."`` when nothing is scorable.
    """
    scored = []
    for r in results:
        s = _numeric_score(r)
        if s is not None:
            scored.append((s, r))
    if not scored:
        return "No valid results."

    n = len(scored)
    total = sum(s for s, _ in scored)

    # NPS-style segmentation
    champions = [r for s, r in scored if s >= 8]
    persuadable = [r for s, r in scored if 4 <= s < 8]
    not_for_them = [r for s, r in scored if s < 4]

    lines = [f"## Summary ({n} evaluated)\n"]
    lines.append(f"Average score: {total / n:.1f}/10\n")
    lines.append(f"  Champions (8-10):    {len(champions):>3} ({100 * len(champions) // n}%) — already sold")
    lines.append(f"  Persuadable (4-7):   {len(persuadable):>3} ({100 * len(persuadable) // n}%) — where to focus")
    lines.append(f"  Not for them (1-3):  {len(not_for_them):>3} ({100 * len(not_for_them) // n}%) — not your audience")

    # What's working (from champions + high persuadable)
    strong = [r for s, r in scored if s >= 6]
    lines.append("\n### What's Working")
    lines.append("*What resonates with people who are already leaning yes:*")
    attractions = [a for r in strong for a in _text_items(r, "attractions")]
    for a, cnt in Counter(attractions).most_common(8):
        lines.append(f"  [{cnt}x] {a}")

    # What's holding back the persuadable middle
    lines.append("\n### What's Holding Back the Persuadable Middle")
    lines.append("*Concerns from evaluators who scored 4-7 — the ones you can move:*")
    mid_concerns = [c for r in persuadable for c in _text_items(r, "concerns")]
    for c, cnt in Counter(mid_concerns).most_common(8):
        lines.append(f"  [{cnt}x] {c}")

    # Dealbreakers (across all)
    dealbreakers = [d for _, r in scored for d in _text_items(r, "dealbreakers")]
    if dealbreakers:
        lines.append("\n### Dealbreakers")
        for d, cnt in Counter(dealbreakers).most_common(5):
            lines.append(f"  [{cnt}x] {d}")

    # Champions
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
    lines.append("\n### Champions (top 5)")
    lines.append("*These people are your base — understand why they said yes:*")
    for _, r in ranked[:5]:
        lines.extend(_evaluator_lines(r))

    # Not for them — shown for context, not as a problem
    if not_for_them:
        lines.append(f"\n### Not For Them ({len(not_for_them)} evaluators)")
        lines.append("*These evaluators are not your target audience — their feedback is informational, not actionable:*")
        # Only evaluators that are actually in the low segment are listed
        # (the three lowest), so the entries always agree with the header.
        lowest = [r for s, r in ranked if s < 4][-3:]
        for r in lowest:
            lines.extend(_evaluator_lines(r))

    return "\n".join(lines)


def main():
    """CLI entry point: evaluate an entity against a cohort and save the results.

    Reads ``LLM_API_KEY``, ``LLM_BASE_URL`` and ``LLM_MODEL_NAME`` from the
    environment, runs ``evaluate_one`` for every cohort record on a thread
    pool, and writes ``raw_results.json``, ``analysis.md`` and ``meta.json``
    to ``PROJECT_ROOT/results/<tag>/``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--entity", required=True, help="Path to entity document")
    parser.add_argument("--cohort", default="data/cohort.json")
    parser.add_argument("--tag", default=None)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--parallel", type=int, default=5)
    parser.add_argument("--bias-calibration", action="store_true",
                        help="Add CoBRA-inspired bias calibration instructions (arXiv:2509.13588)")
    args = parser.parse_args()

    # Explicit UTF-8 everywhere: the prompts, report and raw results contain
    # non-ASCII text, which the platform default encoding may not handle.
    entity_text = Path(args.entity).read_text(encoding="utf-8")

    from openai import OpenAI  # imported here so the module itself has no hard dependency on it

    client = OpenAI(api_key=os.getenv("LLM_API_KEY"), base_url=os.getenv("LLM_BASE_URL"))
    model = os.getenv("LLM_MODEL_NAME")

    with open(args.cohort, encoding="utf-8") as f:
        cohort = json.load(f)
    if not isinstance(cohort, list):
        parser.error(f"{args.cohort}: expected a JSON array of evaluator records")
    if args.limit:
        cohort = cohort[:args.limit]

    sys_prompt = SYSTEM_PROMPT
    if args.bias_calibration:
        sys_prompt += BIAS_CALIBRATION_ADDENDUM
        print("Bias calibration: ON (CoBRA-inspired, arXiv:2509.13588)")

    total = len(cohort)
    print(f"Evaluating {total} evaluators | Model: {model} | Workers: {args.parallel}")

    results = [None] * total  # filled by index so output order matches the cohort
    t0 = time.time()

    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least one.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, args.parallel)) as pool:
        pending = {
            pool.submit(evaluate_one, client, model, ev, entity_text, system_prompt=sys_prompt): i
            for i, ev in enumerate(cohort)
        }
        for done, fut in enumerate(concurrent.futures.as_completed(pending), start=1):
            try:
                result = fut.result()
            except Exception as exc:
                # Last line of defence: keep the run (and the results gathered
                # so far) alive even if a worker raises unexpectedly.
                result = {"error": str(exc), "_evaluator": {"name": "?"}}
            results[pending[fut]] = result

            ev = result.get("_evaluator", {})
            if "error" in result:
                print(f"  [{done}/{total}] {ev.get('name', '?')}: ERROR")
                continue
            score = result.get("score", "?")
            action = result.get("action", "?")
            icon = _ACTION_ICONS.get(action, "?") if isinstance(action, str) else "?"
            print(f"  [{done}/{total}] {ev.get('name', '?')}: {icon} {action} ({score}/10)")

    print(f"\nDone in {time.time() - t0:.1f}s")

    # Save
    now = datetime.now()
    tag = args.tag or now.strftime("%Y%m%d_%H%M%S")
    out_dir = PROJECT_ROOT / "results" / tag
    out_dir.mkdir(parents=True, exist_ok=True)

    with open(out_dir / "raw_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    analysis_text = analyze(results)
    with open(out_dir / "analysis.md", "w", encoding="utf-8") as f:
        f.write(f"# Evaluation: {tag}\n\n")
        f.write(f"- **Entity**: {args.entity}\n")
        f.write(f"- **Cohort**: {args.cohort} ({len(results)} evaluators)\n")
        f.write(f"- **Model**: {model}\n")
        f.write(f"- **Date**: {now.isoformat()}\n\n")
        f.write(analysis_text)

    meta = {
        "tag": tag,
        "entity": args.entity,
        "cohort": args.cohort,
        "model": model,
        "cohort_size": len(results),
        "timestamp": now.isoformat(),
    }
    with open(out_dir / "meta.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    print(f"\nResults: {out_dir / 'raw_results.json'}")
    print(f"Analysis: {out_dir / 'analysis.md'}")
    print(f"\n{analysis_text}")


if __name__ == "__main__":
    main()