"""
f(θ, x) evaluator — scores an entity against an evaluator cohort.

The LLM inhabits each evaluator's persona and produces a structured assessment
of the entity. Domain-agnostic: the system prompt adapts to the entity type.

Usage:
    uv run python scripts/evaluate.py \
        --entity entities/my_product.md \
        --cohort data/cohort.json \
        --tag baseline \
        --parallel 5
"""
import json
import os
import re
import time
import argparse
import concurrent.futures
from collections import Counter
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
PROJECT_ROOT = Path(__file__).resolve().parent.parent
load_dotenv(PROJECT_ROOT / ".env")
from openai import OpenAI
# Base system prompt for every evaluation call. Kept entity-agnostic so the
# same cohort can score products, resumes, pitches, proposals, etc.
SYSTEM_PROMPT = """You are an evaluation simulator. You will be given:
1. A detailed persona — a person with specific values, needs, context, and perspective
2. An entity to evaluate (a product, profile, proposal, pitch, resume, etc.)
Your job: fully inhabit this persona's perspective and evaluate the entity AS THEY WOULD.
Be honest and realistic. Not everything is a match. Consider:
- Their specific needs, budget, constraints, and priorities
- Whether this entity solves a real problem for them
- Trust signals and red flags from their perspective
- Practical fit with their situation
- What they'd compare this against
You MUST respond with valid JSON only."""

# Optional bias-aware addendum, appended to SYSTEM_PROMPT when --bias-calibration is used.
# Inspired by CoBRA (Liu et al., CHI'26, arXiv:2509.13588).
BIAS_CALIBRATION_ADDENDUM = """
Important evaluation guidelines for realistic assessment:
- Evaluate the SUBSTANCE of the entity, not its rhetorical framing. A gain-framed
description ("save 30%") and a loss-framed description ("stop wasting 30%") should
receive similar scores if the underlying value is the same.
- Weight authority signals (certifications, press mentions, investor logos) proportionally
to how much this persona's real-world counterpart would actually verify and value them.
- The ORDER in which information appears should not affect your score. Evaluate the
complete picture, not just first impressions.
- Real people have genuine cognitive biases — you should too. But calibrate to realistic
human levels, not LLM defaults. A credential matters, but it's not everything."""

# Per-evaluation user prompt. Doubled braces ({{ }}) keep the JSON template
# literal through str.format.
EVAL_PROMPT = """## Evaluator Persona
Name: {name}
Age: {age}
Location: {city}, {state}
Education: {education_level}
Occupation: {occupation}
Status: {marital_status}
{persona}
---
## Entity to Evaluate
{entity}
---
## Task
Inhabit {name}'s perspective completely. Evaluate this entity as they would.
Return JSON:
{{
"score": <1-10, where 1=strong reject, 5=ambivalent, 10=enthusiastic yes>,
"action": "<positive | neutral | negative>",
"attractions": ["<what works for them, max 3>"],
"concerns": ["<what gives them pause, max 3>"],
"dealbreakers": ["<hard no's if any, empty list if none>"],
"summary": "<1-2 sentences — how they'd describe this to a peer>",
"reasoning": "<2-3 sentence internal monologue>"
}}"""


def evaluate_one(client, model, evaluator, entity_text, system_prompt=None):
    """Score the entity from one evaluator persona's perspective.

    Args:
        client: OpenAI-compatible client (``chat.completions.create``).
        model: Model name to query.
        evaluator: Persona record; ``"name"`` is required, all other fields
            are optional and default to empty strings in the prompt.
        entity_text: The entity document to evaluate.
        system_prompt: Optional override for SYSTEM_PROMPT (e.g. with the
            bias-calibration addendum appended).

    Returns:
        The model's parsed JSON assessment with an ``"_evaluator"`` metadata
        key added, or ``{"error": ..., "_evaluator": {...}}`` on any failure.
        Never raises — one bad persona or API hiccup must not abort the run.
    """
    fallback_meta = {"name": evaluator.get("name", "?")}
    try:
        # Prompt construction lives inside the try: a malformed record
        # (e.g. missing "name") now yields an error result instead of
        # raising KeyError in the worker thread and killing the whole run.
        prompt = EVAL_PROMPT.format(
            name=evaluator["name"],
            age=evaluator.get("age", ""),
            city=evaluator.get("city", ""),
            state=evaluator.get("state", ""),
            education_level=evaluator.get("education_level", ""),
            occupation=evaluator.get("occupation", ""),
            marital_status=evaluator.get("marital_status", ""),
            persona=evaluator.get("persona", ""),
            entity=entity_text,
        )
        resp = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt or SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
            max_tokens=16384,
            temperature=0.7,
        )
        content = resp.choices[0].message.content
        if not content:
            # Include evaluator metadata so downstream reporting can name
            # the failure — consistent with the except branch below.
            return {
                "error": f"Empty (finish_reason={resp.choices[0].finish_reason})",
                "_evaluator": fallback_meta,
            }
        # Some reasoning models emit <think>...</think> before the JSON body.
        content = re.sub(r'<think>[\s\S]*?</think>', '', content).strip()
        result = json.loads(content)
        result["_evaluator"] = {
            "name": evaluator["name"],
            "user_id": evaluator.get("user_id"),
            "age": evaluator.get("age"),
            "city": evaluator.get("city"),
            "state": evaluator.get("state"),
            "education_level": evaluator.get("education_level"),
            "occupation": evaluator.get("occupation"),
            "marital_status": evaluator.get("marital_status"),
        }
        return result
    except Exception as e:
        # Broad by design: this is the per-worker error boundary.
        return {"error": str(e), "_evaluator": fallback_meta}
def analyze(results):
    """Render raw evaluation results into a markdown summary report.

    Args:
        results: List of dicts from evaluate_one. Entries without a
            ``"score"`` key (i.e. error results) are ignored.

    Returns:
        Markdown report string, or ``"No valid results."`` when nothing
        was successfully scored.
    """
    valid = [r for r in results if "score" in r]
    if not valid:
        return "No valid results."
    scores = [r["score"] for r in valid]
    n = len(valid)
    # NPS-style segmentation
    champions = [r for r in valid if r["score"] >= 8]
    persuadable = [r for r in valid if 4 <= r["score"] <= 7]
    not_for_them = [r for r in valid if r["score"] <= 3]
    lines = [f"## Summary ({n} evaluated)\n"]
    lines.append(f"Average score: {sum(scores)/n:.1f}/10\n")
    lines.append(f" Champions (8-10): {len(champions):>3} ({100*len(champions)//n}%) — already sold")
    lines.append(f" Persuadable (4-7): {len(persuadable):>3} ({100*len(persuadable)//n}%) — where to focus")
    lines.append(f" Not for them (1-3): {len(not_for_them):>3} ({100*len(not_for_them)//n}%) — not your audience")
    # What's working (from champions + high persuadable, score >= 6)
    strong = [r for r in valid if r["score"] >= 6]
    lines.append("\n### What's Working")
    lines.append("*What resonates with people who are already leaning yes:*")
    all_a = [a for r in strong for a in r.get("attractions", [])]
    for a, c in Counter(all_a).most_common(8):
        lines.append(f" [{c}x] {a}")
    # Concerns from the movable middle (4-7)
    lines.append("\n### What's Holding Back the Persuadable Middle")
    lines.append("*Concerns from evaluators who scored 4-7 — the ones you can move:*")
    mid_concerns = [c for r in persuadable for c in r.get("concerns", [])]
    for c, cnt in Counter(mid_concerns).most_common(8):
        lines.append(f" [{cnt}x] {c}")
    # Dealbreakers (across all segments)
    all_d = [d for r in valid for d in r.get("dealbreakers", [])]
    if all_d:
        lines.append("\n### Dealbreakers")
        for d, cnt in Counter(all_d).most_common(5):
            lines.append(f" [{cnt}x] {d}")
    # Champions: the five highest scorers overall
    sorted_v = sorted(valid, key=lambda r: r["score"], reverse=True)
    lines.append("\n### Champions (top 5)")
    lines.append("*These people are your base — understand why they said yes:*")
    for r in sorted_v[:5]:
        e = r["_evaluator"]
        lines.append(f" {e['name']}, {e.get('age','')}, {e.get('occupation','')}")
        lines.append(f" {r['score']}/10 — \"{r.get('summary','')}\"")
    # Not for them — shown for context, not as a problem
    if not_for_them:
        lines.append(f"\n### Not For Them ({len(not_for_them)} evaluators)")
        lines.append("*These evaluators are not your target audience — their feedback is informational, not actionable:*")
        # Bug fix: sample from the 1-3 segment itself. The previous
        # sorted_v[-3:] took the bottom of the WHOLE cohort, which could
        # list 4-7 scorers under a header claiming the 1-3 segment.
        for r in sorted(not_for_them, key=lambda r: r["score"])[:3]:
            e = r["_evaluator"]
            lines.append(f" {e['name']}, {e.get('age','')}, {e.get('occupation','')}")
            lines.append(f" {r['score']}/10 — \"{r.get('summary','')}\"")
    return "\n".join(lines)
def main():
    """CLI entry point: evaluate one entity against a persona cohort.

    Reads the entity document and cohort JSON, fans evaluations out across a
    thread pool, then writes raw_results.json, analysis.md and meta.json
    under results/<tag>/.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--entity", required=True, help="Path to entity document")
    parser.add_argument("--cohort", default="data/cohort.json")
    parser.add_argument("--tag", default=None)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--parallel", type=int, default=5)
    parser.add_argument("--bias-calibration", action="store_true",
                        help="Add CoBRA-inspired bias calibration instructions (arXiv:2509.13588)")
    args = parser.parse_args()
    # Fix: all file I/O is explicitly UTF-8. Results contain em dashes and
    # emoji and are dumped with ensure_ascii=False, so the platform default
    # encoding (e.g. cp1252 on Windows) would raise or produce mojibake.
    entity_text = Path(args.entity).read_text(encoding="utf-8")
    client = OpenAI(api_key=os.getenv("LLM_API_KEY"), base_url=os.getenv("LLM_BASE_URL"))
    model = os.getenv("LLM_MODEL_NAME")
    with open(args.cohort, encoding="utf-8") as f:
        cohort = json.load(f)
    if args.limit:
        cohort = cohort[:args.limit]
    sys_prompt = SYSTEM_PROMPT
    if args.bias_calibration:
        sys_prompt += BIAS_CALIBRATION_ADDENDUM
        print("Bias calibration: ON (CoBRA-inspired, arXiv:2509.13588)")
    print(f"Evaluating {len(cohort)} evaluators | Model: {model} | Workers: {args.parallel}")
    results = [None] * len(cohort)  # pre-sized so output keeps cohort order
    done = [0]  # list so the progress counter is mutable from the loop
    t0 = time.time()

    def worker(idx, ev):
        # Returns the index too, so completions arriving out of order can
        # still be slotted back into cohort position.
        return idx, evaluate_one(client, model, ev, entity_text, system_prompt=sys_prompt)

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as pool:
        futs = {pool.submit(worker, i, e): i for i, e in enumerate(cohort)}
        for fut in concurrent.futures.as_completed(futs):
            idx, result = fut.result()
            results[idx] = result
            done[0] += 1
            ev = result.get("_evaluator", {})
            score = result.get("score", "?")
            action = result.get("action", "?")
            icon = {"positive": "✅", "neutral": "🤔", "negative": "❌"}.get(action, "?")
            if "error" in result:
                print(f" [{done[0]}/{len(cohort)}] {ev.get('name','?')}: ERROR")
            else:
                print(f" [{done[0]}/{len(cohort)}] {ev.get('name','?')}: {icon} {action} ({score}/10)")
    print(f"\nDone in {time.time()-t0:.1f}s")
    # Save artifacts under results/<tag>/ (tag defaults to a timestamp)
    tag = args.tag or datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = PROJECT_ROOT / "results" / tag
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "raw_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    analysis_text = analyze(results)
    with open(out_dir / "analysis.md", "w", encoding="utf-8") as f:
        f.write(f"# Evaluation: {tag}\n\n")
        f.write(f"- **Entity**: {args.entity}\n")
        f.write(f"- **Cohort**: {args.cohort} ({len(results)} evaluators)\n")
        f.write(f"- **Model**: {model}\n")
        f.write(f"- **Date**: {datetime.now().isoformat()}\n\n")
        f.write(analysis_text)
    meta = {
        "tag": tag, "entity": args.entity, "cohort": args.cohort,
        "model": model, "cohort_size": len(results),
        "timestamp": datetime.now().isoformat(),
    }
    with open(out_dir / "meta.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    print(f"\nResults: {out_dir / 'raw_results.json'}")
    print(f"Analysis: {out_dir / 'analysis.md'}")
    print(f"\n{analysis_text}")
# Script entry point — keeps import side effects limited to definitions.
if __name__ == "__main__":
    main()