sgo / scripts /evaluate.py
Eric Xu
Fix XSS, VJP contract, binding, session security, name collisions, and error handling
ff413e9 unverified
"""
f(θ, x) evaluator — scores an entity against an evaluator cohort.
The LLM inhabits each evaluator's persona and produces a structured assessment
of the entity. Domain-agnostic: the system prompt adapts to the entity type.
Usage:
uv run python scripts/evaluate.py \
--entity entities/my_product.md \
--cohort data/cohort.json \
--tag baseline \
--parallel 5
"""
import json
import os
import re
import time
import argparse
import concurrent.futures
from collections import Counter
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
PROJECT_ROOT = Path(__file__).resolve().parent.parent
load_dotenv(PROJECT_ROOT / ".env")
from openai import OpenAI
# Default system message for every evaluation call. evaluate_one() falls back to
# this text when it is not given a system_prompt; main() may append
# BIAS_CALIBRATION_ADDENDUM to it when --bias-calibration is set.
SYSTEM_PROMPT = """You are an evaluation simulator. You will be given:
1. A detailed persona — a person with specific values, needs, context, and perspective
2. An entity to evaluate (a product, profile, proposal, pitch, resume, etc.)
Your job: fully inhabit this persona's perspective and evaluate the entity AS THEY WOULD.
Be honest and realistic. Not everything is a match. Consider:
- Their specific needs, budget, constraints, and priorities
- Whether this entity solves a real problem for them
- Trust signals and red flags from their perspective
- Practical fit with their situation
- What they'd compare this against
You MUST respond with valid JSON only."""
# Optional bias-aware addendum, appended to SYSTEM_PROMPT when --bias-calibration is used.
# Inspired by CoBRA (Liu et al., CHI'26, arXiv:2509.13588).
# The literal opens with a newline so it can be concatenated directly onto the
# end of SYSTEM_PROMPT without running the two texts together.
BIAS_CALIBRATION_ADDENDUM = """
Important evaluation guidelines for realistic assessment:
- Evaluate the SUBSTANCE of the entity, not its rhetorical framing. A gain-framed
description ("save 30%") and a loss-framed description ("stop wasting 30%") should
receive similar scores if the underlying value is the same.
- Weight authority signals (certifications, press mentions, investor logos) proportionally
to how much this persona's real-world counterpart would actually verify and value them.
- The ORDER in which information appears should not affect your score. Evaluate the
complete picture, not just first impressions.
- Real people have genuine cognitive biases — you should too. But calibrate to realistic
human levels, not LLM defaults. A credential matters, but it's not everything."""
# User-message template, rendered with str.format() in evaluate_one().
# Single-brace fields ({name}, {age}, {persona}, {entity}, ...) are substituted
# from the evaluator record and the entity text. The doubled braces around the
# JSON skeleton are escapes, so the rendered prompt shows literal { and }.
EVAL_PROMPT = """## Evaluator Persona
Name: {name}
Age: {age}
Location: {city}, {state}
Education: {education_level}
Occupation: {occupation}
Status: {marital_status}
{persona}
---
## Entity to Evaluate
{entity}
---
## Task
Inhabit {name}'s perspective completely. Evaluate this entity as they would.
Return JSON:
{{
"score": <1-10, where 1=strong reject, 5=ambivalent, 10=enthusiastic yes>,
"action": "<positive | neutral | negative>",
"attractions": ["<what works for them, max 3>"],
"concerns": ["<what gives them pause, max 3>"],
"dealbreakers": ["<hard no's if any, empty list if none>"],
"summary": "<1-2 sentences — how they'd describe this to a peer>",
"reasoning": "<2-3 sentence internal monologue>"
}}"""
def evaluate_one(client, model, evaluator, entity_text, system_prompt=None):
    """Ask the model to evaluate one entity from a single persona's perspective.

    Args:
        client: OpenAI-compatible client; only ``client.chat.completions.create``
            is called.
        model: Model name forwarded to the completion call.
        evaluator: Persona record (a dict). ``name`` is required for a
            successful evaluation; the other demographic fields and ``persona``
            default to empty strings in the prompt.
        entity_text: Text of the entity being evaluated.
        system_prompt: Optional system message; falls back to ``SYSTEM_PROMPT``.

    Returns:
        dict: On success, the JSON object returned by the model with an added
        ``_evaluator`` key holding the persona's identifying fields. On failure
        (missing name, API error, empty or unparsable reply), a dict with an
        ``error`` message and an ``_evaluator`` key carrying the persona name,
        so callers can always attribute the outcome to a cohort member.
    """
    # Every failure path reports the same identity stub so no error is anonymous.
    identity = {"name": evaluator.get("name", "?")}
    try:
        # Built inside the try block: a record without "name" becomes an error
        # result instead of an exception escaping into the worker thread.
        prompt = EVAL_PROMPT.format(
            name=evaluator["name"],
            age=evaluator.get("age", ""),
            city=evaluator.get("city", ""),
            state=evaluator.get("state", ""),
            education_level=evaluator.get("education_level", ""),
            occupation=evaluator.get("occupation", ""),
            marital_status=evaluator.get("marital_status", ""),
            persona=evaluator.get("persona", ""),
            entity=entity_text,
        )
        resp = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt or SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
            max_tokens=16384,
            temperature=0.7,
        )
        choice = resp.choices[0]
        content = choice.message.content
        if not content:
            return {
                "error": f"Empty (finish_reason={choice.finish_reason})",
                "_evaluator": identity,
            }

        # Drop inline reasoning that some models emit before the JSON payload.
        content = re.sub(r'<think>[\s\S]*?</think>', '', content).strip()
        # Unwrap a Markdown code fence (``` or ```json) around the payload;
        # valid JSON can never start or end with backticks, so this is safe.
        content = re.sub(
            r'^```(?:json)?\s*|\s*```$', '', content, flags=re.IGNORECASE
        ).strip()

        result = json.loads(content)
        if not isinstance(result, dict):
            return {
                "error": f"Expected a JSON object, got {type(result).__name__}",
                "_evaluator": identity,
            }

        result["_evaluator"] = {
            "name": evaluator["name"],
            "user_id": evaluator.get("user_id"),
            "age": evaluator.get("age"),
            "city": evaluator.get("city"),
            "state": evaluator.get("state"),
            "education_level": evaluator.get("education_level"),
            "occupation": evaluator.get("occupation"),
            "marital_status": evaluator.get("marital_status"),
        }
        return result
    except Exception as e:
        # Deliberate best-effort boundary: one failed evaluator must not stop
        # the rest of the cohort. The exception type is included because some
        # messages (e.g. KeyError's "'name'") are meaningless on their own.
        return {"error": f"{type(e).__name__}: {e}", "_evaluator": identity}
def _coerce_score(value):
    """Return *value* as a finite int/float score, or None if it is not numeric."""
    if isinstance(value, bool):
        return None
    if isinstance(value, str):
        text = value.strip()
        value = None
        for cast in (int, float):
            try:
                value = cast(text)
                break
            except ValueError:
                continue
    if not isinstance(value, (int, float)):
        return None
    # Reject NaN / infinity: they break ordering and averaging.
    if value != value or value in (float("inf"), float("-inf")):
        return None
    return value


def _as_items(value):
    """Normalize a model-supplied list field to a list of non-empty strings."""
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, (list, tuple)):
        return []
    return [item for item in value if isinstance(item, str) and item.strip()]


def analyze(results):
    """Build a Markdown summary of a cohort's evaluation results.

    Args:
        results: Iterable of result dicts as produced by ``evaluate_one``.
            Entries without a usable numeric ``score`` (errors, malformed
            replies) are ignored. Numeric strings such as ``"7"`` are accepted.

    Returns:
        str: The Markdown report, or ``"No valid results."`` when no entry has
        a usable score. The input dicts are not modified.
    """
    valid = []
    for r in results:
        if not isinstance(r, dict) or "score" not in r:
            continue
        score = _coerce_score(r["score"])
        if score is not None:
            # Shallow copy so the normalized score never leaks into raw results.
            valid.append({**r, "score": score})
    if not valid:
        return "No valid results."

    scores = [r["score"] for r in valid]
    n = len(valid)

    # NPS-style segmentation. The middle band is "everything in between" so a
    # fractional score such as 7.5 or 3.5 is still counted in exactly one segment.
    champions = [r for r in valid if r["score"] >= 8]
    persuadable = [r for r in valid if 3 < r["score"] < 8]
    not_for_them = [r for r in valid if r["score"] <= 3]

    lines = [f"## Summary ({n} evaluated)\n"]
    lines.append(f"Average score: {sum(scores)/n:.1f}/10\n")
    lines.append(f" Champions (8-10): {len(champions):>3} ({100*len(champions)//n}%) — already sold")
    lines.append(f" Persuadable (4-7): {len(persuadable):>3} ({100*len(persuadable)//n}%) — where to focus")
    lines.append(f" Not for them (1-3): {len(not_for_them):>3} ({100*len(not_for_them)//n}%) — not your audience")

    # What's working (from champions + high persuadable)
    strong = [r for r in valid if r["score"] >= 6]
    lines.append("\n### What's Working")
    lines.append("*What resonates with people who are already leaning yes:*")
    all_a = [a for r in strong for a in _as_items(r.get("attractions"))]
    for a, c in Counter(all_a).most_common(8):
        lines.append(f" [{c}x] {a}")

    # What's holding back the persuadable middle
    lines.append("\n### What's Holding Back the Persuadable Middle")
    lines.append("*Concerns from evaluators who scored 4-7 — the ones you can move:*")
    mid_concerns = [c for r in persuadable for c in _as_items(r.get("concerns"))]
    for c, cnt in Counter(mid_concerns).most_common(8):
        lines.append(f" [{cnt}x] {c}")

    # Dealbreakers (across all)
    all_d = [d for r in valid for d in _as_items(r.get("dealbreakers"))]
    if all_d:
        lines.append("\n### Dealbreakers")
        for d, cnt in Counter(all_d).most_common(5):
            lines.append(f" [{cnt}x] {d}")

    # Top scorers
    sorted_v = sorted(valid, key=lambda r: r["score"], reverse=True)
    lines.append("\n### Champions (top 5)")
    lines.append("*These people are your base — understand why they said yes:*")
    for r in sorted_v[:5]:
        e = r.get("_evaluator") or {}
        lines.append(f" {e.get('name','?')}, {e.get('age','')}, {e.get('occupation','')}")
        lines.append(f" {r['score']}/10 — \"{r.get('summary','')}\"")

    # Not for them — shown for context, not as a problem
    if not_for_them:
        lines.append(f"\n### Not For Them ({len(not_for_them)} evaluators)")
        lines.append("*These evaluators are not your target audience — their feedback is informational, not actionable:*")
        # Only list evaluators that are actually in this segment; slicing the
        # full ranking would show higher scorers when the segment has < 3 members.
        lowest = [r for r in sorted_v if r["score"] <= 3][-3:]
        for r in lowest:
            e = r.get("_evaluator") or {}
            lines.append(f" {e.get('name','?')}, {e.get('age','')}, {e.get('occupation','')}")
            lines.append(f" {r['score']}/10 — \"{r.get('summary','')}\"")

    return "\n".join(lines)
def main():
    """CLI entry point: evaluate an entity against a cohort and save the results.

    Reads the entity document and the cohort JSON, runs ``evaluate_one`` for
    each cohort member on a thread pool, prints per-evaluator progress, and
    writes ``raw_results.json``, ``analysis.md`` and ``meta.json`` under
    ``results/<tag>/`` in the project root. The tag defaults to a timestamp.

    The LLM connection is configured through the ``LLM_API_KEY``,
    ``LLM_BASE_URL`` and ``LLM_MODEL_NAME`` environment variables.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--entity", required=True, help="Path to entity document")
    parser.add_argument("--cohort", default="data/cohort.json")
    parser.add_argument("--tag", default=None)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--parallel", type=int, default=5)
    parser.add_argument("--bias-calibration", action="store_true",
                        help="Add CoBRA-inspired bias calibration instructions (arXiv:2509.13588)")
    args = parser.parse_args()

    # All file I/O is pinned to UTF-8: results are written with
    # ensure_ascii=False, so the locale default encoding would fail on
    # non-ASCII personas on platforms where it is not UTF-8.
    entity_text = Path(args.entity).read_text(encoding="utf-8")
    client = OpenAI(api_key=os.getenv("LLM_API_KEY"), base_url=os.getenv("LLM_BASE_URL"))
    model = os.getenv("LLM_MODEL_NAME")
    with open(args.cohort, encoding="utf-8") as f:
        cohort = json.load(f)
    if args.limit:
        cohort = cohort[:args.limit]

    sys_prompt = SYSTEM_PROMPT
    if args.bias_calibration:
        sys_prompt += BIAS_CALIBRATION_ADDENDUM
        print("Bias calibration: ON (CoBRA-inspired, arXiv:2509.13588)")

    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least one.
    workers = max(1, args.parallel)
    total = len(cohort)
    print(f"Evaluating {total} evaluators | Model: {model} | Workers: {workers}")

    results = [None] * total
    icons = {"positive": "✅", "neutral": "🤔", "negative": "❌"}
    t0 = time.time()

    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        # Map each future to its cohort index so results keep cohort order
        # even though they complete out of order.
        futs = {
            pool.submit(evaluate_one, client, model, ev, entity_text, system_prompt=sys_prompt): i
            for i, ev in enumerate(cohort)
        }
        for done, fut in enumerate(concurrent.futures.as_completed(futs), 1):
            idx = futs[fut]
            record = cohort[idx]
            fallback_name = record.get("name", "?") if isinstance(record, dict) else "?"
            try:
                result = fut.result()
            except Exception as exc:
                # A crashed worker is recorded as a failed evaluation rather
                # than discarding everything completed so far.
                result = {
                    "error": f"{type(exc).__name__}: {exc}",
                    "_evaluator": {"name": fallback_name},
                }
            results[idx] = result

            name = result.get("_evaluator", {}).get("name") or fallback_name
            if "error" in result:
                print(f" [{done}/{total}] {name}: ERROR")
            else:
                score = result.get("score", "?")
                action = result.get("action", "?")
                icon = icons.get(action, "?")
                print(f" [{done}/{total}] {name}: {icon} {action} ({score}/10)")

    print(f"\nDone in {time.time()-t0:.1f}s")

    # Save
    tag = args.tag or datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = PROJECT_ROOT / "results" / tag
    out_dir.mkdir(parents=True, exist_ok=True)

    with open(out_dir / "raw_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    analysis_text = analyze(results)
    with open(out_dir / "analysis.md", "w", encoding="utf-8") as f:
        f.write(f"# Evaluation: {tag}\n\n")
        f.write(f"- **Entity**: {args.entity}\n")
        f.write(f"- **Cohort**: {args.cohort} ({len(results)} evaluators)\n")
        f.write(f"- **Model**: {model}\n")
        f.write(f"- **Date**: {datetime.now().isoformat()}\n\n")
        f.write(analysis_text)

    meta = {
        "tag": tag, "entity": args.entity, "cohort": args.cohort,
        "model": model, "cohort_size": len(results),
        "timestamp": datetime.now().isoformat(),
    }
    with open(out_dir / "meta.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    print(f"\nResults: {out_dir / 'raw_results.json'}")
    print(f"Analysis: {out_dir / 'analysis.md'}")
    print(f"\n{analysis_text}")


if __name__ == "__main__":
    main()