"""
f(θ, x) evaluator — scores an entity against an evaluator cohort.

The LLM inhabits each evaluator's persona and produces a structured assessment
of the entity. Domain-agnostic: the system prompt adapts to the entity type.

Usage:
    uv run python scripts/evaluate.py \
        --entity entities/my_product.md \
        --cohort data/cohort.json \
        --tag baseline \
        --parallel 5
"""
import json
import os
import re
import time
import argparse
import concurrent.futures
from collections import Counter
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
PROJECT_ROOT = Path(__file__).resolve().parent.parent
load_dotenv(PROJECT_ROOT / ".env")
from openai import OpenAI
# Base system prompt for every evaluation call. Kept entity-agnostic so the
# same cohort can score products, resumes, pitches, proposals, etc.
SYSTEM_PROMPT = """You are an evaluation simulator. You will be given:
1. A detailed persona — a person with specific values, needs, context, and perspective
2. An entity to evaluate (a product, profile, proposal, pitch, resume, etc.)
Your job: fully inhabit this persona's perspective and evaluate the entity AS THEY WOULD.
Be honest and realistic. Not everything is a match. Consider:
- Their specific needs, budget, constraints, and priorities
- Whether this entity solves a real problem for them
- Trust signals and red flags from their perspective
- Practical fit with their situation
- What they'd compare this against
You MUST respond with valid JSON only."""

# Optional bias-aware addendum, appended to SYSTEM_PROMPT when --bias-calibration is used.
# Inspired by CoBRA (Liu et al., CHI'26, arXiv:2509.13588).
BIAS_CALIBRATION_ADDENDUM = """
Important evaluation guidelines for realistic assessment:
- Evaluate the SUBSTANCE of the entity, not its rhetorical framing. A gain-framed
description ("save 30%") and a loss-framed description ("stop wasting 30%") should
receive similar scores if the underlying value is the same.
- Weight authority signals (certifications, press mentions, investor logos) proportionally
to how much this persona's real-world counterpart would actually verify and value them.
- The ORDER in which information appears should not affect your score. Evaluate the
complete picture, not just first impressions.
- Real people have genuine cognitive biases — you should too. But calibrate to realistic
human levels, not LLM defaults. A credential matters, but it's not everything."""

# Per-evaluation user prompt. Doubled braces ({{ }}) keep the JSON template
# literal through str.format.
EVAL_PROMPT = """## Evaluator Persona
Name: {name}
Age: {age}
Location: {city}, {state}
Education: {education_level}
Occupation: {occupation}
Status: {marital_status}
{persona}
---
## Entity to Evaluate
{entity}
---
## Task
Inhabit {name}'s perspective completely. Evaluate this entity as they would.
Return JSON:
{{
"score": <1-10, where 1=strong reject, 5=ambivalent, 10=enthusiastic yes>,
"action": "<positive | neutral | negative>",
"attractions": ["<what works for them, max 3>"],
"concerns": ["<what gives them pause, max 3>"],
"dealbreakers": ["<hard no's if any, empty list if none>"],
"summary": "<1-2 sentences — how they'd describe this to a peer>",
"reasoning": "<2-3 sentence internal monologue>"
}}"""


def evaluate_one(client, model, evaluator, entity_text, system_prompt=None):
    """Score the entity from one evaluator persona's perspective.

    Args:
        client: OpenAI-compatible client (``chat.completions.create``).
        model: Model name to query.
        evaluator: Persona record; ``"name"`` is required, all other fields
            are optional and default to empty strings in the prompt.
        entity_text: The entity document to evaluate.
        system_prompt: Optional override for SYSTEM_PROMPT (e.g. with the
            bias-calibration addendum appended).

    Returns:
        The model's parsed JSON assessment with an ``"_evaluator"`` metadata
        key added, or ``{"error": ..., "_evaluator": {...}}`` on any failure.
        Never raises — one bad persona or API hiccup must not abort the run.
    """
    fallback_meta = {"name": evaluator.get("name", "?")}
    try:
        # Prompt construction lives inside the try: a malformed record
        # (e.g. missing "name") now yields an error result instead of
        # raising KeyError in the worker thread and killing the whole run.
        prompt = EVAL_PROMPT.format(
            name=evaluator["name"],
            age=evaluator.get("age", ""),
            city=evaluator.get("city", ""),
            state=evaluator.get("state", ""),
            education_level=evaluator.get("education_level", ""),
            occupation=evaluator.get("occupation", ""),
            marital_status=evaluator.get("marital_status", ""),
            persona=evaluator.get("persona", ""),
            entity=entity_text,
        )
        resp = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt or SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
            max_tokens=16384,
            temperature=0.7,
        )
        content = resp.choices[0].message.content
        if not content:
            # Include evaluator metadata so downstream reporting can name
            # the failure — consistent with the except branch below.
            return {
                "error": f"Empty (finish_reason={resp.choices[0].finish_reason})",
                "_evaluator": fallback_meta,
            }
        # Some reasoning models emit <think>...</think> before the JSON body.
        content = re.sub(r'<think>[\s\S]*?</think>', '', content).strip()
        result = json.loads(content)
        result["_evaluator"] = {
            "name": evaluator["name"],
            "user_id": evaluator.get("user_id"),
            "age": evaluator.get("age"),
            "city": evaluator.get("city"),
            "state": evaluator.get("state"),
            "education_level": evaluator.get("education_level"),
            "occupation": evaluator.get("occupation"),
            "marital_status": evaluator.get("marital_status"),
        }
        return result
    except Exception as e:
        # Broad by design: this is the per-worker error boundary.
        return {"error": str(e), "_evaluator": fallback_meta}
def analyze(results):
    """Render raw evaluation results into a markdown summary report.

    Args:
        results: List of dicts from evaluate_one. Entries without a
            ``"score"`` key (i.e. error results) are ignored.

    Returns:
        Markdown report string, or ``"No valid results."`` when nothing
        was successfully scored.
    """
    valid = [r for r in results if "score" in r]
    if not valid:
        return "No valid results."
    scores = [r["score"] for r in valid]
    n = len(valid)
    # NPS-style segmentation
    champions = [r for r in valid if r["score"] >= 8]
    persuadable = [r for r in valid if 4 <= r["score"] <= 7]
    not_for_them = [r for r in valid if r["score"] <= 3]
    lines = [f"## Summary ({n} evaluated)\n"]
    lines.append(f"Average score: {sum(scores)/n:.1f}/10\n")
    lines.append(f" Champions (8-10): {len(champions):>3} ({100*len(champions)//n}%) — already sold")
    lines.append(f" Persuadable (4-7): {len(persuadable):>3} ({100*len(persuadable)//n}%) — where to focus")
    lines.append(f" Not for them (1-3): {len(not_for_them):>3} ({100*len(not_for_them)//n}%) — not your audience")
    # What's working (from champions + high persuadable, score >= 6)
    strong = [r for r in valid if r["score"] >= 6]
    lines.append("\n### What's Working")
    lines.append("*What resonates with people who are already leaning yes:*")
    all_a = [a for r in strong for a in r.get("attractions", [])]
    for a, c in Counter(all_a).most_common(8):
        lines.append(f" [{c}x] {a}")
    # Concerns from the movable middle (4-7)
    lines.append("\n### What's Holding Back the Persuadable Middle")
    lines.append("*Concerns from evaluators who scored 4-7 — the ones you can move:*")
    mid_concerns = [c for r in persuadable for c in r.get("concerns", [])]
    for c, cnt in Counter(mid_concerns).most_common(8):
        lines.append(f" [{cnt}x] {c}")
    # Dealbreakers (across all segments)
    all_d = [d for r in valid for d in r.get("dealbreakers", [])]
    if all_d:
        lines.append("\n### Dealbreakers")
        for d, cnt in Counter(all_d).most_common(5):
            lines.append(f" [{cnt}x] {d}")
    # Champions: the five highest scorers overall
    sorted_v = sorted(valid, key=lambda r: r["score"], reverse=True)
    lines.append("\n### Champions (top 5)")
    lines.append("*These people are your base — understand why they said yes:*")
    for r in sorted_v[:5]:
        e = r["_evaluator"]
        lines.append(f" {e['name']}, {e.get('age','')}, {e.get('occupation','')}")
        lines.append(f" {r['score']}/10 — \"{r.get('summary','')}\"")
    # Not for them — shown for context, not as a problem
    if not_for_them:
        lines.append(f"\n### Not For Them ({len(not_for_them)} evaluators)")
        lines.append("*These evaluators are not your target audience — their feedback is informational, not actionable:*")
        # Bug fix: sample from the 1-3 segment itself. The previous
        # sorted_v[-3:] took the bottom of the WHOLE cohort, which could
        # list 4-7 scorers under a header claiming the 1-3 segment.
        for r in sorted(not_for_them, key=lambda r: r["score"])[:3]:
            e = r["_evaluator"]
            lines.append(f" {e['name']}, {e.get('age','')}, {e.get('occupation','')}")
            lines.append(f" {r['score']}/10 — \"{r.get('summary','')}\"")
    return "\n".join(lines)
def main():
    """CLI entry point: evaluate one entity against a persona cohort.

    Reads the entity document and cohort JSON, fans evaluations out across a
    thread pool, then writes raw_results.json, analysis.md and meta.json
    under results/<tag>/.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--entity", required=True, help="Path to entity document")
    parser.add_argument("--cohort", default="data/cohort.json")
    parser.add_argument("--tag", default=None)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--parallel", type=int, default=5)
    parser.add_argument("--bias-calibration", action="store_true",
                        help="Add CoBRA-inspired bias calibration instructions (arXiv:2509.13588)")
    args = parser.parse_args()
    # Fix: all file I/O is explicitly UTF-8. Results contain em dashes and
    # emoji and are dumped with ensure_ascii=False, so the platform default
    # encoding (e.g. cp1252 on Windows) would raise or produce mojibake.
    entity_text = Path(args.entity).read_text(encoding="utf-8")
    client = OpenAI(api_key=os.getenv("LLM_API_KEY"), base_url=os.getenv("LLM_BASE_URL"))
    model = os.getenv("LLM_MODEL_NAME")
    with open(args.cohort, encoding="utf-8") as f:
        cohort = json.load(f)
    if args.limit:
        cohort = cohort[:args.limit]
    sys_prompt = SYSTEM_PROMPT
    if args.bias_calibration:
        sys_prompt += BIAS_CALIBRATION_ADDENDUM
        print("Bias calibration: ON (CoBRA-inspired, arXiv:2509.13588)")
    print(f"Evaluating {len(cohort)} evaluators | Model: {model} | Workers: {args.parallel}")
    results = [None] * len(cohort)  # pre-sized so output keeps cohort order
    done = [0]  # list so the progress counter is mutable from the loop
    t0 = time.time()

    def worker(idx, ev):
        # Returns the index too, so completions arriving out of order can
        # still be slotted back into cohort position.
        return idx, evaluate_one(client, model, ev, entity_text, system_prompt=sys_prompt)

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as pool:
        futs = {pool.submit(worker, i, e): i for i, e in enumerate(cohort)}
        for fut in concurrent.futures.as_completed(futs):
            idx, result = fut.result()
            results[idx] = result
            done[0] += 1
            ev = result.get("_evaluator", {})
            score = result.get("score", "?")
            action = result.get("action", "?")
            icon = {"positive": "✅", "neutral": "🤔", "negative": "❌"}.get(action, "?")
            if "error" in result:
                print(f" [{done[0]}/{len(cohort)}] {ev.get('name','?')}: ERROR")
            else:
                print(f" [{done[0]}/{len(cohort)}] {ev.get('name','?')}: {icon} {action} ({score}/10)")
    print(f"\nDone in {time.time()-t0:.1f}s")
    # Save artifacts under results/<tag>/ (tag defaults to a timestamp)
    tag = args.tag or datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = PROJECT_ROOT / "results" / tag
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "raw_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    analysis_text = analyze(results)
    with open(out_dir / "analysis.md", "w", encoding="utf-8") as f:
        f.write(f"# Evaluation: {tag}\n\n")
        f.write(f"- **Entity**: {args.entity}\n")
        f.write(f"- **Cohort**: {args.cohort} ({len(results)} evaluators)\n")
        f.write(f"- **Model**: {model}\n")
        f.write(f"- **Date**: {datetime.now().isoformat()}\n\n")
        f.write(analysis_text)
    meta = {
        "tag": tag, "entity": args.entity, "cohort": args.cohort,
        "model": model, "cohort_size": len(results),
        "timestamp": datetime.now().isoformat(),
    }
    with open(out_dir / "meta.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    print(f"\nResults: {out_dir / 'raw_results.json'}")
    print(f"Analysis: {out_dir / 'analysis.md'}")
    print(f"\n{analysis_text}")
# Script entry point — keeps import side effects limited to definitions.
if __name__ == "__main__":
    main()