File size: 10,975 Bytes
9415028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76bda55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9415028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76bda55
9415028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76bda55
9415028
 
 
 
 
 
 
 
 
 
 
 
 
ff413e9
9415028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88e3978
 
 
 
9415028
88e3978
 
 
 
 
 
 
 
 
 
 
9415028
 
 
88e3978
 
 
 
 
9415028
 
88e3978
9415028
 
88e3978
 
9415028
 
88e3978
9415028
88e3978
 
9415028
 
 
 
 
88e3978
 
 
 
 
 
 
 
9415028
 
 
 
 
 
 
 
 
 
 
76bda55
 
9415028
 
 
 
 
 
 
 
 
 
 
 
76bda55
 
 
 
 
9415028
 
 
 
 
 
 
76bda55
9415028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
"""
f(θ, x) evaluator — scores an entity against an evaluator cohort.

The LLM inhabits each evaluator's persona and produces a structured assessment
of the entity. Domain-agnostic: the system prompt adapts to the entity type.

Usage:
    uv run python scripts/evaluate.py \
      --entity entities/my_product.md \
      --cohort data/cohort.json \
      --tag baseline \
      --parallel 5
"""

import json
import os
import re
import time
import argparse
import concurrent.futures
from collections import Counter
from datetime import datetime
from pathlib import Path

from dotenv import load_dotenv

PROJECT_ROOT = Path(__file__).resolve().parent.parent
load_dotenv(PROJECT_ROOT / ".env")

from openai import OpenAI


# Default system prompt: tells the model to role-play the supplied persona and
# emit a strict-JSON assessment. evaluate_one() sends this verbatim unless the
# caller passes an override.
SYSTEM_PROMPT = """You are an evaluation simulator. You will be given:
1. A detailed persona — a person with specific values, needs, context, and perspective
2. An entity to evaluate (a product, profile, proposal, pitch, resume, etc.)

Your job: fully inhabit this persona's perspective and evaluate the entity AS THEY WOULD.

Be honest and realistic. Not everything is a match. Consider:
- Their specific needs, budget, constraints, and priorities
- Whether this entity solves a real problem for them
- Trust signals and red flags from their perspective
- Practical fit with their situation
- What they'd compare this against

You MUST respond with valid JSON only."""

# Optional bias-aware addendum, appended to SYSTEM_PROMPT when --bias-calibration is used.
# Inspired by CoBRA (Liu et al., CHI'26, arXiv:2509.13588).
BIAS_CALIBRATION_ADDENDUM = """

Important evaluation guidelines for realistic assessment:
- Evaluate the SUBSTANCE of the entity, not its rhetorical framing. A gain-framed
  description ("save 30%") and a loss-framed description ("stop wasting 30%") should
  receive similar scores if the underlying value is the same.
- Weight authority signals (certifications, press mentions, investor logos) proportionally
  to how much this persona's real-world counterpart would actually verify and value them.
- The ORDER in which information appears should not affect your score. Evaluate the
  complete picture, not just first impressions.
- Real people have genuine cognitive biases — you should too. But calibrate to realistic
  human levels, not LLM defaults. A credential matters, but it's not everything."""

# Per-evaluator user prompt. Placeholders are filled from the cohort record by
# evaluate_one(); the doubled braces ({{ }}) render as a literal JSON example
# after .format() runs. The JSON keys here define the schema analyze() reads
# ("score", "action", "attractions", "concerns", "dealbreakers", "summary").
EVAL_PROMPT = """## Evaluator Persona

Name: {name}
Age: {age}
Location: {city}, {state}
Education: {education_level}
Occupation: {occupation}
Status: {marital_status}

{persona}

---

## Entity to Evaluate

{entity}

---

## Task

Inhabit {name}'s perspective completely. Evaluate this entity as they would.

Return JSON:
{{
    "score": <1-10, where 1=strong reject, 5=ambivalent, 10=enthusiastic yes>,
    "action": "<positive | neutral | negative>",
    "attractions": ["<what works for them, max 3>"],
    "concerns": ["<what gives them pause, max 3>"],
    "dealbreakers": ["<hard no's if any, empty list if none>"],
    "summary": "<1-2 sentences — how they'd describe this to a peer>",
    "reasoning": "<2-3 sentence internal monologue>"
}}"""


def evaluate_one(client, model, evaluator, entity_text, system_prompt=None):
    """Score the entity once, from a single evaluator persona's perspective.

    Args:
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        model: Model name forwarded to the API.
        evaluator: Cohort record; requires ``"name"``. Demographic fields and
            the free-text ``"persona"`` are optional and default to "".
        entity_text: The entity document being evaluated.
        system_prompt: Optional system-prompt override (e.g. SYSTEM_PROMPT
            with the bias-calibration addendum appended). Defaults to
            SYSTEM_PROMPT.

    Returns:
        dict: The model's parsed JSON assessment with an ``"_evaluator"``
        metadata dict attached, or ``{"error": ...}`` on any failure. Never
        raises — errors are reported in-band so a parallel run keeps going.
    """
    try:
        # Prompt construction sits INSIDE the try so a malformed cohort
        # record (e.g. missing "name") becomes an error dict instead of a
        # KeyError that would propagate out of the worker thread.
        prompt = EVAL_PROMPT.format(
            name=evaluator["name"],
            age=evaluator.get("age", ""),
            city=evaluator.get("city", ""),
            state=evaluator.get("state", ""),
            education_level=evaluator.get("education_level", ""),
            occupation=evaluator.get("occupation", ""),
            marital_status=evaluator.get("marital_status", ""),
            persona=evaluator.get("persona", ""),
            entity=entity_text,
        )
        resp = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt or SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
            max_tokens=16384,
            temperature=0.7,
        )
        content = resp.choices[0].message.content
        if not content:
            return {"error": f"Empty (finish_reason={resp.choices[0].finish_reason})"}
        # Some models emit <think>...</think> reasoning even in JSON mode;
        # strip it before parsing.
        content = re.sub(r'<think>[\s\S]*?</think>', '', content).strip()
        result = json.loads(content)
        # json.loads may return a list/str/number for valid JSON; only an
        # object is a usable assessment. Reject anything else explicitly
        # rather than failing below with an opaque TypeError.
        if not isinstance(result, dict):
            return {
                "error": f"Non-object JSON response ({type(result).__name__})",
                "_evaluator": {"name": evaluator.get("name", "?")},
            }
        # Attach evaluator demographics so downstream analysis and the saved
        # raw_results.json are self-contained.
        result["_evaluator"] = {
            "name": evaluator["name"],
            "user_id": evaluator.get("user_id"),
            "age": evaluator.get("age"),
            "city": evaluator.get("city"),
            "state": evaluator.get("state"),
            "education_level": evaluator.get("education_level"),
            "occupation": evaluator.get("occupation"),
            "marital_status": evaluator.get("marital_status"),
        }
        return result
    except Exception as e:
        return {"error": str(e), "_evaluator": {"name": evaluator.get("name", "?")}}


def analyze(results):
    """Build a markdown summary report from per-evaluator results.

    Args:
        results: List of dicts as produced by evaluate_one(); entries
            without a "score" key (errors) are ignored.

    Returns:
        str: Markdown report, or "No valid results." when nothing scored.
    """
    valid = [r for r in results if "score" in r]
    if not valid:
        return "No valid results."

    scores = [r["score"] for r in valid]
    n = len(valid)

    # NPS-style segmentation
    champions = [r for r in valid if r["score"] >= 8]
    persuadable = [r for r in valid if 4 <= r["score"] <= 7]
    not_for_them = [r for r in valid if r["score"] <= 3]

    lines = [f"## Summary ({n} evaluated)\n"]
    lines.append(f"Average score: {sum(scores)/n:.1f}/10\n")
    lines.append(f"  Champions (8-10):     {len(champions):>3} ({100*len(champions)//n}%)  — already sold")
    lines.append(f"  Persuadable (4-7):    {len(persuadable):>3} ({100*len(persuadable)//n}%)  — where to focus")
    lines.append(f"  Not for them (1-3):   {len(not_for_them):>3} ({100*len(not_for_them)//n}%)  — not your audience")

    # What's working (from champions + high persuadable)
    strong = [r for r in valid if r["score"] >= 6]
    lines.append("\n### What's Working")
    lines.append("*What resonates with people who are already leaning yes:*")
    all_a = [a for r in strong for a in r.get("attractions", [])]
    for a, c in Counter(all_a).most_common(8):
        lines.append(f"  [{c}x] {a}")

    # What's holding back the persuadable middle
    lines.append("\n### What's Holding Back the Persuadable Middle")
    lines.append("*Concerns from evaluators who scored 4-7 — the ones you can move:*")
    mid_concerns = [c for r in persuadable for c in r.get("concerns", [])]
    for c, cnt in Counter(mid_concerns).most_common(8):
        lines.append(f"  [{cnt}x] {c}")

    # Dealbreakers (across all)
    all_d = [d for r in valid for d in r.get("dealbreakers", [])]
    if all_d:
        lines.append("\n### Dealbreakers")
        for d, cnt in Counter(all_d).most_common(5):
            lines.append(f"  [{cnt}x] {d}")

    # Champions
    sorted_v = sorted(valid, key=lambda r: r["score"], reverse=True)
    lines.append("\n### Champions (top 5)")
    lines.append("*These people are your base — understand why they said yes:*")
    for r in sorted_v[:5]:
        e = r["_evaluator"]
        lines.append(f"  {e['name']}, {e.get('age','')}, {e.get('occupation','')}")
        lines.append(f"    {r['score']}/10 — \"{r.get('summary','')}\"")

    # Not for them — shown for context, not as a problem
    if not_for_them:
        lines.append(f"\n### Not For Them ({len(not_for_them)} evaluators)")
        lines.append("*These evaluators are not your target audience — their feedback is informational, not actionable:*")
        # BUG FIX: previously this printed sorted_v[-3:] — the bottom 3 of
        # ALL valid results — so when fewer than 3 evaluators scored <=3,
        # persuadable (4-7) people were mislabeled as "not your audience".
        # List up to 3 of the lowest actual 1-3 scorers instead.
        for r in sorted(not_for_them, key=lambda r: r["score"])[:3]:
            e = r["_evaluator"]
            lines.append(f"  {e['name']}, {e.get('age','')}, {e.get('occupation','')}")
            lines.append(f"    {r['score']}/10 — \"{r.get('summary','')}\"")

    return "\n".join(lines)


def main():
    """CLI entry point: evaluate one entity against a whole cohort.

    Parses arguments, loads the entity document and cohort JSON, fans out one
    evaluate_one() call per persona on a thread pool, then writes three files
    under results/<tag>/: raw_results.json, analysis.md, and meta.json.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--entity", required=True, help="Path to entity document")
    parser.add_argument("--cohort", default="data/cohort.json")
    parser.add_argument("--tag", default=None)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--parallel", type=int, default=5)
    parser.add_argument("--bias-calibration", action="store_true",
                        help="Add CoBRA-inspired bias calibration instructions (arXiv:2509.13588)")
    args = parser.parse_args()

    entity_text = Path(args.entity).read_text()

    # Client config comes entirely from the environment (.env loaded at import
    # time): LLM_API_KEY, LLM_BASE_URL, LLM_MODEL_NAME.
    client = OpenAI(api_key=os.getenv("LLM_API_KEY"), base_url=os.getenv("LLM_BASE_URL"))
    model = os.getenv("LLM_MODEL_NAME")

    with open(args.cohort) as f:
        cohort = json.load(f)
    if args.limit:
        cohort = cohort[:args.limit]

    sys_prompt = SYSTEM_PROMPT
    if args.bias_calibration:
        sys_prompt += BIAS_CALIBRATION_ADDENDUM
        print("Bias calibration: ON (CoBRA-inspired, arXiv:2509.13588)")

    print(f"Evaluating {len(cohort)} evaluators | Model: {model} | Workers: {args.parallel}")

    # Pre-sized results list so each worker writes to its own slot, preserving
    # cohort order regardless of completion order.
    results = [None] * len(cohort)
    # Single-element list so the as_completed loop below can mutate the
    # progress counter it closes over.
    done = [0]
    t0 = time.time()

    def worker(idx, ev):
        # Returns (idx, result) so the completion loop knows which slot to fill.
        return idx, evaluate_one(client, model, ev, entity_text, system_prompt=sys_prompt)

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as pool:
        futs = {pool.submit(worker, i, e): i for i, e in enumerate(cohort)}
        for fut in concurrent.futures.as_completed(futs):
            idx, result = fut.result()
            results[idx] = result
            done[0] += 1
            ev = result.get("_evaluator", {})
            score = result.get("score", "?")
            action = result.get("action", "?")
            icon = {"positive": "✅", "neutral": "🤔", "negative": "❌"}.get(action, "?")
            if "error" in result:
                print(f"  [{done[0]}/{len(cohort)}] {ev.get('name','?')}: ERROR")
            else:
                print(f"  [{done[0]}/{len(cohort)}] {ev.get('name','?')}: {icon} {action} ({score}/10)")

    print(f"\nDone in {time.time()-t0:.1f}s")

    # Save
    # Fall back to a timestamp tag so repeated untagged runs don't collide.
    tag = args.tag or datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = PROJECT_ROOT / "results" / tag
    out_dir.mkdir(parents=True, exist_ok=True)

    with open(out_dir / "raw_results.json", "w") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    analysis_text = analyze(results)
    with open(out_dir / "analysis.md", "w") as f:
        f.write(f"# Evaluation: {tag}\n\n")
        f.write(f"- **Entity**: {args.entity}\n")
        f.write(f"- **Cohort**: {args.cohort} ({len(results)} evaluators)\n")
        f.write(f"- **Model**: {model}\n")
        f.write(f"- **Date**: {datetime.now().isoformat()}\n\n")
        f.write(analysis_text)

    # Machine-readable run metadata, for comparing tagged runs later.
    meta = {
        "tag": tag, "entity": args.entity, "cohort": args.cohort,
        "model": model, "cohort_size": len(results),
        "timestamp": datetime.now().isoformat(),
    }
    with open(out_dir / "meta.json", "w") as f:
        json.dump(meta, f, indent=2)

    print(f"\nResults:  {out_dir / 'raw_results.json'}")
    print(f"Analysis: {out_dir / 'analysis.md'}")
    print(f"\n{analysis_text}")


if __name__ == "__main__":
    main()