File size: 5,222 Bytes
9415028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4bb654
9415028
 
a4bb654
9415028
 
92cf501
a4bb654
9415028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""
LLM-generated cohort — for domains where Nemotron doesn't fit.

When you need personas that don't exist in the population dataset (e.g., B2B
buyer personas, VC investors, hiring managers), this script generates them
via LLM with explicit stratification constraints.

WARNING: See README.md § The Seeding Problem. LLM-generated personas are
subject to mode collapse and invisible bias. Use census-grounded datasets
(Nemotron) when possible. This script is the fallback.

Usage:
    uv run python scripts/generate_cohort.py \
      --description "B2B SaaS buyers evaluating a data pipeline tool" \
      --segments '[
        {"label": "Solo dev, bootstrap", "count": 8},
        {"label": "Startup eng manager, Series A", "count": 8},
        {"label": "Enterprise CTO, 500+ employees", "count": 8},
        {"label": "Data analyst, non-technical", "count": 8},
        {"label": "DevOps engineer, mid-size company", "count": 8}
      ]' \
      --output data/cohort.json
"""

import json
import os
import re
import argparse
import concurrent.futures
from pathlib import Path

from dotenv import load_dotenv

# Resolve the repo root relative to this file (scripts/ -> project root) and
# load .env BEFORE importing OpenAI, so any environment-based configuration is
# already set when the client library is first imported. main() reads the
# LLM_* variables from this same environment.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
load_dotenv(PROJECT_ROOT / ".env")

from openai import OpenAI

# System prompt shared by every segment request. Ends with an explicit
# JSON-only instruction because the response is parsed with json.loads in
# generate_segment; belt-and-suspenders on top of response_format.
SYSTEM_PROMPT = """You generate realistic, diverse personas for evaluation simulations.
Each persona must be a distinct, internally consistent individual — not a stereotype.
Include: name, age, location, education, occupation, personality traits, values,
priorities, budget constraints, technical background, and decision-making style.
Vary across gender, ethnicity, geography, and temperament.

You MUST respond with valid JSON only."""

# Per-segment user prompt template. str.format placeholders: {count},
# {segment_label}, {description}. The literal braces of the JSON example are
# doubled ({{ }}) so .format() leaves them intact; {segment_label} inside the
# example is intentionally single-braced so each persona is tagged with its
# segment.
GENERATE_PROMPT = """Generate {count} distinct personas matching this segment:

Segment: {segment_label}
Context: {description}

Each persona should be 200-400 words and feel like a real person, not a marketing archetype.

Return JSON:
{{
    "personas": [
        {{
            "name": "<realistic full name>",
            "age": <integer>,
            "sex": "<Male | Female>",
            "city": "<city>",
            "state": "<state abbreviation>",
            "country": "USA",
            "education_level": "<high_school | bachelors | graduate | etc>",
            "occupation": "<specific job title>",
            "marital_status": "<single | married | other>",
            "interests": ["<hobby or skill, 3-5 items>"],
            "persona": "<200-400 word detailed persona narrative>",
            "segment": "{segment_label}"
        }}
    ]
}}"""


def generate_segment(client, model, segment_label, count, description):
    """Generate ``count`` personas for one segment with a single LLM call.

    Args:
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        model: Model name passed through to the API.
        segment_label: Stratification label embedded in each persona.
        count: Number of personas requested from the model.
        description: Overall cohort context injected into the prompt.

    Returns:
        list[dict]: Parsed personas, or ``[]`` on any API or parse failure.
        Best-effort by design: one failed segment must not abort the run —
        the caller aggregates results and prints per-segment counts.
    """
    prompt = GENERATE_PROMPT.format(
        count=count, segment_label=segment_label, description=description
    )
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
            max_tokens=16384,
            temperature=0.8,
        )
        content = resp.choices[0].message.content
        if not content:
            return []
        # Reasoning models may prepend <think>...</think> despite JSON mode.
        content = re.sub(r'<think>[\s\S]*?</think>', '', content).strip()
        # Some models wrap the payload in markdown fences (```json ... ```)
        # even with response_format set; json.loads would fail on the fence.
        if content.startswith("```"):
            content = re.sub(r'^```[a-zA-Z]*\s*|\s*```$', '', content).strip()
        data = json.loads(content)
        personas = data.get("personas", [])
        # Guard against a malformed response where "personas" is not a list;
        # the caller extends a list with this value.
        return personas if isinstance(personas, list) else []
    except Exception as e:
        # Broad by intent (network, API, JSON errors all land here): log and
        # return empty so the remaining segments still run.
        print(f"  ERROR generating '{segment_label}': {e}")
        return []


def main():
    """CLI entry point: fan out one LLM call per segment, merge, and save.

    Reads LLM_API_KEY / LLM_BASE_URL / LLM_MODEL_NAME from the environment
    (populated from .env at import time), generates each segment concurrently,
    assigns sequential user_ids, and writes the merged cohort as JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--description", required=True, help="Context for persona generation")
    parser.add_argument("--segments", required=True, type=json.loads,
                        help='JSON array: [{"label": "...", "count": N}, ...]')
    parser.add_argument("--output", default="data/cohort.json")
    parser.add_argument("--parallel", type=int, default=3)
    args = parser.parse_args()

    client = OpenAI(api_key=os.getenv("LLM_API_KEY"), base_url=os.getenv("LLM_BASE_URL"))
    model = os.getenv("LLM_MODEL_NAME")

    print(f"Generating personas | Model: {model}")
    print(f"Context: {args.description}")
    print(f"Segments: {len(args.segments)}\n")

    print("⚠️  WARNING: LLM-generated personas are subject to mode collapse.")
    print("   Use census-grounded datasets (Nemotron) when possible.\n")

    all_personas = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as pool:
        futs = {
            pool.submit(generate_segment, client, model,
                        seg["label"], seg["count"], args.description): seg
            for seg in args.segments
        }
        for fut in concurrent.futures.as_completed(futs):
            seg = futs[fut]
            personas = fut.result()
            # Surface shortfalls: generate_segment returns [] on failure and a
            # model may return fewer personas than requested.
            shortfall = "" if len(personas) == seg["count"] else f" (requested {seg['count']})"
            print(f"  {seg['label']}: {len(personas)} personas generated{shortfall}")
            all_personas.extend(personas)

    # Sequential ids in completion order — stable within a run, not across runs.
    for i, p in enumerate(all_personas):
        p["user_id"] = i

    Path(args.output).parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII (names, cities),
    # which breaks under a non-UTF-8 platform default encoding (e.g. cp1252).
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(all_personas, f, ensure_ascii=False, indent=2)

    print(f"\nSaved {len(all_personas)} personas to {args.output}")


# Standard script entry guard: run only when executed directly, not on import.
if __name__ == "__main__":
    main()