# memorybridge/scripts/build_persona.py
# kimandrew927's picture
# Initial Space deployment
# 1004967
"""
Build Persona Script.

Generates synthetic persona data (narratives, social posts, AAC logs, gesture logs)
for a given persona using the configured data_generation model (Qwen3-32B via Groq).

Skips any file that already meets the minimum entry count — safe to re-run.
Use --force to regenerate all files regardless.

Usage:
    python scripts/build_persona.py --persona alex_rivera
    python scripts/build_persona.py --persona alex_rivera --force
    python scripts/build_persona.py --persona persona_2 --output-dir data/personas
"""
from __future__ import annotations
import argparse
import asyncio
import json
import re
import sys
from pathlib import Path
from typing import Any
from langchain_core.messages import HumanMessage, SystemMessage
# Minimum entry counts — files with at least this many valid lines are skipped
# by build_persona (unless --force is given).
_MIN_COUNTS = {
    "narratives.jsonl": 50,
    "social_posts.jsonl": 50,
    "aac_logs.jsonl": 100,
    "gesture_logs.jsonl": 50,
}
# Allowed values interpolated into the gesture-log system prompt.
_GESTURES = ["thumbs_up", "thumbs_down", "open_palm", "pointing", "closed_fist", "wave", "neutral"]
_AFFECTS = ["happy", "frustrated", "surprised", "neutral"]
# Prefix that _llm_invoke prepends to every system prompt.
_NO_THINK = "/no_think\n"  # suppress Qwen3 CoT for deterministic structured output
# ── Helpers ───────────────────────────────────────────────────────────────────
def _count_valid_lines(path: Path) -> int:
    """Return how many usable JSONL entries *path* holds.

    A line counts when it is non-blank after stripping and does not contain
    the literal ``"placeholder"`` token. A missing file counts as zero.
    """
    if not path.exists():
        return 0
    with path.open(encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        return sum(1 for entry in stripped if entry and '"placeholder"' not in entry)
def _extract_json_list(raw: str) -> list[dict]:
    """Extract the first JSON array of objects from raw LLM output.

    ``<think>`` / ``<thinking>`` blocks are removed first. The remaining text
    is then scanned for the first ``[`` from which a complete JSON array can
    be decoded, so markdown fences, preambles and trailing remarks (even ones
    containing brackets) do not break parsing.

    Args:
        raw: The raw text returned by the model.

    Returns:
        The dict items of the first decodable array that contains at least
        one dict. If arrays were decoded but none contained a dict, an empty
        list is returned.

    Raises:
        ValueError: If no JSON array can be decoded from the text.
    """
    # Drop reasoning blocks first: they may contain brackets or example arrays.
    cleaned = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL)
    cleaned = re.sub(r"<thinking>.*?</thinking>", "", cleaned, flags=re.DOTALL)
    cleaned = cleaned.strip()

    decoder = json.JSONDecoder()
    saw_array = False
    pos = cleaned.find("[")
    while pos != -1:
        try:
            value, end = decoder.raw_decode(cleaned, pos)
        except json.JSONDecodeError:
            # Not a JSON array at this bracket (e.g. prose like "[note]").
            pos = cleaned.find("[", pos + 1)
            continue
        saw_array = True
        # Callers treat every item as a dict; drop stray scalars/strings.
        objects = [item for item in value if isinstance(item, dict)]
        if objects:
            return objects
        # Resume after the decoded array so nested brackets are not re-scanned.
        pos = cleaned.find("[", end)

    if saw_array:
        return []
    raise ValueError(f"No JSON array found in response:\n{cleaned[:400]}")
async def _llm_invoke(llm: Any, system: str, user: str) -> str:
    """Send one system + user exchange to *llm* and return the reply.

    The system prompt is prefixed with ``_NO_THINK``. When the response
    object has a ``content`` attribute that attribute is returned; otherwise
    the response is converted with ``str``.
    """
    messages = [
        SystemMessage(content=_NO_THINK + system),
        HumanMessage(content=user),
    ]
    reply = await llm.ainvoke(messages)
    if hasattr(reply, "content"):
        return reply.content
    return str(reply)
def _append_jsonl(path: Path, records: list[dict]) -> None:
    """Append *records* to *path* as JSON Lines and report the count on stdout.

    Args:
        path: Target file; opened in append mode with UTF-8 encoding.
        records: Objects to serialise, one JSON document per line.
    """
    # Serialise everything before touching the file so a non-serialisable
    # record cannot leave a partially written batch behind.
    lines = [json.dumps(record, ensure_ascii=False) + "\n" for record in records]
    with path.open("a", encoding="utf-8") as f:
        f.writelines(lines)
    print(f" → appended {len(records)} records to {path.name}")
# ── Generator functions ───────────────────────────────────────────────────────
async def _generate_narratives(llm: Any, profile: dict, out_path: Path, needed: int) -> None:
    """Generate first-person autobiographical narrative paragraphs.

    Requests batches of at most 20 paragraphs until *needed* valid records
    (objects carrying both ``text`` and ``topic``) have been collected, then
    appends them to *out_path*.

    Args:
        llm: Chat model object handed to ``_llm_invoke``.
        profile: Persona profile; ``profile_card`` and ``biography`` are read.
        out_path: JSONL file the records are appended to.
        needed: Number of records to produce.

    Raises:
        RuntimeError: If several consecutive batches yield no usable record.
            Records collected before the failure are still written.
    """
    profile_card = profile.get("profile_card", "")
    bio = json.dumps(profile.get("biography", {}), indent=2)
    topics = [
        "aac_journey", "family", "bills", "work", "daily_life",
        "cooking", "childhood", "leo", "disability_identity", "humor",
    ]
    batch_cap = 20
    # Stop after this many consecutive batches without a usable record so a
    # misbehaving model cannot keep the loop (and the API bill) running forever.
    max_barren_rounds = 3
    system = (
        "You generate synthetic first-person narrative paragraphs for an AAC user persona. "
        "Each paragraph is 2-4 sentences, written in the user's authentic voice. "
        "Output a JSON array of objects with keys: text, topic. "
        "No preamble, no markdown, output only the JSON array."
    )
    records: list[dict] = []
    barren_rounds = 0
    try:
        while len(records) < needed:
            if barren_rounds >= max_barren_rounds:
                raise RuntimeError(
                    f"no usable narrative records in {max_barren_rounds} consecutive "
                    f"batches ({len(records)}/{needed} collected)"
                )
            # Rebuild the prompt each round so the requested count always
            # matches what is still missing.
            batch = min(needed - len(records), batch_cap)
            user = (
                f"Persona: {profile_card}\n\nBiography: {bio}\n\n"
                f"Topics to vary across entries: {topics}\n\n"
                f"Generate {batch} narrative paragraphs. "
                "Match the persona's tone exactly: witty, short, informal, self-deprecating. "
                "Avoid formal language. Add 'source': 'generated' to each object."
            )
            raw = await _llm_invoke(llm, system, user)
            before = len(records)
            try:
                batch_records = _extract_json_list(raw)
            except ValueError as exc:
                # A malformed batch is retried instead of aborting the whole run.
                print(f" WARNING: unparseable narratives batch, retrying: {exc}", file=sys.stderr)
                batch_records = []
            for r in batch_records:
                # Guard against non-dict items: `in` on a str is a substring test.
                if isinstance(r, dict) and "text" in r and "topic" in r:
                    r.setdefault("source", "generated")
                    records.append(r)
            barren_rounds = 0 if len(records) > before else barren_rounds + 1
    finally:
        # Persist whatever was collected, even on failure, so a re-run only
        # has to top up the remainder.
        _append_jsonl(out_path, records[:needed])
async def _generate_social_posts(llm: Any, profile: dict, out_path: Path, needed: int) -> None:
    """Generate social media posts (Twitter/Facebook style).

    Requests batches of at most 25 posts until *needed* valid records
    (objects carrying both ``text`` and ``topic``) have been collected, then
    appends them to *out_path*. A missing ``sentiment`` defaults to
    ``"neutral"`` and a missing ``source`` to ``"generated"``.

    Args:
        llm: Chat model object handed to ``_llm_invoke``.
        profile: Persona profile; ``profile_card`` is read.
        out_path: JSONL file the records are appended to.
        needed: Number of records to produce.

    Raises:
        RuntimeError: If several consecutive batches yield no usable record.
            Records collected before the failure are still written.
    """
    profile_card = profile.get("profile_card", "")
    topics = [
        "bills", "cooking", "leo", "work", "daily_life",
        "aac", "sports", "weather", "humor", "family",
    ]
    batch_cap = 25
    # Stop after this many consecutive batches without a usable record so a
    # misbehaving model cannot keep the loop running forever.
    max_barren_rounds = 3
    system = (
        "You generate synthetic social media posts for an AAC user persona. "
        "Posts are 1-3 sentences, casual, Twitter/Facebook style. "
        "Output a JSON array of objects with keys: text, topic, sentiment. "
        "sentiment must be one of: positive, negative, neutral. "
        "No preamble, output only the JSON array."
    )
    records: list[dict] = []
    barren_rounds = 0
    try:
        while len(records) < needed:
            if barren_rounds >= max_barren_rounds:
                raise RuntimeError(
                    f"no usable social post records in {max_barren_rounds} consecutive "
                    f"batches ({len(records)}/{needed} collected)"
                )
            # Rebuild the prompt each round so the requested count always
            # matches what is still missing.
            batch = min(needed - len(records), batch_cap)
            user = (
                f"Persona: {profile_card}\n\nTopics: {topics}\n\n"
                f"Generate {batch} social media posts. "
                "Capture the persona's dry humor, Bills fandom, and dad-life. "
                "Moderate emoji use. Add 'source': 'generated' to each object."
            )
            raw = await _llm_invoke(llm, system, user)
            before = len(records)
            try:
                batch_records = _extract_json_list(raw)
            except ValueError as exc:
                # A malformed batch is retried instead of aborting the whole run.
                print(f" WARNING: unparseable social_posts batch, retrying: {exc}", file=sys.stderr)
                batch_records = []
            for r in batch_records:
                # Guard against non-dict items: `in` on a str is a substring test.
                if isinstance(r, dict) and "text" in r and "topic" in r:
                    r.setdefault("sentiment", "neutral")
                    r.setdefault("source", "generated")
                    records.append(r)
            barren_rounds = 0 if len(records) > before else barren_rounds + 1
    finally:
        # Persist whatever was collected, even on failure, so a re-run only
        # has to top up the remainder.
        _append_jsonl(out_path, records[:needed])
async def _generate_aac_logs(llm: Any, profile: dict, out_path: Path, needed: int) -> None:
    """Generate multi-turn AAC conversation logs.

    Requests batches of roughly 60 turns at most until *needed* valid turn
    records (objects carrying both ``text`` and ``role``) have been
    collected, then appends them to *out_path*. Missing ``scenario``,
    ``topic`` and ``sentiment`` keys receive defaults.

    Args:
        llm: Chat model object handed to ``_llm_invoke``.
        profile: Persona profile; ``profile_card`` is read.
        out_path: JSONL file the records are appended to.
        needed: Number of turn records to produce.

    Raises:
        RuntimeError: If several consecutive batches yield no usable record.
            Records collected before the failure are still written.
    """
    profile_card = profile.get("profile_card", "")
    scenarios = [
        "daily_checkin", "lunch_planning", "football_discussion", "family_update",
        "work_update", "medical_appointment", "friend_visit", "evening_wind_down",
        "weather_chat", "game_night",
    ]
    batch_cap = 60
    # Stop after this many consecutive batches without a usable record so a
    # misbehaving model cannot keep the loop running forever.
    max_barren_rounds = 3
    system = (
        "You generate synthetic multi-turn AAC conversations between a communication partner and an AAC user. "
        "Output a JSON array of turn objects with keys: turn (int), role ('partner' or 'user'), "
        "text, scenario, topic, sentiment. "
        "The user's responses are short, punchy, and match the persona's style. "
        "Each scenario block should be 6-10 turns. "
        "No preamble, output only the JSON array."
    )
    records: list[dict] = []
    barren_rounds = 0
    try:
        while len(records) < needed:
            if barren_rounds >= max_barren_rounds:
                raise RuntimeError(
                    f"no usable AAC log records in {max_barren_rounds} consecutive "
                    f"batches ({len(records)}/{needed} collected)"
                )
            # Rebuild the prompt each round so the requested count always
            # matches what is still missing (the previous str.replace patch
            # stopped matching after the first change).
            batch = min(needed - len(records), batch_cap)
            user = (
                f"Persona: {profile_card}\n\nScenarios to cover: {scenarios}\n\n"
                f"Generate approximately {batch} turns across 2-3 different scenarios. "
                "Number turns sequentially within each scenario starting at 1. "
                "User responses must be brief (1-2 sentences max), witty, and authentic."
            )
            raw = await _llm_invoke(llm, system, user)
            before = len(records)
            try:
                batch_records = _extract_json_list(raw)
            except ValueError as exc:
                # A malformed batch is retried instead of aborting the whole run.
                print(f" WARNING: unparseable aac_logs batch, retrying: {exc}", file=sys.stderr)
                batch_records = []
            for r in batch_records:
                # Guard against non-dict items: `in` on a str is a substring test.
                if isinstance(r, dict) and "text" in r and "role" in r:
                    r.setdefault("scenario", "daily_checkin")
                    r.setdefault("topic", "daily_life")
                    r.setdefault("sentiment", "neutral")
                    records.append(r)
            barren_rounds = 0 if len(records) > before else barren_rounds + 1
    finally:
        # Persist whatever was collected, even on failure, so a re-run only
        # has to top up the remainder.
        _append_jsonl(out_path, records[:needed])
async def _generate_gesture_logs(llm: Any, profile: dict, out_path: Path, needed: int) -> None:
    """Generate gesture-annotated interaction records.

    Requests batches of at most 50 records until *needed* valid records
    (objects carrying both ``partner_query`` and ``selected_response``) have
    been collected, then appends them to *out_path*. Missing optional keys
    receive defaults.

    Args:
        llm: Chat model object handed to ``_llm_invoke``.
        profile: Persona profile; ``profile_card`` and
            ``aac_shortcuts.letter_shortcuts`` are read.
        out_path: JSONL file the records are appended to.
        needed: Number of records to produce.

    Raises:
        RuntimeError: If several consecutive batches yield no usable record.
            Records collected before the failure are still written.
    """
    profile_card = profile.get("profile_card", "")
    shortcuts = profile.get("aac_shortcuts", {}).get("letter_shortcuts", {})
    batch_cap = 50
    # Stop after this many consecutive batches without a usable record so a
    # misbehaving model cannot keep the loop running forever.
    max_barren_rounds = 3
    system = (
        "You generate synthetic gesture-annotated AAC interaction logs. "
        "Each record captures a moment where the AAC user used a gesture or air-signed a letter "
        "alongside a partner query and their selected response. "
        "Output a JSON array of objects with keys: "
        "scenario (string), partner_query (string), gesture (string), "
        "affect (string), air_sign_letter (string or null), "
        "selected_response (string), topic (string), sentiment (string). "
        f"gesture must be one of: {_GESTURES}. "
        f"affect must be one of: {_AFFECTS}. "
        "air_sign_letter is a single uppercase letter or null. "
        "No preamble, output only the JSON array."
    )
    records: list[dict] = []
    barren_rounds = 0
    try:
        while len(records) < needed:
            if barren_rounds >= max_barren_rounds:
                raise RuntimeError(
                    f"no usable gesture log records in {max_barren_rounds} consecutive "
                    f"batches ({len(records)}/{needed} collected)"
                )
            # Rebuild the prompt each round so the requested count always
            # matches what is still missing.
            batch = min(needed - len(records), batch_cap)
            user = (
                f"Persona: {profile_card}\n\n"
                f"Letter shortcuts: {json.dumps(shortcuts)}\n\n"
                f"Generate {batch} gesture-annotated interaction records. "
                "Vary gestures, affects, and scenarios. "
                "When air_sign_letter is set, the selected_response should reference "
                "one of that letter's shortcut words. "
                "Keep responses short and in the persona's voice."
            )
            raw = await _llm_invoke(llm, system, user)
            before = len(records)
            try:
                batch_records = _extract_json_list(raw)
            except ValueError as exc:
                # A malformed batch is retried instead of aborting the whole run.
                print(f" WARNING: unparseable gesture_logs batch, retrying: {exc}", file=sys.stderr)
                batch_records = []
            for r in batch_records:
                # Guard against non-dict items: `in` on a str is a substring test.
                if isinstance(r, dict) and "partner_query" in r and "selected_response" in r:
                    r.setdefault("scenario", "daily_life")
                    r.setdefault("gesture", "neutral")
                    r.setdefault("affect", "neutral")
                    r.setdefault("air_sign_letter", None)
                    r.setdefault("topic", "daily_life")
                    r.setdefault("sentiment", "neutral")
                    records.append(r)
            barren_rounds = 0 if len(records) > before else barren_rounds + 1
    finally:
        # Persist whatever was collected, even on failure, so a re-run only
        # has to top up the remainder.
        _append_jsonl(out_path, records[:needed])
# ── Main ──────────────────────────────────────────────────────────────────────
# Maps each output filename (the keys of _MIN_COUNTS) to the coroutine function
# that build_persona awaits to generate that file.
_GENERATORS = {
    "narratives.jsonl": _generate_narratives,
    "social_posts.jsonl": _generate_social_posts,
    "aac_logs.jsonl": _generate_aac_logs,
    "gesture_logs.jsonl": _generate_gesture_logs,
}
async def build_persona(
    persona_id: str,
    config_path: str,
    output_dir: str,
    force: bool = False,
) -> None:
    """Generate every persona data file that is below its minimum entry count.

    Loads the persona profile, makes sure ``letter_shortcuts.json`` exists in
    the persona's output directory, then walks ``_MIN_COUNTS`` and runs the
    matching generator for each file that still needs entries. A failure in
    one generator is reported on stderr and does not stop the remaining files.

    Args:
        persona_id: Persona identifier; also the name of the output subfolder.
        config_path: Settings path handed to ``ModelRegistry`` and ``ProfileLoader``.
        output_dir: Parent directory for the per-persona output folder.
        force: When true, existing files are deleted and regenerated in full.
    """
    # Project imports stay function-local, as in the original module.
    from memorybridge.core.models import ModelRegistry
    from memorybridge.memory.profile_loader import ProfileLoader

    registry = ModelRegistry(config_path)
    loader = ProfileLoader(config_path)
    profile = loader.load(persona_id)
    out_path = Path(output_dir) / persona_id
    out_path.mkdir(parents=True, exist_ok=True)

    provider = registry.get_config("models", "data_generation", "provider")
    model = registry.get_config("models", "data_generation", "model")
    print(f"Building persona data for: {persona_id}")
    print(f"Output directory: {out_path}")
    print(f"Model: {provider} / {model}")
    llm = registry.get_llm("data_generation")

    # Ensure letter_shortcuts.json is present
    shortcuts_path = out_path / "letter_shortcuts.json"
    if not shortcuts_path.exists():
        shortcuts = profile.get("aac_shortcuts", {}).get("letter_shortcuts", {})
        # ensure_ascii=False may emit non-ASCII text, so pin the encoding
        # instead of relying on the platform default.
        shortcuts_path.write_text(
            json.dumps(shortcuts, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        print(" Written letter_shortcuts.json")

    for filename, min_count in _MIN_COUNTS.items():
        filepath = out_path / filename
        existing = _count_valid_lines(filepath)
        if not force and existing >= min_count:
            print(f" Skipping {filename} — {existing}/{min_count} entries already present")
            continue
        # Without --force only the shortfall is generated and appended.
        needed = min_count - existing if not force else min_count
        print(f" Generating {filename} — need {needed} more entries ({existing} existing)...")
        if force and filepath.exists():
            filepath.unlink()
        try:
            await _GENERATORS[filename](llm, profile, filepath, needed)
        except Exception as exc:
            # Best effort: report and move on to the next file.
            print(f" ERROR generating {filename}: {exc}", file=sys.stderr)
    print("\nDone.")
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the persona build script.

    Recognised options are ``--persona`` (required), ``--config``,
    ``--output-dir`` and the ``--force`` flag.
    """
    option_specs = (
        (
            "--persona",
            {
                "required": True,
                "help": "Persona ID (e.g., alex_rivera). Must have a matching config/personas/{id}.json.",
            },
        ),
        (
            "--config",
            {
                "default": "memorybridge/config/settings.yaml",
                "help": "Path to settings.yaml.",
            },
        ),
        (
            "--output-dir",
            {
                "default": "data/personas",
                "help": "Output directory for generated data.",
            },
        ),
        (
            "--force",
            {
                "action": "store_true",
                "help": "Regenerate all files even if minimum counts are already met.",
            },
        ),
    )
    cli = argparse.ArgumentParser(
        description="Generate synthetic persona corpus data using an LLM."
    )
    # Register the options in the same order as listed above.
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def main() -> None:
    """CLI entry point: parse the arguments and run ``build_persona`` to completion."""
    cli = parse_args()
    job = build_persona(cli.persona, cli.config, cli.output_dir, cli.force)
    asyncio.run(job)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()