""" Build Persona Script. Generates synthetic persona data (narratives, social posts, AAC logs, gesture logs) for a given persona using the configured data_generation model (Qwen3-32B via Groq). Skips any file that already meets the minimum entry count — safe to re-run. Use --force to regenerate all files regardless. Usage: python scripts/build_persona.py --persona alex_rivera python scripts/build_persona.py --persona alex_rivera --force python scripts/build_persona.py --persona persona_2 --output-dir data/personas """ from __future__ import annotations import argparse import asyncio import json import re import sys from pathlib import Path from typing import Any from langchain_core.messages import HumanMessage, SystemMessage # Minimum entry counts — files with at least this many lines are skipped _MIN_COUNTS = { "narratives.jsonl": 50, "social_posts.jsonl": 50, "aac_logs.jsonl": 100, "gesture_logs.jsonl": 50, } _GESTURES = ["thumbs_up", "thumbs_down", "open_palm", "pointing", "closed_fist", "wave", "neutral"] _AFFECTS = ["happy", "frustrated", "surprised", "neutral"] _NO_THINK = "/no_think\n" # suppress Qwen3 CoT for deterministic structured output # ── Helpers ─────────────────────────────────────────────────────────────────── def _count_valid_lines(path: Path) -> int: """Count non-empty, non-placeholder JSONL lines.""" if not path.exists(): return 0 count = 0 with path.open(encoding="utf-8") as f: for line in f: line = line.strip() if line and '"placeholder"' not in line: count += 1 return count def _extract_json_list(raw: str) -> list[dict]: """Extract a JSON array from LLM output, stripping think blocks and markdown fences.""" # Strip blocks raw = re.sub(r".*?", "", raw, flags=re.DOTALL) raw = re.sub(r".*?", "", raw, flags=re.DOTALL) # Strip markdown fences raw = re.sub(r"^```(?:json)?\s*", "", raw.strip()) raw = re.sub(r"\s*```$", "", raw.strip()) # Find first [...] block match = re.search(r"\[.*\]", raw, re.DOTALL) if not match: raise ValueError(f"No JSON array found in response:\n{raw[:400]}") return json.loads(match.group(0)) async def _llm_invoke(llm: Any, system: str, user: str) -> str: resp = await llm.ainvoke([ SystemMessage(content=_NO_THINK + system), HumanMessage(content=user), ]) return resp.content if hasattr(resp, "content") else str(resp) def _append_jsonl(path: Path, records: list[dict]) -> None: with path.open("a", encoding="utf-8") as f: for r in records: f.write(json.dumps(r, ensure_ascii=False) + "\n") print(f" → appended {len(records)} records to {path.name}") # ── Generator functions ─────────────────────────────────────────────────────── async def _generate_narratives(llm: Any, profile: dict, out_path: Path, needed: int) -> None: """Generate first-person autobiographical narrative paragraphs.""" profile_card = profile.get("profile_card", "") bio = json.dumps(profile.get("biography", {}), indent=2) topics = [ "aac_journey", "family", "bills", "work", "daily_life", "cooking", "childhood", "leo", "disability_identity", "humor", ] batch = min(needed, 20) system = ( "You generate synthetic first-person narrative paragraphs for an AAC user persona. " "Each paragraph is 2-4 sentences, written in the user's authentic voice. " "Output a JSON array of objects with keys: text, topic. " "No preamble, no markdown, output only the JSON array." ) user = ( f"Persona: {profile_card}\n\nBiography: {bio}\n\n" f"Topics to vary across entries: {topics}\n\n" f"Generate {batch} narrative paragraphs. " "Match the persona's tone exactly: witty, short, informal, self-deprecating. " "Avoid formal language. Add 'source': 'generated' to each object." ) records = [] while len(records) < needed: raw = await _llm_invoke(llm, system, user) batch_records = _extract_json_list(raw) for r in batch_records: if "text" in r and "topic" in r: r.setdefault("source", "generated") records.append(r) if len(records) < needed: user = user.replace(f"Generate {batch}", f"Generate {min(needed - len(records), 20)}") _append_jsonl(out_path, records[:needed]) async def _generate_social_posts(llm: Any, profile: dict, out_path: Path, needed: int) -> None: """Generate social media posts (Twitter/Facebook style).""" profile_card = profile.get("profile_card", "") topics = [ "bills", "cooking", "leo", "work", "daily_life", "aac", "sports", "weather", "humor", "family", ] system = ( "You generate synthetic social media posts for an AAC user persona. " "Posts are 1-3 sentences, casual, Twitter/Facebook style. " "Output a JSON array of objects with keys: text, topic, sentiment. " "sentiment must be one of: positive, negative, neutral. " "No preamble, output only the JSON array." ) user = ( f"Persona: {profile_card}\n\nTopics: {topics}\n\n" f"Generate {min(needed, 25)} social media posts. " "Capture the persona's dry humor, Bills fandom, and dad-life. " "Moderate emoji use. Add 'source': 'generated' to each object." ) records = [] while len(records) < needed: raw = await _llm_invoke(llm, system, user) batch_records = _extract_json_list(raw) for r in batch_records: if "text" in r and "topic" in r: r.setdefault("sentiment", "neutral") r.setdefault("source", "generated") records.append(r) if len(records) < needed: user = user.replace( f"Generate {min(needed, 25)}", f"Generate {min(needed - len(records), 25)}" ) _append_jsonl(out_path, records[:needed]) async def _generate_aac_logs(llm: Any, profile: dict, out_path: Path, needed: int) -> None: """Generate multi-turn AAC conversation logs.""" profile_card = profile.get("profile_card", "") scenarios = [ "daily_checkin", "lunch_planning", "football_discussion", "family_update", "work_update", "medical_appointment", "friend_visit", "evening_wind_down", "weather_chat", "game_night", ] system = ( "You generate synthetic multi-turn AAC conversations between a communication partner and an AAC user. " "Output a JSON array of turn objects with keys: turn (int), role ('partner' or 'user'), " "text, scenario, topic, sentiment. " "The user's responses are short, punchy, and match the persona's style. " "Each scenario block should be 6-10 turns. " "No preamble, output only the JSON array." ) user = ( f"Persona: {profile_card}\n\nScenarios to cover: {scenarios}\n\n" f"Generate approximately {min(needed, 60)} turns across 2-3 different scenarios. " "Number turns sequentially within each scenario starting at 1. " "User responses must be brief (1-2 sentences max), witty, and authentic." ) records = [] while len(records) < needed: raw = await _llm_invoke(llm, system, user) batch_records = _extract_json_list(raw) for r in batch_records: if "text" in r and "role" in r: r.setdefault("scenario", "daily_checkin") r.setdefault("topic", "daily_life") r.setdefault("sentiment", "neutral") records.append(r) if len(records) < needed: user = user.replace( f"Generate approximately {min(needed, 60)}", f"Generate approximately {min(needed - len(records), 60)}" ) _append_jsonl(out_path, records[:needed]) async def _generate_gesture_logs(llm: Any, profile: dict, out_path: Path, needed: int) -> None: """Generate gesture-annotated interaction records.""" profile_card = profile.get("profile_card", "") shortcuts = profile.get("aac_shortcuts", {}).get("letter_shortcuts", {}) system = ( "You generate synthetic gesture-annotated AAC interaction logs. " "Each record captures a moment where the AAC user used a gesture or air-signed a letter " "alongside a partner query and their selected response. " "Output a JSON array of objects with keys: " "scenario (string), partner_query (string), gesture (string), " "affect (string), air_sign_letter (string or null), " "selected_response (string), topic (string), sentiment (string). " f"gesture must be one of: {_GESTURES}. " f"affect must be one of: {_AFFECTS}. " "air_sign_letter is a single uppercase letter or null. " "No preamble, output only the JSON array." ) user = ( f"Persona: {profile_card}\n\n" f"Letter shortcuts: {json.dumps(shortcuts)}\n\n" f"Generate {min(needed, 50)} gesture-annotated interaction records. " "Vary gestures, affects, and scenarios. " "When air_sign_letter is set, the selected_response should reference " "one of that letter's shortcut words. " "Keep responses short and in the persona's voice." ) records = [] while len(records) < needed: raw = await _llm_invoke(llm, system, user) batch_records = _extract_json_list(raw) for r in batch_records: if "partner_query" in r and "selected_response" in r: r.setdefault("scenario", "daily_life") r.setdefault("gesture", "neutral") r.setdefault("affect", "neutral") r.setdefault("air_sign_letter", None) r.setdefault("topic", "daily_life") r.setdefault("sentiment", "neutral") records.append(r) if len(records) < needed: user = user.replace( f"Generate {min(needed, 50)}", f"Generate {min(needed - len(records), 50)}" ) _append_jsonl(out_path, records[:needed]) # ── Main ────────────────────────────────────────────────────────────────────── _GENERATORS = { "narratives.jsonl": _generate_narratives, "social_posts.jsonl": _generate_social_posts, "aac_logs.jsonl": _generate_aac_logs, "gesture_logs.jsonl": _generate_gesture_logs, } async def build_persona( persona_id: str, config_path: str, output_dir: str, force: bool = False, ) -> None: from memorybridge.core.models import ModelRegistry from memorybridge.memory.profile_loader import ProfileLoader registry = ModelRegistry(config_path) loader = ProfileLoader(config_path) profile = loader.load(persona_id) out_path = Path(output_dir) / persona_id out_path.mkdir(parents=True, exist_ok=True) provider = registry.get_config("models", "data_generation", "provider") model = registry.get_config("models", "data_generation", "model") print(f"Building persona data for: {persona_id}") print(f"Output directory: {out_path}") print(f"Model: {provider} / {model}") llm = registry.get_llm("data_generation") # Ensure letter_shortcuts.json is present shortcuts_path = out_path / "letter_shortcuts.json" if not shortcuts_path.exists(): shortcuts = profile.get("aac_shortcuts", {}).get("letter_shortcuts", {}) shortcuts_path.write_text(json.dumps(shortcuts, indent=2, ensure_ascii=False)) print(f" Written letter_shortcuts.json") for filename, min_count in _MIN_COUNTS.items(): filepath = out_path / filename existing = _count_valid_lines(filepath) if not force and existing >= min_count: print(f" Skipping {filename} — {existing}/{min_count} entries already present") continue needed = min_count - existing if not force else min_count print(f" Generating {filename} — need {needed} more entries ({existing} existing)...") if force and filepath.exists(): filepath.unlink() try: await _GENERATORS[filename](llm, profile, filepath, needed) except Exception as exc: print(f" ERROR generating {filename}: {exc}", file=sys.stderr) print("\nDone.") def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Generate synthetic persona corpus data using an LLM." ) parser.add_argument( "--persona", required=True, help="Persona ID (e.g., alex_rivera). Must have a matching config/personas/{id}.json." ) parser.add_argument( "--config", default="memorybridge/config/settings.yaml", help="Path to settings.yaml." ) parser.add_argument( "--output-dir", default="data/personas", help="Output directory for generated data." ) parser.add_argument( "--force", action="store_true", help="Regenerate all files even if minimum counts are already met." ) return parser.parse_args() def main() -> None: args = parse_args() asyncio.run(build_persona(args.persona, args.config, args.output_dir, args.force)) if __name__ == "__main__": main()