Spaces:
Build error
Build error
"""
Build Persona Script.
Generates synthetic persona data (narratives, social posts, AAC logs, gesture logs)
for a given persona using the configured data_generation model (Qwen3-32B via Groq).
Skips any file that already meets the minimum entry count — safe to re-run.
Use --force to regenerate all files regardless.
Usage:
    python scripts/build_persona.py --persona alex_rivera
    python scripts/build_persona.py --persona alex_rivera --force
    python scripts/build_persona.py --persona persona_2 --output-dir data/personas
"""
| from __future__ import annotations | |
| import argparse | |
| import asyncio | |
| import json | |
| import re | |
| import sys | |
| from pathlib import Path | |
| from typing import Any | |
| from langchain_core.messages import HumanMessage, SystemMessage | |
# Minimum entry counts — files with at least this many lines are skipped.
# build_persona iterates this mapping, so its keys also decide which files are
# produced and in what order; every key needs a matching entry in _GENERATORS.
_MIN_COUNTS = {
    "narratives.jsonl": 50,
    "social_posts.jsonl": 50,
    "aac_logs.jsonl": 100,
    "gesture_logs.jsonl": 50,
}
# Gesture labels listed in the gesture-log system prompt as the allowed values.
_GESTURES = ["thumbs_up", "thumbs_down", "open_palm", "pointing", "closed_fist", "wave", "neutral"]
# Affect labels listed in the gesture-log system prompt as the allowed values.
_AFFECTS = ["happy", "frustrated", "surprised", "neutral"]
# Prepended to every system prompt by _llm_invoke.
_NO_THINK = "/no_think\n"  # suppress Qwen3 CoT for deterministic structured output
# ── Helpers ───────────────────────────────────────────────────────────────────
def _count_valid_lines(path: Path) -> int:
    """Return how many usable JSONL entries ``path`` currently holds.

    A line is usable when it is non-blank after stripping and does not contain
    the literal text ``"placeholder"`` (double quotes included). A path that
    does not exist counts as zero entries.
    """
    if not path.exists():
        return 0
    with path.open(encoding="utf-8") as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return sum(
            1
            for entry in stripped_lines
            if entry and '"placeholder"' not in entry
        )
def _extract_json_list(raw: str) -> list[dict]:
    """Extract the JSON array of objects embedded in an LLM response.

    ``<think>``/``<thinking>`` blocks are removed first. The remaining text is
    then scanned for the first ``[`` at which a complete JSON array can be
    decoded, so markdown fences, preambles, and bracketed prose before or after
    the payload do not break parsing.

    Args:
        raw: The raw text returned by the model.

    Returns:
        The JSON objects (dicts) of the first decodable array that contains at
        least one object; non-object elements are dropped. If arrays were found
        but none held an object (for example ``[]``), an empty list.

    Raises:
        ValueError: If no decodable JSON array is present in the response.
    """
    # Drop reasoning blocks first so brackets inside them are never mistaken
    # for the payload. The backreference keeps opening/closing tags paired.
    text = re.sub(r"<(think|thinking)>.*?</\1>", "", raw, flags=re.DOTALL)

    decoder = json.JSONDecoder()
    saw_array = False
    pos = text.find("[")
    while pos != -1:
        try:
            # raw_decode parses exactly one JSON value starting at ``pos`` and
            # reports where it ended, ignoring whatever text follows it.
            value, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Not a JSON array (e.g. "[draft]"); try the next bracket.
            pos = text.find("[", pos + 1)
            continue
        objects = [item for item in value if isinstance(item, dict)]
        if objects:
            return objects
        # A valid array without objects (e.g. a "[1]" citation); keep looking
        # for the real payload after it.
        saw_array = True
        pos = text.find("[", end)

    if saw_array:
        return []
    raise ValueError(f"No JSON array found in response:\n{text[:400]}")
async def _llm_invoke(llm: Any, system: str, user: str) -> str:
    """Send one system + user exchange to ``llm`` and return the reply text.

    The system prompt is prefixed with ``_NO_THINK``. When the reply object has
    a ``content`` attribute that attribute is returned as-is; otherwise the
    reply is converted with ``str()``.
    """
    messages = [
        SystemMessage(content=_NO_THINK + system),
        HumanMessage(content=user),
    ]
    reply = await llm.ainvoke(messages)
    if hasattr(reply, "content"):
        return reply.content
    return str(reply)
def _append_jsonl(path: Path, records: list[dict]) -> None:
    """Append ``records`` to ``path`` as JSON Lines and report the count on stdout.

    Each record becomes one line, serialized with ``ensure_ascii=False`` and
    written as UTF-8. The file is opened in append mode, so existing content is
    kept and a missing file is created.
    """
    serialized = (json.dumps(item, ensure_ascii=False) + "\n" for item in records)
    with path.open("a", encoding="utf-8") as sink:
        sink.writelines(serialized)
    print(f" β appended {len(records)} records to {path.name}")
# ── Generator functions ───────────────────────────────────────────────────────
async def _generate_narratives(llm: Any, profile: dict, out_path: Path, needed: int) -> None:
    """Generate first-person autobiographical narrative paragraphs.

    Requests batches of at most 20 paragraphs until ``needed`` usable records
    have been collected, then appends them to ``out_path``. A record is usable
    when it is a JSON object carrying both ``text`` and ``topic``; a missing
    ``source`` is filled in as ``"generated"``.

    Args:
        llm: Chat model object passed through to ``_llm_invoke``.
        profile: Persona profile; ``profile_card`` and ``biography`` are read.
        out_path: JSONL file the records are appended to.
        needed: Number of records to produce.

    Raises:
        ValueError: If a response contains no parseable JSON array.
        RuntimeError: If three consecutive responses yield no usable record.
        Any exception raised by the model call is propagated. In every failure
        case the records collected so far are appended first, so a re-run only
        has to top the file up.
    """
    max_batch = 20
    max_empty_batches = 3  # consecutive useless replies tolerated before giving up

    profile_card = profile.get("profile_card", "")
    bio = json.dumps(profile.get("biography", {}), indent=2)
    topics = [
        "aac_journey", "family", "bills", "work", "daily_life",
        "cooking", "childhood", "leo", "disability_identity", "humor",
    ]
    system = (
        "You generate synthetic first-person narrative paragraphs for an AAC user persona. "
        "Each paragraph is 2-4 sentences, written in the user's authentic voice. "
        "Output a JSON array of objects with keys: text, topic. "
        "No preamble, no markdown, output only the JSON array."
    )

    def build_user_prompt(count: int) -> str:
        return (
            f"Persona: {profile_card}\n\nBiography: {bio}\n\n"
            f"Topics to vary across entries: {topics}\n\n"
            f"Generate {count} narrative paragraphs. "
            "Match the persona's tone exactly: witty, short, informal, self-deprecating. "
            "Avoid formal language. Add 'source': 'generated' to each object."
        )

    records: list[dict] = []
    empty_batches = 0
    try:
        while len(records) < needed:
            # Rebuild the prompt every round so the requested count always
            # matches what is still missing.
            count = min(needed - len(records), max_batch)
            raw = await _llm_invoke(llm, system, build_user_prompt(count))
            accepted = 0
            for r in _extract_json_list(raw):
                if isinstance(r, dict) and "text" in r and "topic" in r:
                    r.setdefault("source", "generated")
                    records.append(r)
                    accepted += 1
            empty_batches = 0 if accepted else empty_batches + 1
            if empty_batches >= max_empty_batches:
                # Without this bound a model that never returns a usable
                # record would be queried forever.
                raise RuntimeError(
                    f"{out_path.name}: no usable records in "
                    f"{max_empty_batches} consecutive responses"
                )
    except Exception:
        # Keep partial progress instead of discarding it with the error.
        if records:
            _append_jsonl(out_path, records[:needed])
        raise
    _append_jsonl(out_path, records[:needed])
async def _generate_social_posts(llm: Any, profile: dict, out_path: Path, needed: int) -> None:
    """Generate social media posts (Twitter/Facebook style).

    Requests batches of at most 25 posts until ``needed`` usable records have
    been collected, then appends them to ``out_path``. A record is usable when
    it is a JSON object carrying both ``text`` and ``topic``; a missing
    ``sentiment`` defaults to ``"neutral"`` and a missing ``source`` to
    ``"generated"``.

    Args:
        llm: Chat model object passed through to ``_llm_invoke``.
        profile: Persona profile; ``profile_card`` is read.
        out_path: JSONL file the records are appended to.
        needed: Number of records to produce.

    Raises:
        ValueError: If a response contains no parseable JSON array.
        RuntimeError: If three consecutive responses yield no usable record.
        Any exception raised by the model call is propagated. In every failure
        case the records collected so far are appended first, so a re-run only
        has to top the file up.
    """
    max_batch = 25
    max_empty_batches = 3  # consecutive useless replies tolerated before giving up

    profile_card = profile.get("profile_card", "")
    topics = [
        "bills", "cooking", "leo", "work", "daily_life",
        "aac", "sports", "weather", "humor", "family",
    ]
    system = (
        "You generate synthetic social media posts for an AAC user persona. "
        "Posts are 1-3 sentences, casual, Twitter/Facebook style. "
        "Output a JSON array of objects with keys: text, topic, sentiment. "
        "sentiment must be one of: positive, negative, neutral. "
        "No preamble, output only the JSON array."
    )

    def build_user_prompt(count: int) -> str:
        return (
            f"Persona: {profile_card}\n\nTopics: {topics}\n\n"
            f"Generate {count} social media posts. "
            "Capture the persona's dry humor, Bills fandom, and dad-life. "
            "Moderate emoji use. Add 'source': 'generated' to each object."
        )

    records: list[dict] = []
    empty_batches = 0
    try:
        while len(records) < needed:
            # Rebuild the prompt every round so the requested count always
            # matches what is still missing.
            count = min(needed - len(records), max_batch)
            raw = await _llm_invoke(llm, system, build_user_prompt(count))
            accepted = 0
            for r in _extract_json_list(raw):
                if isinstance(r, dict) and "text" in r and "topic" in r:
                    r.setdefault("sentiment", "neutral")
                    r.setdefault("source", "generated")
                    records.append(r)
                    accepted += 1
            empty_batches = 0 if accepted else empty_batches + 1
            if empty_batches >= max_empty_batches:
                # Without this bound a model that never returns a usable
                # record would be queried forever.
                raise RuntimeError(
                    f"{out_path.name}: no usable records in "
                    f"{max_empty_batches} consecutive responses"
                )
    except Exception:
        # Keep partial progress instead of discarding it with the error.
        if records:
            _append_jsonl(out_path, records[:needed])
        raise
    _append_jsonl(out_path, records[:needed])
async def _generate_aac_logs(llm: Any, profile: dict, out_path: Path, needed: int) -> None:
    """Generate multi-turn AAC conversation logs.

    Each record is one conversation turn. Batches of roughly 60 turns at most
    are requested until ``needed`` usable turns have been collected, then they
    are appended to ``out_path``. A turn is usable when it is a JSON object
    carrying both ``text`` and ``role``; missing ``scenario``, ``topic`` and
    ``sentiment`` fall back to ``"daily_checkin"``, ``"daily_life"`` and
    ``"neutral"``. The list is cut at ``needed``, which may end the last
    scenario mid-conversation.

    Args:
        llm: Chat model object passed through to ``_llm_invoke``.
        profile: Persona profile; ``profile_card`` is read.
        out_path: JSONL file the turns are appended to.
        needed: Number of turns to produce.

    Raises:
        ValueError: If a response contains no parseable JSON array.
        RuntimeError: If three consecutive responses yield no usable turn.
        Any exception raised by the model call is propagated. In every failure
        case the turns collected so far are appended first, so a re-run only
        has to top the file up.
    """
    max_batch = 60
    max_empty_batches = 3  # consecutive useless replies tolerated before giving up

    profile_card = profile.get("profile_card", "")
    scenarios = [
        "daily_checkin", "lunch_planning", "football_discussion", "family_update",
        "work_update", "medical_appointment", "friend_visit", "evening_wind_down",
        "weather_chat", "game_night",
    ]
    system = (
        "You generate synthetic multi-turn AAC conversations between a communication partner and an AAC user. "
        "Output a JSON array of turn objects with keys: turn (int), role ('partner' or 'user'), "
        "text, scenario, topic, sentiment. "
        "The user's responses are short, punchy, and match the persona's style. "
        "Each scenario block should be 6-10 turns. "
        "No preamble, output only the JSON array."
    )

    def build_user_prompt(count: int) -> str:
        return (
            f"Persona: {profile_card}\n\nScenarios to cover: {scenarios}\n\n"
            f"Generate approximately {count} turns across 2-3 different scenarios. "
            "Number turns sequentially within each scenario starting at 1. "
            "User responses must be brief (1-2 sentences max), witty, and authentic."
        )

    records: list[dict] = []
    empty_batches = 0
    try:
        while len(records) < needed:
            # Rebuild the prompt every round so the requested count always
            # matches what is still missing.
            count = min(needed - len(records), max_batch)
            raw = await _llm_invoke(llm, system, build_user_prompt(count))
            accepted = 0
            for r in _extract_json_list(raw):
                if isinstance(r, dict) and "text" in r and "role" in r:
                    r.setdefault("scenario", "daily_checkin")
                    r.setdefault("topic", "daily_life")
                    r.setdefault("sentiment", "neutral")
                    records.append(r)
                    accepted += 1
            empty_batches = 0 if accepted else empty_batches + 1
            if empty_batches >= max_empty_batches:
                # Without this bound a model that never returns a usable
                # turn would be queried forever.
                raise RuntimeError(
                    f"{out_path.name}: no usable records in "
                    f"{max_empty_batches} consecutive responses"
                )
    except Exception:
        # Keep partial progress instead of discarding it with the error.
        if records:
            _append_jsonl(out_path, records[:needed])
        raise
    _append_jsonl(out_path, records[:needed])
async def _generate_gesture_logs(llm: Any, profile: dict, out_path: Path, needed: int) -> None:
    """Generate gesture-annotated interaction records.

    Requests batches of at most 50 records until ``needed`` usable records have
    been collected, then appends them to ``out_path``. A record is usable when
    it is a JSON object carrying both ``partner_query`` and
    ``selected_response``; the remaining keys get neutral defaults when absent.
    The prompt lists ``_GESTURES`` and ``_AFFECTS`` as the allowed values, but
    the returned values are not checked against them here.

    Args:
        llm: Chat model object passed through to ``_llm_invoke``.
        profile: Persona profile; ``profile_card`` and
            ``aac_shortcuts.letter_shortcuts`` are read.
        out_path: JSONL file the records are appended to.
        needed: Number of records to produce.

    Raises:
        ValueError: If a response contains no parseable JSON array.
        RuntimeError: If three consecutive responses yield no usable record.
        Any exception raised by the model call is propagated. In every failure
        case the records collected so far are appended first, so a re-run only
        has to top the file up.
    """
    max_batch = 50
    max_empty_batches = 3  # consecutive useless replies tolerated before giving up

    profile_card = profile.get("profile_card", "")
    shortcuts = profile.get("aac_shortcuts", {}).get("letter_shortcuts", {})
    system = (
        "You generate synthetic gesture-annotated AAC interaction logs. "
        "Each record captures a moment where the AAC user used a gesture or air-signed a letter "
        "alongside a partner query and their selected response. "
        "Output a JSON array of objects with keys: "
        "scenario (string), partner_query (string), gesture (string), "
        "affect (string), air_sign_letter (string or null), "
        "selected_response (string), topic (string), sentiment (string). "
        f"gesture must be one of: {_GESTURES}. "
        f"affect must be one of: {_AFFECTS}. "
        "air_sign_letter is a single uppercase letter or null. "
        "No preamble, output only the JSON array."
    )

    def build_user_prompt(count: int) -> str:
        return (
            f"Persona: {profile_card}\n\n"
            f"Letter shortcuts: {json.dumps(shortcuts)}\n\n"
            f"Generate {count} gesture-annotated interaction records. "
            "Vary gestures, affects, and scenarios. "
            "When air_sign_letter is set, the selected_response should reference "
            "one of that letter's shortcut words. "
            "Keep responses short and in the persona's voice."
        )

    records: list[dict] = []
    empty_batches = 0
    try:
        while len(records) < needed:
            # Rebuild the prompt every round so the requested count always
            # matches what is still missing.
            count = min(needed - len(records), max_batch)
            raw = await _llm_invoke(llm, system, build_user_prompt(count))
            accepted = 0
            for r in _extract_json_list(raw):
                if (
                    isinstance(r, dict)
                    and "partner_query" in r
                    and "selected_response" in r
                ):
                    r.setdefault("scenario", "daily_life")
                    r.setdefault("gesture", "neutral")
                    r.setdefault("affect", "neutral")
                    r.setdefault("air_sign_letter", None)
                    r.setdefault("topic", "daily_life")
                    r.setdefault("sentiment", "neutral")
                    records.append(r)
                    accepted += 1
            empty_batches = 0 if accepted else empty_batches + 1
            if empty_batches >= max_empty_batches:
                # Without this bound a model that never returns a usable
                # record would be queried forever.
                raise RuntimeError(
                    f"{out_path.name}: no usable records in "
                    f"{max_empty_batches} consecutive responses"
                )
    except Exception:
        # Keep partial progress instead of discarding it with the error.
        if records:
            _append_jsonl(out_path, records[:needed])
        raise
    _append_jsonl(out_path, records[:needed])
# ── Main ──────────────────────────────────────────────────────────────────────
# Dispatch table: output filename -> coroutine that generates that file.
# build_persona looks up each key of _MIN_COUNTS here, so the two mappings
# must share the same set of filenames. Every generator is called as
# (llm, profile, out_path, needed).
_GENERATORS = {
    "narratives.jsonl": _generate_narratives,
    "social_posts.jsonl": _generate_social_posts,
    "aac_logs.jsonl": _generate_aac_logs,
    "gesture_logs.jsonl": _generate_gesture_logs,
}
async def build_persona(
    persona_id: str,
    config_path: str,
    output_dir: str,
    force: bool = False,
) -> None:
    """Generate (or top up) every corpus file for one persona.

    Creates ``<output_dir>/<persona_id>/``, writes ``letter_shortcuts.json``
    there if it is missing, then walks ``_MIN_COUNTS``. A file that already
    holds the minimum number of valid lines is skipped unless ``force`` is set.
    Otherwise the matching generator from ``_GENERATORS`` is asked for the
    missing entries. With ``force`` the existing file is deleted first and the
    full minimum is regenerated.

    A failure in one generator is reported on stderr and does not stop the
    remaining files from being processed.

    Args:
        persona_id: Persona identifier; also the name of the output subfolder.
        config_path: Settings path handed to ``ModelRegistry`` and
            ``ProfileLoader``.
        output_dir: Parent directory for persona output folders.
        force: Regenerate every file regardless of existing content.
    """
    # Project imports stay function-local, as in the original layout.
    from memorybridge.core.models import ModelRegistry
    from memorybridge.memory.profile_loader import ProfileLoader

    registry = ModelRegistry(config_path)
    loader = ProfileLoader(config_path)
    profile = loader.load(persona_id)

    out_path = Path(output_dir) / persona_id
    out_path.mkdir(parents=True, exist_ok=True)

    provider = registry.get_config("models", "data_generation", "provider")
    model = registry.get_config("models", "data_generation", "model")
    print(f"Building persona data for: {persona_id}")
    print(f"Output directory: {out_path}")
    print(f"Model: {provider} / {model}")
    llm = registry.get_llm("data_generation")

    # Ensure letter_shortcuts.json is present
    shortcuts_path = out_path / "letter_shortcuts.json"
    if not shortcuts_path.exists():
        shortcuts = profile.get("aac_shortcuts", {}).get("letter_shortcuts", {})
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
        # pinned to UTF-8 rather than left to the platform's locale default.
        shortcuts_path.write_text(
            json.dumps(shortcuts, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        print(" Written letter_shortcuts.json")

    for filename, min_count in _MIN_COUNTS.items():
        filepath = out_path / filename
        existing = _count_valid_lines(filepath)
        if not force and existing >= min_count:
            print(f" Skipping {filename} β {existing}/{min_count} entries already present")
            continue
        needed = min_count - existing if not force else min_count
        print(f" Generating {filename} β need {needed} more entries ({existing} existing)...")
        if force and filepath.exists():
            # Generators append, so a forced rebuild must start from no file.
            filepath.unlink()
        try:
            await _GENERATORS[filename](llm, profile, filepath, needed)
        except Exception as exc:
            # Best effort: report and move on to the next file.
            print(f" ERROR generating {filename}: {exc}", file=sys.stderr)
    print("\nDone.")
def parse_args() -> argparse.Namespace:
    """Parse the command line (``sys.argv``) for the persona build script.

    Recognized options: ``--persona`` (required), ``--config``,
    ``--output-dir`` and the ``--force`` flag.
    """
    cli = argparse.ArgumentParser(
        description="Generate synthetic persona corpus data using an LLM."
    )
    # (flag, keyword options) pairs, registered in this order.
    option_specs = (
        ("--persona", {
            "required": True,
            "help": "Persona ID (e.g., alex_rivera). Must have a matching config/personas/{id}.json.",
        }),
        ("--config", {
            "default": "memorybridge/config/settings.yaml",
            "help": "Path to settings.yaml.",
        }),
        ("--output-dir", {
            "default": "data/personas",
            "help": "Output directory for generated data.",
        }),
        ("--force", {
            "action": "store_true",
            "help": "Regenerate all files even if minimum counts are already met.",
        }),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def main() -> None:
    """Script entry point: parse the command line and run the persona build."""
    cli_args = parse_args()
    build_job = build_persona(
        cli_args.persona,
        cli_args.config,
        cli_args.output_dir,
        cli_args.force,
    )
    asyncio.run(build_job)


if __name__ == "__main__":
    main()