Spaces:

kimandrew927
/

memorybridge

Build error

App Files Files Community

memorybridge / scripts /build_persona.py

kimandrew927

Initial Space deployment

1004967 about 1 month ago

raw

history blame contribute delete

13.9 kB

	"""
	Build Persona Script.

	Generates synthetic persona data (narratives, social posts, AAC logs, gesture logs)
	for a given persona using the configured data_generation model (Qwen3-32B via Groq).

	Skips any file that already meets the minimum entry count — safe to re-run.
	Use --force to regenerate all files regardless.

	Usage:
	    python scripts/build_persona.py --persona alex_rivera
	    python scripts/build_persona.py --persona alex_rivera --force
	    python scripts/build_persona.py --persona persona_2 --output-dir data/personas
	"""

	from __future__ import annotations

	import argparse
	import asyncio
	import json
	import re
	import sys
	from pathlib import Path
	from typing import Any

	from langchain_core.messages import HumanMessage, SystemMessage

	# Per-file entry thresholds: a corpus file that already holds at least this
	# many lines is skipped.
	_MIN_COUNTS = {
	    "narratives.jsonl": 50,
	    "social_posts.jsonl": 50,
	    "aac_logs.jsonl": 100,
	    "gesture_logs.jsonl": 50,
	}

	# Closed vocabularies quoted in the gesture-log system prompt.
	_GESTURES = [
	    "thumbs_up",
	    "thumbs_down",
	    "open_palm",
	    "pointing",
	    "closed_fist",
	    "wave",
	    "neutral",
	]
	_AFFECTS = [
	    "happy",
	    "frustrated",
	    "surprised",
	    "neutral",
	]

	# Prefix that suppresses Qwen3 CoT for deterministic structured output.
	_NO_THINK = "/no_think\n"


	# ── Helpers ───────────────────────────────────────────────────────────────────

	def _count_valid_lines(path: Path) -> int:
	    """Return how many lines of *path* are neither blank nor placeholders.

	    A missing file counts as zero. A line is ignored when, once surrounding
	    whitespace is stripped, it is empty or contains the literal text
	    ``"placeholder"`` (double quotes included). Lines are not parsed as JSON.
	    """
	    if not path.exists():
	        return 0

	    with path.open(encoding="utf-8") as handle:
	        stripped = (raw_line.strip() for raw_line in handle)
	        return sum(
	            1 for text in stripped if text and '"placeholder"' not in text
	        )


	def _extract_json_list(raw: str) -> list[dict]:
	    """Extract the first JSON array of objects from raw LLM output.

	    ``<think>`` / ``<thinking>`` blocks are removed first, because they may
	    themselves contain brackets. The remaining text is then scanned for the
	    first ``[`` at which a complete JSON array can be decoded, so surrounding
	    material (markdown fences, preamble, trailing remarks that contain
	    brackets) does not break parsing.

	    Args:
	        raw: The model response text.

	    Returns:
	        The dict elements of the first decodable array that contains at least
	        one dict. Non-dict elements are dropped. If only arrays without any
	        dict are present (for example ``[]``), an empty list is returned.

	    Raises:
	        ValueError: If no JSON array can be decoded from the response.
	    """
	    text = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL)
	    text = re.sub(r"<thinking>.*?</thinking>", "", text, flags=re.DOTALL)

	    decoder = json.JSONDecoder()
	    saw_array = False
	    pos = text.find("[")
	    while pos != -1:
	        try:
	            # raw_decode stops at the end of the value, so text after the
	            # array is ignored rather than fed to the parser.
	            parsed, end = decoder.raw_decode(text, pos)
	        except json.JSONDecodeError:
	            pos = text.find("[", pos + 1)
	            continue
	        objects = [item for item in parsed if isinstance(item, dict)]
	        if objects:
	            return objects
	        # An array without objects (e.g. "[5]" in a preamble): remember it,
	        # but keep looking for the real payload after it.
	        saw_array = True
	        pos = text.find("[", end)

	    if saw_array:
	        return []
	    raise ValueError(f"No JSON array found in response:\n{text.strip()[:400]}")


	async def _llm_invoke(llm: Any, system: str, user: str) -> str:
	    """Send one system + user message pair to *llm* and return the reply.

	    The system prompt is prefixed with ``_NO_THINK``. The reply's ``content``
	    attribute is returned when the response object has one; otherwise the
	    response is converted with ``str()``.
	    """
	    messages = [
	        SystemMessage(content=_NO_THINK + system),
	        HumanMessage(content=user),
	    ]
	    reply = await llm.ainvoke(messages)
	    if hasattr(reply, "content"):
	        return reply.content
	    return str(reply)


	def _append_jsonl(path: Path, records: list[dict]) -> None:
	    """Append *records* to *path* as JSON Lines and print how many were written.

	    The file is opened in append mode with UTF-8 encoding; each record is
	    serialized with ``ensure_ascii=False`` and terminated by a newline.
	    """
	    serialized = (
	        json.dumps(record, ensure_ascii=False) + "\n" for record in records
	    )
	    with path.open("a", encoding="utf-8") as sink:
	        sink.writelines(serialized)
	    print(f" → appended {len(records)} records to {path.name}")


	# ── Generator functions ───────────────────────────────────────────────────────

	async def _generate_narratives(llm: Any, profile: dict, out_path: Path, needed: int) -> None:
	    """Generate first-person autobiographical narrative paragraphs.

	    Batches of at most 20 paragraphs are requested from the model until
	    ``needed`` records carrying both a ``text`` and a ``topic`` key have been
	    collected; those records are then appended to ``out_path``.

	    Args:
	        llm: Chat model handed to ``_llm_invoke``.
	        profile: Persona profile; ``profile_card`` and ``biography`` are read.
	        out_path: JSONL file the records are appended to.
	        needed: Number of records to produce.

	    Raises:
	        RuntimeError: If three consecutive batches contain no usable record.
	        Exception: Whatever ``_llm_invoke`` or ``_extract_json_list`` raise.
	            Records collected before the failure are still appended.
	    """
	    profile_card = profile.get("profile_card", "")
	    bio = json.dumps(profile.get("biography", {}), indent=2)
	    topics = [
	        "aac_journey", "family", "bills", "work", "daily_life",
	        "cooking", "childhood", "leo", "disability_identity", "humor",
	    ]
	    max_batch = 20
	    max_empty_batches = 3  # stop polling a model that never yields usable output

	    system = (
	        "You generate synthetic first-person narrative paragraphs for an AAC user persona. "
	        "Each paragraph is 2-4 sentences, written in the user's authentic voice. "
	        "Output a JSON array of objects with keys: text, topic. "
	        "No preamble, no markdown, output only the JSON array."
	    )

	    def build_user_prompt(count: int) -> str:
	        # Built afresh for every request: patching the previous prompt with
	        # str.replace left a stale count once the number had changed twice.
	        return (
	            f"Persona: {profile_card}\n\nBiography: {bio}\n\n"
	            f"Topics to vary across entries: {topics}\n\n"
	            f"Generate {count} narrative paragraphs. "
	            "Match the persona's tone exactly: witty, short, informal, self-deprecating. "
	            "Avoid formal language. Add 'source': 'generated' to each object."
	        )

	    records: list[dict] = []
	    empty_batches = 0
	    try:
	        while len(records) < needed:
	            count = min(needed - len(records), max_batch)
	            raw = await _llm_invoke(llm, system, build_user_prompt(count))
	            accepted = 0
	            for r in _extract_json_list(raw):
	                # Guard against non-object array elements before key checks.
	                if isinstance(r, dict) and "text" in r and "topic" in r:
	                    r.setdefault("source", "generated")
	                    records.append(r)
	                    accepted += 1
	            if accepted:
	                empty_batches = 0
	                continue
	            empty_batches += 1
	            if empty_batches >= max_empty_batches:
	                raise RuntimeError(
	                    f"no usable narrative records in {max_empty_batches} consecutive "
	                    f"batches ({len(records)}/{needed} collected)"
	                )
	    except Exception:
	        # Keep what was already generated instead of discarding it.
	        if records:
	            _append_jsonl(out_path, records[:needed])
	        raise

	    _append_jsonl(out_path, records[:needed])


	async def _generate_social_posts(llm: Any, profile: dict, out_path: Path, needed: int) -> None:
	    """Generate social media posts (Twitter/Facebook style).

	    Batches of at most 25 posts are requested from the model until ``needed``
	    records carrying both a ``text`` and a ``topic`` key have been collected;
	    those records are then appended to ``out_path``. Missing ``sentiment``
	    and ``source`` keys are filled with ``"neutral"`` and ``"generated"``.

	    Args:
	        llm: Chat model handed to ``_llm_invoke``.
	        profile: Persona profile; ``profile_card`` is read.
	        out_path: JSONL file the records are appended to.
	        needed: Number of records to produce.

	    Raises:
	        RuntimeError: If three consecutive batches contain no usable record.
	        Exception: Whatever ``_llm_invoke`` or ``_extract_json_list`` raise.
	            Records collected before the failure are still appended.
	    """
	    profile_card = profile.get("profile_card", "")
	    topics = [
	        "bills", "cooking", "leo", "work", "daily_life",
	        "aac", "sports", "weather", "humor", "family",
	    ]
	    max_batch = 25
	    max_empty_batches = 3  # stop polling a model that never yields usable output

	    system = (
	        "You generate synthetic social media posts for an AAC user persona. "
	        "Posts are 1-3 sentences, casual, Twitter/Facebook style. "
	        "Output a JSON array of objects with keys: text, topic, sentiment. "
	        "sentiment must be one of: positive, negative, neutral. "
	        "No preamble, output only the JSON array."
	    )

	    def build_user_prompt(count: int) -> str:
	        # Built afresh for every request: patching the previous prompt with
	        # str.replace left a stale count once the number had changed twice.
	        return (
	            f"Persona: {profile_card}\n\nTopics: {topics}\n\n"
	            f"Generate {count} social media posts. "
	            "Capture the persona's dry humor, Bills fandom, and dad-life. "
	            "Moderate emoji use. Add 'source': 'generated' to each object."
	        )

	    records: list[dict] = []
	    empty_batches = 0
	    try:
	        while len(records) < needed:
	            count = min(needed - len(records), max_batch)
	            raw = await _llm_invoke(llm, system, build_user_prompt(count))
	            accepted = 0
	            for r in _extract_json_list(raw):
	                # Guard against non-object array elements before key checks.
	                if isinstance(r, dict) and "text" in r and "topic" in r:
	                    r.setdefault("sentiment", "neutral")
	                    r.setdefault("source", "generated")
	                    records.append(r)
	                    accepted += 1
	            if accepted:
	                empty_batches = 0
	                continue
	            empty_batches += 1
	            if empty_batches >= max_empty_batches:
	                raise RuntimeError(
	                    f"no usable social post records in {max_empty_batches} consecutive "
	                    f"batches ({len(records)}/{needed} collected)"
	                )
	    except Exception:
	        # Keep what was already generated instead of discarding it.
	        if records:
	            _append_jsonl(out_path, records[:needed])
	        raise

	    _append_jsonl(out_path, records[:needed])


	async def _generate_aac_logs(llm: Any, profile: dict, out_path: Path, needed: int) -> None:
	    """Generate multi-turn AAC conversation logs.

	    Batches of roughly 60 turns at most are requested from the model until
	    ``needed`` turn records carrying both a ``text`` and a ``role`` key have
	    been collected; those records are then appended to ``out_path``. Missing
	    ``scenario``, ``topic`` and ``sentiment`` keys receive default values.
	    The result is cut at exactly ``needed`` turns, which may end in the
	    middle of a scenario.

	    Args:
	        llm: Chat model handed to ``_llm_invoke``.
	        profile: Persona profile; ``profile_card`` is read.
	        out_path: JSONL file the records are appended to.
	        needed: Number of turn records to produce.

	    Raises:
	        RuntimeError: If three consecutive batches contain no usable record.
	        Exception: Whatever ``_llm_invoke`` or ``_extract_json_list`` raise.
	            Records collected before the failure are still appended.
	    """
	    profile_card = profile.get("profile_card", "")
	    scenarios = [
	        "daily_checkin", "lunch_planning", "football_discussion", "family_update",
	        "work_update", "medical_appointment", "friend_visit", "evening_wind_down",
	        "weather_chat", "game_night",
	    ]
	    max_batch = 60
	    max_empty_batches = 3  # stop polling a model that never yields usable output

	    system = (
	        "You generate synthetic multi-turn AAC conversations between a communication partner and an AAC user. "
	        "Output a JSON array of turn objects with keys: turn (int), role ('partner' or 'user'), "
	        "text, scenario, topic, sentiment. "
	        "The user's responses are short, punchy, and match the persona's style. "
	        "Each scenario block should be 6-10 turns. "
	        "No preamble, output only the JSON array."
	    )

	    def build_user_prompt(count: int) -> str:
	        # Built afresh for every request: patching the previous prompt with
	        # str.replace left a stale count once the number had changed twice.
	        return (
	            f"Persona: {profile_card}\n\nScenarios to cover: {scenarios}\n\n"
	            f"Generate approximately {count} turns across 2-3 different scenarios. "
	            "Number turns sequentially within each scenario starting at 1. "
	            "User responses must be brief (1-2 sentences max), witty, and authentic."
	        )

	    records: list[dict] = []
	    empty_batches = 0
	    try:
	        while len(records) < needed:
	            count = min(needed - len(records), max_batch)
	            raw = await _llm_invoke(llm, system, build_user_prompt(count))
	            accepted = 0
	            for r in _extract_json_list(raw):
	                # Guard against non-object array elements before key checks.
	                if isinstance(r, dict) and "text" in r and "role" in r:
	                    r.setdefault("scenario", "daily_checkin")
	                    r.setdefault("topic", "daily_life")
	                    r.setdefault("sentiment", "neutral")
	                    records.append(r)
	                    accepted += 1
	            if accepted:
	                empty_batches = 0
	                continue
	            empty_batches += 1
	            if empty_batches >= max_empty_batches:
	                raise RuntimeError(
	                    f"no usable AAC turn records in {max_empty_batches} consecutive "
	                    f"batches ({len(records)}/{needed} collected)"
	                )
	    except Exception:
	        # Keep what was already generated instead of discarding it.
	        if records:
	            _append_jsonl(out_path, records[:needed])
	        raise

	    _append_jsonl(out_path, records[:needed])


	async def _generate_gesture_logs(llm: Any, profile: dict, out_path: Path, needed: int) -> None:
	    """Generate gesture-annotated interaction records.

	    Batches of at most 50 records are requested from the model until
	    ``needed`` records carrying both a ``partner_query`` and a
	    ``selected_response`` key have been collected; those records are then
	    appended to ``out_path``. Missing optional keys receive default values;
	    the values the model supplies for ``gesture`` and ``affect`` are not
	    checked against the allowed lists.

	    Args:
	        llm: Chat model handed to ``_llm_invoke``.
	        profile: Persona profile; ``profile_card`` and
	            ``aac_shortcuts.letter_shortcuts`` are read.
	        out_path: JSONL file the records are appended to.
	        needed: Number of records to produce.

	    Raises:
	        RuntimeError: If three consecutive batches contain no usable record.
	        Exception: Whatever ``_llm_invoke`` or ``_extract_json_list`` raise.
	            Records collected before the failure are still appended.
	    """
	    profile_card = profile.get("profile_card", "")
	    shortcuts = profile.get("aac_shortcuts", {}).get("letter_shortcuts", {})
	    max_batch = 50
	    max_empty_batches = 3  # stop polling a model that never yields usable output

	    system = (
	        "You generate synthetic gesture-annotated AAC interaction logs. "
	        "Each record captures a moment where the AAC user used a gesture or air-signed a letter "
	        "alongside a partner query and their selected response. "
	        "Output a JSON array of objects with keys: "
	        "scenario (string), partner_query (string), gesture (string), "
	        "affect (string), air_sign_letter (string or null), "
	        "selected_response (string), topic (string), sentiment (string). "
	        f"gesture must be one of: {_GESTURES}. "
	        f"affect must be one of: {_AFFECTS}. "
	        "air_sign_letter is a single uppercase letter or null. "
	        "No preamble, output only the JSON array."
	    )

	    def build_user_prompt(count: int) -> str:
	        # Built afresh for every request: patching the previous prompt with
	        # str.replace left a stale count once the number had changed twice.
	        return (
	            f"Persona: {profile_card}\n\n"
	            f"Letter shortcuts: {json.dumps(shortcuts)}\n\n"
	            f"Generate {count} gesture-annotated interaction records. "
	            "Vary gestures, affects, and scenarios. "
	            "When air_sign_letter is set, the selected_response should reference "
	            "one of that letter's shortcut words. "
	            "Keep responses short and in the persona's voice."
	        )

	    records: list[dict] = []
	    empty_batches = 0
	    try:
	        while len(records) < needed:
	            count = min(needed - len(records), max_batch)
	            raw = await _llm_invoke(llm, system, build_user_prompt(count))
	            accepted = 0
	            for r in _extract_json_list(raw):
	                # Guard against non-object array elements before key checks.
	                if (
	                    isinstance(r, dict)
	                    and "partner_query" in r
	                    and "selected_response" in r
	                ):
	                    r.setdefault("scenario", "daily_life")
	                    r.setdefault("gesture", "neutral")
	                    r.setdefault("affect", "neutral")
	                    r.setdefault("air_sign_letter", None)
	                    r.setdefault("topic", "daily_life")
	                    r.setdefault("sentiment", "neutral")
	                    records.append(r)
	                    accepted += 1
	            if accepted:
	                empty_batches = 0
	                continue
	            empty_batches += 1
	            if empty_batches >= max_empty_batches:
	                raise RuntimeError(
	                    f"no usable gesture records in {max_empty_batches} consecutive "
	                    f"batches ({len(records)}/{needed} collected)"
	                )
	    except Exception:
	        # Keep what was already generated instead of discarding it.
	        if records:
	            _append_jsonl(out_path, records[:needed])
	        raise

	    _append_jsonl(out_path, records[:needed])


	# ── Main ──────────────────────────────────────────────────────────────────────

	# Dispatch table: output file name -> coroutine function that fills it.
	# Each entry is called as (llm, profile, out_path, needed).
	_GENERATORS = {
	    "narratives.jsonl": _generate_narratives,
	    "social_posts.jsonl": _generate_social_posts,
	    "aac_logs.jsonl": _generate_aac_logs,
	    "gesture_logs.jsonl": _generate_gesture_logs,
	}


	async def build_persona(
	    persona_id: str,
	    config_path: str,
	    output_dir: str,
	    force: bool = False,
	) -> None:
	    """Generate every corpus file listed in ``_MIN_COUNTS`` for one persona.

	    Loads the persona profile, makes sure ``<output_dir>/<persona_id>`` and
	    its ``letter_shortcuts.json`` exist, then runs the matching generator
	    from ``_GENERATORS`` for each file that is still short of its minimum
	    count. A failing generator is reported on stderr and the remaining
	    files are still processed.

	    Args:
	        persona_id: Persona identifier; also the output sub-directory name.
	        config_path: Settings path given to ``ModelRegistry`` and
	            ``ProfileLoader``.
	        output_dir: Parent directory for persona data.
	        force: When true, existing corpus files are deleted and regenerated
	            in full instead of being topped up.
	    """
	    # Project imports stay local to the function, as in the original script.
	    from memorybridge.core.models import ModelRegistry
	    from memorybridge.memory.profile_loader import ProfileLoader

	    registry = ModelRegistry(config_path)
	    loader = ProfileLoader(config_path)
	    profile = loader.load(persona_id)

	    out_path = Path(output_dir) / persona_id
	    out_path.mkdir(parents=True, exist_ok=True)

	    provider = registry.get_config("models", "data_generation", "provider")
	    model = registry.get_config("models", "data_generation", "model")
	    print(f"Building persona data for: {persona_id}")
	    print(f"Output directory: {out_path}")
	    print(f"Model: {provider} / {model}")

	    llm = registry.get_llm("data_generation")

	    # Ensure letter_shortcuts.json is present
	    shortcuts_path = out_path / "letter_shortcuts.json"
	    if not shortcuts_path.exists():
	        shortcuts = profile.get("aac_shortcuts", {}).get("letter_shortcuts", {})
	        # ensure_ascii=False can emit non-ASCII text, so the encoding must be
	        # explicit rather than whatever the platform locale defaults to.
	        shortcuts_path.write_text(
	            json.dumps(shortcuts, indent=2, ensure_ascii=False),
	            encoding="utf-8",
	        )
	        print(" Written letter_shortcuts.json")

	    for filename, min_count in _MIN_COUNTS.items():
	        filepath = out_path / filename

	        # With --force the old file is discarded first, so the count and the
	        # progress message below describe what is actually on disk.
	        if force and filepath.exists():
	            filepath.unlink()
	        existing = _count_valid_lines(filepath)

	        if not force and existing >= min_count:
	            print(f" Skipping {filename} — {existing}/{min_count} entries already present")
	            continue

	        needed = min_count if force else min_count - existing
	        print(f" Generating {filename} — need {needed} more entries ({existing} existing)...")

	        try:
	            await _GENERATORS[filename](llm, profile, filepath, needed)
	        except Exception as exc:
	            # Best effort per file: report and move on to the next one.
	            print(f" ERROR generating {filename}: {exc}", file=sys.stderr)

	    print("\nDone.")


	def parse_args() -> argparse.Namespace:
	    """Parse the script's command-line options from ``sys.argv``.

	    Recognised options: ``--persona`` (required), ``--config``,
	    ``--output-dir`` and the ``--force`` flag.
	    """
	    cli = argparse.ArgumentParser(
	        description="Generate synthetic persona corpus data using an LLM."
	    )
	    option_table = (
	        ("--persona", {
	            "required": True,
	            "help": "Persona ID (e.g., alex_rivera). Must have a matching config/personas/{id}.json.",
	        }),
	        ("--config", {
	            "default": "memorybridge/config/settings.yaml",
	            "help": "Path to settings.yaml.",
	        }),
	        ("--output-dir", {
	            "default": "data/personas",
	            "help": "Output directory for generated data.",
	        }),
	        ("--force", {
	            "action": "store_true",
	            "help": "Regenerate all files even if minimum counts are already met.",
	        }),
	    )
	    for flag, settings in option_table:
	        cli.add_argument(flag, **settings)
	    return cli.parse_args()


	def main() -> None:
	    """Script entry point: parse the CLI options and run the persona build."""
	    cli_args = parse_args()
	    job = build_persona(
	        cli_args.persona,
	        cli_args.config,
	        cli_args.output_dir,
	        cli_args.force,
	    )
	    asyncio.run(job)


	if __name__ == "__main__":
	    main()