"""Export a small, public-safe set of WitGym traces (Sharing is Caring). This writes deterministic-ish JSONL traces for a fixed set of canonical prompts. It intentionally omits any raw Office transcript text from retrieved scenes. """ from __future__ import annotations import json import os import time import hashlib import sys from pathlib import Path CANONICAL_SINGLE_TURN = [ "I just got promoted to manager and I have no idea what I'm doing.", "My coworker keeps stealing my lunch from the fridge.", "My boss says he trusts me, but he rewrites every message I send.", ] def _scene_id(setup: str, response: str) -> str: h = hashlib.sha256((setup + "\n" + response).encode("utf-8")).hexdigest() return h[:12] def _safe_scene(scene) -> dict: # Do not export setup/response verbatim; this keeps traces inspectable without # publishing transcript text. return { "scene_id": _scene_id(scene.setup, scene.response), "show": scene.show, "character": scene.character, "archetype": scene.archetype.value, "tension_type": scene.tension_type.value, "violation_distance": scene.violation_distance.value, } def _pipeline_logs(result) -> list[dict]: meta = result.metadata scenes = result.retrieved_scenes candidates = result.candidates return [ { "step": "metadata", "status": "ok", "detail": f"twist={meta.twist_potential} archetype={meta.archetype.value}", }, { "step": "retrieval", "status": "ok", "detail": ", ".join(f"{s.character}:{s.archetype.value}" for s in scenes) or "no precedent scenes", }, { "step": "candidate_generation", "status": "ok", "detail": ", ".join(f"{c.persona}:{len(c.text.split())}w" for c in candidates) or "no candidates", }, {"step": "ranking", "status": "ok", "detail": result.winning_persona or "none"}, {"step": "compression", "status": "ok", "detail": "selected line finalized"}, ] def _run_single(engine, user_input: str) -> dict: t0 = time.time() result = engine.respond(user_input) dt = time.time() - t0 meta = result.metadata return { "kind": "single_turn", "input": user_input, "route": result.route, "model_id": os.getenv("LLM_MODEL_ID", "Qwen/Qwen3.5-27B"), "llm_backend": os.getenv("LLM_BACKEND", "hf_api"), "latency_s": round(dt, 2), "metadata": { "surface": meta.surface, "subtext": meta.subtext, "behavioral_observation": getattr(meta, "behavioral_observation", None), "archetype": meta.archetype.value, "archetype_confidence": meta.archetype_confidence, "tension_type": meta.tension_type.value, "power_dynamic": meta.power_dynamic, "speaker_strategy": meta.speaker_strategy, "obvious_response": meta.obvious_response, "violation_distance": meta.violation_distance.value, "twist_potential": meta.twist_potential, "connector": meta.connector, }, "retrieved_scenes": [_safe_scene(s) for s in result.retrieved_scenes], "candidates": [ {"persona": c.persona, "word_count": len(c.text.split())} for c in result.candidates ], "winning_persona": result.winning_persona, "selected": result.selected, "logs": _pipeline_logs(result), } def main() -> int: # Prefer HF API for reproducible “no local weights” runs. os.environ.setdefault("LLM_BACKEND", "hf_api") # Allow running without installing the package (pip/uv editable install). repo_root = Path(__file__).resolve().parents[1] if str(repo_root) not in sys.path: sys.path.insert(0, str(repo_root)) from witgym.engine import WitGymEngine out_path = Path("data/public_traces.jsonl") out_path.parent.mkdir(parents=True, exist_ok=True) engine = WitGymEngine(index_path="data/index.npz") rows: list[dict] = [] for prompt in CANONICAL_SINGLE_TURN: rows.append(_run_single(engine, prompt)) with out_path.open("w", encoding="utf-8") as f: for row in rows: f.write(json.dumps(row, ensure_ascii=False) + "\n") print(f"✓ Wrote {len(rows)} trace rows → {out_path}") return 0 if __name__ == "__main__": raise SystemExit(main())