"""Export REAL agent traces for the Hub (Build Small "Sharing is Caring" badge).

Captures two genuine traces from the live, fully-local stack:

  1. CASE GENERATION - every prompt the pipeline sends to the in-process
     llama.cpp model and the raw completion that came back, for one complete
     authored case;
  2. LIVE INTERROGATION - a short playthrough against the served case:
     questions (one with evidence presented), the suspect's spoken reply, and
     the server-authoritative suspicion/flags, with latency per turn.

Writes ``traces/case0_traces.jsonl`` + ``traces/README.md``. Upload with:

    python scripts/export_traces.py           # produce the files
    python scripts/export_traces.py --push    # produce AND push to the Hub dataset
"""

from __future__ import annotations

import json
import sys
import time
from collections.abc import Iterator
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
# Make the in-repo ``src`` layout importable when run as a plain script.
sys.path.insert(0, str(ROOT / "src"))

from case_zero.config import get_settings  # noqa: E402
from case_zero.generator.pipeline import generate_case  # noqa: E402
from case_zero.llm.backend import GenParams, LLMBackend, make_backend  # noqa: E402

DATASET_ID = "HusseinEid/case0-traces"
OUT_DIR = ROOT / "traces"

# Dataset card written next to the JSONL file (and uploaded with it on --push).
_DATASET_README = """---
license: apache-2.0
tags:
- build-small-hackathon
- agent-trace
- text-generation
pretty_name: Case Zero agent traces
---

# Case Zero - agent traces

Real traces from [Case Zero](https://huggingface.co/spaces/build-small-hackathon/case0),
a procedural detective game where a single **Qwen2.5-1.5B** model (in-process
llama.cpp, CPU-only, no cloud APIs) authors a complete mystery and then
role-plays every suspect live under interrogation.

`case0_traces.jsonl` - one JSON object per line:

- `type: "generation_call"` - one pipeline LLM call while authoring a case: the
  exact `prompt`, the raw `completion`, sampling params, and latency. Two calls
  author a full case (world+cast, then mystery); deterministic Python assembles
  and solver-checks it.
- `type: "generation_result"` - summary of the authored case: `case_id`,
  `crime_kind`, `title`, `solvable`, `attempts`, `n_suspects`, `n_clues`.
- `type: "case_served"` - the case the game server handed out for the
  interrogation: `case_id`, `kind`, `title`.
- `type: "interrogation_turn"` - one live turn against the running game server:
  the player's `question` (optionally `presented_clue`), the suspect's spoken
  `reply`, and the server-authoritative `suspicion` / `flags` that came back,
  with latency.

Everything was produced by the shipped game code - no hand-editing, no cloud calls.
"""


class _TracingBackend:
    """Wraps the real backend and records every (prompt, completion) pair."""

    def __init__(self, inner: LLMBackend) -> None:
        self._inner = inner
        # One dict per ``generate`` call, in call order.
        self.calls: list[dict] = []

    def generate(self, prompt: str, params: GenParams) -> str:
        """Delegate to the wrapped backend and log the call with its latency."""
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments the way time.time() can.
        started = time.perf_counter()
        out = self._inner.generate(prompt, params)
        elapsed = time.perf_counter() - started
        self.calls.append({
            "type": "generation_call",
            "prompt": prompt,
            "completion": out,
            "temperature": params.temperature,
            "max_tokens": params.max_tokens,
            "constrained": bool(params.grammar or params.json_schema),
            "latency_s": round(elapsed, 2),
        })
        return out

    def stream(self, prompt: str, params: GenParams) -> Iterator[str]:
        """Yield the full completion as a single chunk (and record the call)."""
        yield self.generate(prompt, params)


def _generation_trace(records: list[dict]) -> None:
    """Author one case through a tracing backend and append its trace.

    Appends every recorded ``generation_call`` followed by a single
    ``generation_result`` summary to ``records`` (mutated in place).
    """
    backend = _TracingBackend(make_backend(get_settings()))
    result = generate_case(backend, seed=77321)
    records.extend(backend.calls)
    records.append({
        "type": "generation_result",
        "case_id": result.case.case_id,
        "crime_kind": result.case.crime_kind.value,
        "title": result.case.title,
        "solvable": result.report.ok,
        "attempts": result.attempts,
        "n_suspects": len(result.case.suspects),
        "n_clues": len(result.case.clues),
    })


def _interrogation_trace(records: list[dict]) -> None:
    """Play a short scripted interrogation against the game server.

    Appends one ``case_served`` record and then one ``interrogation_turn``
    record per scripted question to ``records`` (mutated in place).
    """
    # Imported lazily so the server stack is only loaded when this step runs.
    from starlette.testclient import TestClient

    from case_zero.api.server import build_server

    client = TestClient(build_server())
    case = client.post("/api/case", json={}).json()
    run_id = case["runId"]
    pub = case["case"]
    records.append({
        "type": "case_served",
        "case_id": pub["id"],
        "kind": pub.get("kind", "homicide"),
        "title": pub["title"],
    })

    # NOTE(review): assumes the served case has at least two suspects and one
    # piece of evidence; a smaller case raises IndexError here - confirm the
    # server guarantees this.
    sus = pub["suspects"]
    breaking = pub["evidence"][0]["id"]
    # (suspect id, question, evidence id to present or None)
    plan = [
        (sus[0]["id"], "Where were you when it happened?", None),
        (sus[0]["id"], "Did you have any quarrel with the victim?", None),
        (sus[1]["id"], "Walk me through your evening, minute by minute.", None),
        (sus[1]["id"], "Explain this.", breaking),
    ]
    for sus_id, question, clue in plan:
        started = time.perf_counter()
        body: dict = {"freeText": question}
        if clue:
            body["presentEvidenceId"] = clue
        r = client.post(f"/api/run/{run_id}/interrogate/{sus_id}", json=body).json()
        elapsed = time.perf_counter() - started
        records.append({
            "type": "interrogation_turn",
            "suspect": sus_id,
            "question": question,
            "presented_clue": clue,
            "reply": r.get("reply"),
            "suspicion": r.get("suspicion"),
            "suspicion_delta": r.get("suspicionDelta"),
            "flags": r.get("flags"),
            "latency_s": round(elapsed, 2),
        })


def main() -> int:
    """Produce the trace files and, with ``--push``, upload them to the Hub.

    Returns:
        Process exit code (always 0 when no exception is raised).
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    records: list[dict] = []

    print("tracing one full case generation (two model calls)...")
    _generation_trace(records)
    print("tracing a live interrogation playthrough...")
    _interrogation_trace(records)

    path = OUT_DIR / "case0_traces.jsonl"
    with path.open("w", encoding="utf-8") as fh:
        for rec in records:
            fh.write(json.dumps(rec, ensure_ascii=False) + "\n")
    (OUT_DIR / "README.md").write_text(_DATASET_README, encoding="utf-8")
    print(f"wrote {len(records)} records -> {path}")

    if "--push" in sys.argv:
        # Imported lazily so the dependency is only needed when pushing.
        from huggingface_hub import HfApi

        api = HfApi()
        api.create_repo(DATASET_ID, repo_type="dataset", exist_ok=True)
        api.upload_folder(repo_id=DATASET_ID, repo_type="dataset", folder_path=str(OUT_DIR))
        print(f"pushed -> https://huggingface.co/datasets/{DATASET_ID}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())