Spaces:

build-small-hackathon
/

case0

Running

App Files Files Community

case0 / scripts /export_traces.py

HusseinEid

feat: multi-crime cases, scene+exhibit pixel art, background AI generation

80cd1f2 verified about 9 hours ago

raw

history blame

6.25 kB

	"""Export REAL agent traces for the Hub (Build Small "Sharing is Caring" badge).

	Captures two genuine traces from the live, fully-local stack:
	1. CASE GENERATION - every prompt the pipeline sends to the in-process llama.cpp model
	and the raw completion that came back, for one complete authored case;
	2. LIVE INTERROGATION - a short playthrough against the served case: questions (one with
	evidence presented), the suspect's spoken reply, and the server-authoritative
	suspicion/flags, with wall-clock latency per turn.

	Writes ``traces/case0_traces.jsonl`` + ``traces/README.md``. Upload with:

	python scripts/export_traces.py # produce the files
	python scripts/export_traces.py --push # produce AND push to the Hub dataset
	"""

	from __future__ import annotations

	import json
	import sys
	import time
	from pathlib import Path

	# Two levels above this file's resolved path (the usage examples run it as
	# ``scripts/export_traces.py``, which would make this the repository root).
	ROOT = Path(__file__).resolve().parent.parent
	# Put ``<ROOT>/src`` first on the import path before the ``case_zero`` imports
	# that follow (presumably the package lives there - confirm against the repo).
	sys.path.insert(0, str(ROOT / "src"))

	from case_zero.config import get_settings # noqa: E402
	from case_zero.generator.pipeline import generate_case # noqa: E402
	from case_zero.llm.backend import GenParams, LLMBackend, make_backend # noqa: E402

	# Hub dataset repo that ``--push`` creates (if missing) and uploads to.
	DATASET_ID = "HusseinEid/case0-traces"
	# Local output directory; receives ``case0_traces.jsonl`` and ``README.md``.
	OUT_DIR = ROOT / "traces"

	# Dataset card, written verbatim to ``OUT_DIR / "README.md"`` by ``main()``.
	# It opens with a ``---`` delimited metadata header, followed by Markdown that
	# describes the record types found in the JSONL file.
	_DATASET_README = """---
	license: apache-2.0
	tags:
	- build-small-hackathon
	- agent-trace
	- text-generation
	pretty_name: Case Zero agent traces
	---

	# Case Zero - agent traces

	Real traces from [Case Zero](https://huggingface.co/spaces/build-small-hackathon/case0),
	a procedural detective game where a single Qwen2.5-1.5B model (in-process llama.cpp,
	CPU-only, no cloud APIs) authors a complete mystery and then role-plays every suspect
	live under interrogation.

	`case0_traces.jsonl` - one JSON object per line:

	- `type: "generation_call"` - one pipeline LLM call while authoring a case: the exact
	`prompt`, the raw `completion`, sampling params, and latency. Two calls author a full
	case (world+cast, then mystery); deterministic Python assembles and solver-checks it.
	- `type: "interrogation_turn"` - one live turn against the running game server: the
	player's `question` (optionally `presented_clue`), the suspect's spoken `reply`, and
	the server-authoritative `suspicion` / `flags` that came back, with latency.

	Everything was produced by the shipped game code - no hand-editing, no cloud calls.
	"""


	class _TracingBackend:
	    """Wrap a real backend and record every (prompt, completion) pair.

	    Each ``generate`` call is forwarded to the wrapped backend and logged in
	    ``self.calls`` as a ``generation_call`` record holding the prompt, the
	    completion, the sampling parameters, and the elapsed time in seconds.
	    """

	    def __init__(self, inner: LLMBackend) -> None:
	        """Keep a reference to the wrapped backend and start an empty log."""
	        self._inner = inner
	        # One dict per completed ``generate`` call, in call order.
	        self.calls: list[dict] = []

	    def generate(self, prompt: str, params: GenParams) -> str:
	        """Forward *prompt* to the wrapped backend and log the round trip.

	        Args:
	            prompt: Text passed unchanged to the wrapped backend.
	            params: Sampling parameters; ``temperature``, ``max_tokens``,
	                ``grammar`` and ``json_schema`` are read for the log record.

	        Returns:
	            The wrapped backend's completion, unchanged.

	        If the wrapped backend raises, the exception propagates and nothing
	        is recorded for that call.
	        """
	        # perf_counter() is monotonic, so the measured interval cannot be
	        # skewed (or go negative) if the system clock is adjusted mid-call.
	        started = time.perf_counter()
	        out = self._inner.generate(prompt, params)
	        elapsed = time.perf_counter() - started
	        self.calls.append({
	            "type": "generation_call",
	            "prompt": prompt,
	            "completion": out,
	            "temperature": params.temperature,
	            "max_tokens": params.max_tokens,
	            # True when either a grammar or a JSON schema was supplied.
	            "constrained": bool(params.grammar or params.json_schema),
	            "latency_s": round(elapsed, 2),
	        })
	        return out

	    def stream(self, prompt: str, params: GenParams):
	        """Yield the whole completion as a single chunk.

	        Delegates to ``generate`` so that streamed calls are logged as well;
	        no incremental streaming takes place.
	        """
	        yield self.generate(prompt, params)


	def _generation_trace(records: list[dict], seed: int = 77321) -> None:
	    """Author one case through the pipeline and log every model call.

	    Builds the configured backend, wraps it in ``_TracingBackend``, runs
	    ``generate_case`` once, then appends to *records*:

	    * one ``generation_call`` record per backend call, in call order;
	    * one ``generation_result`` record summarising the authored case.

	    Args:
	        records: Output list, extended in place.
	        seed: Passed to ``generate_case`` as ``seed``. Defaults to the value
	            the script has always used.
	    """
	    backend = _TracingBackend(make_backend(get_settings()))
	    result = generate_case(backend, seed=seed)
	    records.extend(backend.calls)

	    case = result.case
	    records.append({
	        "type": "generation_result",
	        "case_id": case.case_id,
	        "crime_kind": case.crime_kind.value,
	        "title": case.title,
	        "solvable": result.report.ok,
	        "attempts": result.attempts,
	        "n_suspects": len(case.suspects),
	        "n_clues": len(case.clues),
	    })


	def _interrogation_trace(records: list[dict]) -> None:
	    """Play a short scripted interrogation against the in-process server.

	    Requests a case with ``POST /api/case``, appends a ``case_served``
	    record, then sends a fixed list of questions to
	    ``POST /api/run/{run_id}/interrogate/{suspect_id}`` and appends one
	    ``interrogation_turn`` record per question to *records*.

	    The script questions the first suspect twice and the second suspect
	    twice, the last time presenting the first evidence item. If the case has
	    only one suspect, that suspect answers every question; if it has no
	    evidence, the evidence turn is skipped.

	    Args:
	        records: Output list, extended in place.

	    Raises:
	        RuntimeError: If the served case lists no suspects.
	        Exception: Whatever ``raise_for_status()`` raises when the server
	            answers a request with an error status.
	    """
	    # Imported lazily so the web stack is only loaded when this trace runs.
	    from starlette.testclient import TestClient

	    from case_zero.api.server import build_server

	    # NOTE(review): the client is used without a ``with`` block, so ASGI
	    # lifespan handlers are not run - confirm build_server() does not rely
	    # on them.
	    client = TestClient(build_server())

	    case_resp = client.post("/api/case", json={})
	    # Fail here with the HTTP status rather than later with a KeyError on
	    # an error payload.
	    case_resp.raise_for_status()
	    case = case_resp.json()
	    run_id = case["runId"]
	    pub = case["case"]
	    records.append({
	        "type": "case_served",
	        "case_id": pub["id"],
	        "kind": pub.get("kind", "homicide"),
	        "title": pub["title"],
	    })

	    suspects = pub["suspects"]
	    if not suspects:
	        raise RuntimeError("served case has no suspects to interrogate")
	    first_id = suspects[0]["id"]
	    second_id = suspects[1]["id"] if len(suspects) > 1 else first_id

	    # Each entry is (suspect id, question text, evidence id to present or None).
	    plan = [
	        (first_id, "Where were you when it happened?", None),
	        (first_id, "Did you have any quarrel with the victim?", None),
	        (second_id, "Walk me through your evening, minute by minute.", None),
	    ]
	    evidence = pub.get("evidence") or []
	    if evidence:
	        plan.append((second_id, "Explain this.", evidence[0]["id"]))

	    for sus_id, question, clue in plan:
	        # Monotonic clock: the interval is immune to system clock changes.
	        started = time.perf_counter()
	        body: dict = {"freeText": question}
	        # ``is not None`` so a falsy but valid id (e.g. 0) is still sent.
	        if clue is not None:
	            body["presentEvidenceId"] = clue
	        resp = client.post(f"/api/run/{run_id}/interrogate/{sus_id}", json=body)
	        resp.raise_for_status()
	        turn = resp.json()
	        records.append({
	            "type": "interrogation_turn",
	            "suspect": sus_id,
	            "question": question,
	            "presented_clue": clue,
	            "reply": turn.get("reply"),
	            "suspicion": turn.get("suspicion"),
	            "suspicion_delta": turn.get("suspicionDelta"),
	            "flags": turn.get("flags"),
	            "latency_s": round(time.perf_counter() - started, 2),
	        })


	def main(argv: list[str] | None = None) -> int:
	    """Produce the trace files and optionally push them to the Hub.

	    Runs the generation trace and then the interrogation trace, writes every
	    record to ``OUT_DIR / "case0_traces.jsonl"`` (one JSON object per line),
	    and writes the dataset card to ``OUT_DIR / "README.md"``. When ``--push``
	    is among the arguments, the dataset repo is created if needed and the
	    whole output folder is uploaded.

	    Args:
	        argv: Argument list to inspect for ``--push``. Defaults to
	            ``sys.argv``, so existing ``main()`` callers behave as before.

	    Returns:
	        ``0`` on success, for use as the process exit status.
	    """
	    args = sys.argv if argv is None else argv

	    OUT_DIR.mkdir(parents=True, exist_ok=True)
	    records: list[dict] = []
	    print("tracing one full case generation (two model calls)...")
	    _generation_trace(records)
	    print("tracing a live interrogation playthrough...")
	    _interrogation_trace(records)

	    path = OUT_DIR / "case0_traces.jsonl"
	    with path.open("w", encoding="utf-8") as fh:
	        # ensure_ascii=False keeps non-ASCII text readable in the file.
	        fh.writelines(
	            json.dumps(rec, ensure_ascii=False) + "\n" for rec in records
	        )
	    (OUT_DIR / "README.md").write_text(_DATASET_README, encoding="utf-8")
	    print(f"wrote {len(records)} records -> {path}")

	    if "--push" in args:
	        # Imported lazily so the dependency is only needed when pushing.
	        from huggingface_hub import HfApi

	        api = HfApi()
	        api.create_repo(DATASET_ID, repo_type="dataset", exist_ok=True)
	        api.upload_folder(
	            repo_id=DATASET_ID, repo_type="dataset", folder_path=str(OUT_DIR)
	        )
	        print(f"pushed -> https://huggingface.co/datasets/{DATASET_ID}")
	    return 0


	# Script entry point: run main() and use its return value as the exit status.
	if __name__ == "__main__":
	    sys.exit(main())