# Hugging Face Space: build-small-hackathon/case0 (status: Running)
# File size: 6,245 Bytes - revision 80cd1f2

"""Export REAL agent traces for the Hub (Build Small "Sharing is Caring" badge).

Captures two genuine traces from the live, fully-local stack:
1. CASE GENERATION - every prompt the pipeline sends to the in-process llama.cpp model
   and the raw completion that came back, for one complete authored case;
2. LIVE INTERROGATION - a short playthrough against the served case: questions (one with
   evidence presented), the suspect's spoken reply, and the server-authoritative
   suspicion/flags, with wall-clock latency per turn.

Writes ``traces/case0_traces.jsonl`` + ``traces/README.md``. Usage:

    python scripts/export_traces.py            # produce the files
    python scripts/export_traces.py --push     # produce AND push to the Hub dataset
"""

from __future__ import annotations

import json
import sys
import time
from pathlib import Path

# Two directory levels above this file: the script's own folder, then its parent.
ROOT = Path(__file__).resolve().parent.parent
# Put ``<ROOT>/src`` first on the import path; presumably this is what lets the
# ``case_zero`` imports that follow resolve when the file is run directly as a script.
sys.path.insert(0, str(ROOT / "src"))

from case_zero.config import get_settings  # noqa: E402
from case_zero.generator.pipeline import generate_case  # noqa: E402
from case_zero.llm.backend import GenParams, LLMBackend, make_backend  # noqa: E402

# Hub dataset repository that ``--push`` creates (if missing) and uploads to.
DATASET_ID = "HusseinEid/case0-traces"
# Local folder that receives the JSONL trace file and the dataset README.
OUT_DIR = ROOT / "traces"

# Dataset card written verbatim to ``README.md`` in the output folder: a YAML front-matter
# header followed by a Markdown description of the record types in the JSONL file.
_DATASET_README = """---
license: apache-2.0
tags:
  - build-small-hackathon
  - agent-trace
  - text-generation
pretty_name: Case Zero agent traces
---

# Case Zero - agent traces

Real traces from [Case Zero](https://huggingface.co/spaces/build-small-hackathon/case0),
a procedural detective game where a single **Qwen2.5-1.5B** model (in-process llama.cpp,
CPU-only, no cloud APIs) authors a complete mystery and then role-plays every suspect
live under interrogation.

`case0_traces.jsonl` - one JSON object per line:

- `type: "generation_call"` - one pipeline LLM call while authoring a case: the exact
  `prompt`, the raw `completion`, sampling params, and latency. Two calls author a full
  case (world+cast, then mystery); deterministic Python assembles and solver-checks it.
- `type: "interrogation_turn"` - one live turn against the running game server: the
  player's `question` (optionally `presented_clue`), the suspect's spoken `reply`, and
  the server-authoritative `suspicion` / `flags` that came back, with latency.

Everything was produced by the shipped game code - no hand-editing, no cloud calls.
"""


class _TracingBackend:
    """Wrap the real backend and record every (prompt, completion) pair.

    Every ``generate`` call is forwarded to the wrapped backend unchanged; a
    ``generation_call`` record describing it is appended to ``calls``.
    """

    def __init__(self, inner: LLMBackend) -> None:
        self._inner = inner
        # One dict per completed generate() call, in call order.
        self.calls: list[dict] = []

    def generate(self, prompt: str, params: GenParams) -> str:
        """Forward *prompt* to the wrapped backend and log the call.

        Returns the wrapped backend's completion unchanged. If the wrapped backend
        raises, the exception propagates and no record is appended.
        """
        # perf_counter() is monotonic, so the measured interval cannot be skewed
        # (or turn negative) by a system clock adjustment the way time.time() can.
        started = time.perf_counter()
        out = self._inner.generate(prompt, params)
        self.calls.append({
            "type": "generation_call",
            "prompt": prompt,
            "completion": out,
            "temperature": params.temperature,
            "max_tokens": params.max_tokens,
            # True when decoding was restricted by a grammar or a JSON schema.
            "constrained": bool(params.grammar or params.json_schema),
            "latency_s": round(time.perf_counter() - started, 2),
        })
        return out

    def stream(self, prompt: str, params: GenParams):
        """Yield the full completion as a single chunk.

        Routed through ``generate`` so that streamed calls are recorded too.
        """
        yield self.generate(prompt, params)


def _generation_trace(records: list[dict]) -> None:
    """Generate one case through a tracing backend and append its trace to *records*.

    Appends every captured ``generation_call`` record in call order, then a single
    ``generation_result`` record summarising the generated case. The seed is fixed
    at 77321.
    """
    tracer = _TracingBackend(make_backend(get_settings()))
    outcome = generate_case(tracer, seed=77321)

    # Raw model calls first, then the summary of what they produced.
    records.extend(tracer.calls)
    summary = {
        "type": "generation_result",
        "case_id": outcome.case.case_id,
        "crime_kind": outcome.case.crime_kind.value,
        "title": outcome.case.title,
        "solvable": outcome.report.ok,
        "attempts": outcome.attempts,
        "n_suspects": len(outcome.case.suspects),
        "n_clues": len(outcome.case.clues),
    }
    records.append(summary)


def _interrogation_trace(records: list[dict]) -> None:
    """Play a short scripted interrogation against the game server and trace it.

    Builds the server in-process, requests a case via ``POST /api/case``, then asks
    four scripted questions: two to the first suspect and two to the second, the last
    one presenting the first evidence item. Appends one ``case_served`` record and
    one ``interrogation_turn`` record per question to *records*.

    Raises:
        The HTTP client's status error if the server answers any request with a
        4xx/5xx status, so a failed call is never recorded as if it were a real turn.
        IndexError: if the served case has fewer than two suspects or no evidence.
    """
    from starlette.testclient import TestClient

    from case_zero.api.server import build_server

    # NOTE(review): the client is not entered as a context manager, so ASGI lifespan
    # (startup/shutdown) handlers do not run - confirm build_server() does not need them.
    client = TestClient(build_server())

    case_resp = client.post("/api/case", json={})
    # Fail loudly here rather than with an opaque KeyError on "runId" below.
    case_resp.raise_for_status()
    case = case_resp.json()
    run_id = case["runId"]
    pub = case["case"]
    records.append({
        "type": "case_served",
        "case_id": pub["id"],
        "kind": pub.get("kind", "homicide"),
        "title": pub["title"],
    })
    sus = pub["suspects"]
    breaking = pub["evidence"][0]["id"]
    # (suspect id, question text, evidence id to present or None)
    plan = [
        (sus[0]["id"], "Where were you when it happened?", None),
        (sus[0]["id"], "Did you have any quarrel with the victim?", None),
        (sus[1]["id"], "Walk me through your evening, minute by minute.", None),
        (sus[1]["id"], "Explain this.", breaking),
    ]
    for sus_id, question, clue in plan:
        # perf_counter() is monotonic; time.time() can jump if the system clock is adjusted.
        t0 = time.perf_counter()
        body: dict = {"freeText": question}
        if clue:
            body["presentEvidenceId"] = clue
        resp = client.post(f"/api/run/{run_id}/interrogate/{sus_id}", json=body)
        # Without this check an error response would be traced as a turn whose
        # reply/suspicion/flags are all None.
        resp.raise_for_status()
        r = resp.json()
        records.append({
            "type": "interrogation_turn",
            "suspect": sus_id,
            "question": question,
            "presented_clue": clue,
            "reply": r.get("reply"),
            "suspicion": r.get("suspicion"),
            "suspicion_delta": r.get("suspicionDelta"),
            "flags": r.get("flags"),
            "latency_s": round(time.perf_counter() - t0, 2),
        })


def main() -> int:
    """Produce the trace files and, when ``--push`` is on the command line, upload them.

    Runs the generation trace followed by the interrogation trace, writes all records
    to ``case0_traces.jsonl`` (one JSON object per line) and the dataset card to
    ``README.md`` inside ``OUT_DIR``. With ``--push`` the whole folder is uploaded to
    the ``DATASET_ID`` dataset repo, which is created if it does not exist.

    Returns:
        ``0``, used as the process exit code.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    traces: list[dict] = []
    print("tracing one full case generation (two model calls)...")
    _generation_trace(traces)
    print("tracing a live interrogation playthrough...")
    _interrogation_trace(traces)

    jsonl_path = OUT_DIR / "case0_traces.jsonl"
    with jsonl_path.open("w", encoding="utf-8") as sink:
        # The generator is consumed lazily, so records are serialised one at a time.
        sink.writelines(json.dumps(entry, ensure_ascii=False) + "\n" for entry in traces)
    readme_path = OUT_DIR / "README.md"
    readme_path.write_text(_DATASET_README, encoding="utf-8")
    print(f"wrote {len(traces)} records -> {jsonl_path}")

    if "--push" not in sys.argv:
        return 0

    # Imported only on the push path so producing the files does not need the Hub client.
    from huggingface_hub import HfApi

    hub = HfApi()
    hub.create_repo(DATASET_ID, repo_type="dataset", exist_ok=True)
    hub.upload_folder(repo_id=DATASET_ID, repo_type="dataset", folder_path=str(OUT_DIR))
    print(f"pushed -> https://huggingface.co/datasets/{DATASET_ID}")
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())