# trace-reports / analyze.py
# Latest commit fabd7ab (mervenoyan): Add source session id to each sin and force smart-quote wrapping
"""InferenceClient calls: map (per-session digests) + reduce (bulletin)."""
import datetime as dt
import hashlib
import json
import os
from concurrent.futures import ThreadPoolExecutor
from huggingface_hub import InferenceClient
from extract import event_role, event_tool_names
# Hub model id; get_client() passes it to InferenceClient as `model=`.
MODEL = "Qwen/Qwen3.5-9B"
# Sent as `extra_body` on the chat_completion calls in digest_session() and
# bulletin(). It sets the chat template's `enable_thinking` flag to False —
# presumably to suppress the model's reasoning output so the reply is bare
# JSON (NOTE(review): confirm against the model card).
_NO_THINK = {"chat_template_kwargs": {"enable_thinking": False}}
def get_client(token: str | None = None) -> InferenceClient:
    """Return an InferenceClient bound to MODEL.

    This is the single construction point for the client, so swapping the
    auth mechanism (e.g. to OAuth) only has to happen here.

    Args:
        token: Hugging Face access token. When None, the HF_TOKEN environment
            variable is used instead.

    Raises:
        RuntimeError: if no non-empty token is available.
    """
    resolved = os.environ.get("HF_TOKEN") if token is None else token
    if resolved:
        return InferenceClient(model=MODEL, token=resolved)
    raise RuntimeError(
        "HF_TOKEN is not set. Export it in your shell or pass token= explicitly."
    )
# ---------- map: per-session digest ----------
# System prompt for the map step; digest_session() sends it as the system
# message. The keys it asks for (intent, top_quotes, tells, mood) are the ones
# _BULLETIN_SYSTEM tells the reduce model to expect in each digest, so keep
# the two prompts in sync when renaming a field.
_DIGEST_SYSTEM = """You are analysing a single coding-agent session transcript. The TRANSCRIPT shows messages between a HUMAN USER and an AGENT (the AI). Return signals about the HUMAN USER only — never about the agent.
Return STRICT JSON:
{
"session_id": <echo>,
"intent": "<one sentence: what the user was trying to do>",
"top_quotes": [<1-3 short verbatim quotes from USER messages only>],
"tells": [<3-5 short strings: signals about the user — frustration, confidence, knowledge gaps, communication style, premature optimization, doc-avoidance, etc.>],
"mood": "<one short phrase: the session's emotional arc>"
}
Hard rules:
- Only include things the user actually said or did. Do not attribute agent behaviour to the user.
- top_quotes must literally appear in user messages.
- Be concise and specific. No invented quotes."""
def digest_session(client: InferenceClient, transcript: str, session_id: str) -> dict:
    """Run the map step for one session and return its digest.

    Sends the transcript to the chat model together with the digest system
    prompt and parses the JSON reply.

    Args:
        client: Inference client used for the chat completion.
        transcript: Transcript text of the session.
        session_id: Identifier of the session; included in the prompt and
            stamped on the returned digest.

    Returns:
        The parsed digest dict with ``"session_id"`` set to the caller's
        ``session_id``. If the request, the JSON parsing, or the shape check
        raises an ``Exception``, returns
        ``{"session_id": session_id, "error": "<message>"}`` instead, so one
        bad session does not abort a whole batch.
    """
    user_prompt = f"session_id: {session_id}\n\nTranscript:\n{transcript}"
    try:
        resp = client.chat_completion(
            messages=[
                {"role": "system", "content": _DIGEST_SYSTEM},
                {"role": "user", "content": user_prompt},
            ],
            response_format={"type": "json_object"},
            max_tokens=800,
            temperature=0,
            extra_body=_NO_THINK,
        )
        raw = resp.choices[0].message.content or "{}"
        data = json.loads(raw)
        # json.loads may legally return a list, string, number or None;
        # reject those explicitly instead of relying on an AttributeError.
        if not isinstance(data, dict):
            raise ValueError(
                f"expected a JSON object, got {type(data).__name__}"
            )
        # The caller's id is authoritative. The model is only asked to echo
        # it and may truncate or alter it, which would break anything that
        # later looks the session up by this value.
        data["session_id"] = session_id
        return data
    except Exception as e:
        return {"session_id": session_id, "error": str(e)}
def digest_all(
    client: InferenceClient,
    transcripts: list[tuple[str, str]],
    max_workers: int = 8,
) -> list[dict]:
    """Digest every transcript concurrently and keep only the successes.

    Args:
        client: Inference client shared by all worker threads.
        transcripts: ``(session_id, transcript_text)`` pairs.
        max_workers: Size of the thread pool.

    Returns:
        Digests in input order, without the entries that carry an
        ``"error"`` key.
    """

    def _digest_pair(pair):
        session_id, transcript = pair
        return digest_session(client, transcript, session_id)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        kept = [
            digest
            for digest in pool.map(_digest_pair, transcripts)
            if "error" not in digest
        ]
    return kept
# ---------- stats from raw events ----------
def _parse_ts(ts: str) -> dt.datetime | None:
try:
return dt.datetime.fromisoformat(ts.replace("Z", "+00:00"))
except Exception:
return None
def compute_stats(sessions: list[tuple[str, list[dict]]]) -> dict:
    """Count user turns, distinct tool names, and the first-to-last date span.

    Format-agnostic (Claude-Code style and pi-sessions style both handled):
    role and tool extraction are delegated to ``event_role`` and
    ``event_tool_names``.

    Args:
        sessions: ``(path, events)`` pairs; the path is ignored and each
            event is a dict.

    Returns:
        ``{"turns": int, "tools": int, "span": str}`` where ``turns`` is the
        number of events whose role is ``"user"``, ``tools`` is the number of
        distinct tool names seen, and ``span`` is a display string such as
        ``"Jan 02 – Jan 05, 2024"`` (a single date when everything happened
        on one day, and ``""`` when no timestamp could be parsed).
    """
    turns = 0
    tools: set[str] = set()
    timestamps: list[dt.datetime] = []
    for _path, events in sessions:
        for ev in events:
            if event_role(ev) == "user":
                turns += 1
            ts = ev.get("timestamp")
            if isinstance(ts, str):
                parsed = _parse_ts(ts)
                if parsed is not None:
                    # Naive and offset-aware datetimes cannot be compared
                    # (TypeError), and different trace formats may mix both.
                    # Treat offset-less timestamps as UTC so ordering works.
                    if parsed.tzinfo is None:
                        parsed = parsed.replace(tzinfo=dt.timezone.utc)
                    timestamps.append(parsed)
            tools.update(event_tool_names(ev))

    span = ""
    if timestamps:
        first, last = min(timestamps), max(timestamps)
        end = last.strftime("%b %d, %Y")
        if first.date() == last.date():
            # One-day range: do not print the same date twice.
            span = end
        elif first.year == last.year:
            # Same year: the year is printed once, on the end date.
            span = f"{first.strftime('%b %d')} – {end}"
        else:
            span = f"{first.strftime('%b %d, %Y')} – {end}"
    return {"turns": turns, "tools": len(tools), "span": span}
def serial_for(user: str) -> str:
    """Derive a stable ``PR-NNNN`` serial from the user's handle.

    The SHA-256 digest of the UTF-8 encoded handle is read as one big
    integer and reduced modulo 10000, so the same handle always maps to the
    same zero-padded 4-digit serial.
    """
    digest = hashlib.sha256(user.encode("utf-8")).digest()
    bucket = int.from_bytes(digest, "big") % 10000
    return "PR-" + str(bucket).zfill(4)
# ---------- reduce: bulletin generation ----------
# Adapted from the design handoff's CONTENT_PROMPT.md.
# System prompt for the reduce step; bulletin() sends it as the system
# message. The JSON object it specifies (user, archetype, tagline, sins,
# forecast) is exactly what build_report() reads back, so any field renamed
# here must be renamed there as well.
_BULLETIN_SYSTEM = """You are the Hugging Face Roastery. You read agent-trace dataset digests and write a gently savage personality bulletin about the HUMAN USER who was prompting the agent — never about the agent itself. The output is a vintage printed card; every field has a strict length budget. Be specific, be funny, never punch down.
You will receive:
- user: the Hugging Face handle of the operator.
- dataset: the Hub dataset ID being analysed.
- digests: a JSON list of per-session digests already extracted from the traces (intent, top_quotes, tells, mood).
Return EXACTLY one JSON object, no prose, no markdown:
{
"user": "<bare handle, no @>",
"archetype": ["The <adjective>", "<Noun>"],
"tagline": "<130-170 chars, 2-3 italic lines, sentences only, end on a punchline>",
"sins": [
{"n":"01","title":"<50-90 chars: one concrete user behaviour, sentence case, no quotes>","meta":"<30-110 chars: a VERBATIM quote from a top_quotes entry — raw text only, NO surrounding quote marks (render adds them)>","source":"<the exact session_id of the digest the quote was taken from>"},
{"n":"02","title":"...","meta":"...","source":"..."},
{"n":"03","title":"...","meta":"...","source":"..."}
],
"forecast": {"headline":"The week ahead","body":"<270-340 chars, horoscope-style, end with 'Lucky <x>: <y>. Avoid: <z>.'>"}
}
Field budgets (hard limits — overflow breaks the layout):
- archetype[0]: 8-18 chars (line 1, usually "The <adjective>")
- archetype[1]: 6-14 chars (line 2, title-cased punch noun)
- tagline: 130-170 chars
- sins[].title: 50-90 chars
- sins[].meta: 30-110 chars (raw verbatim user quote, no surrounding quote marks)
- sins[].source: the session_id from the digest the quote came from
- forecast.body: 270-340 chars, ends with "Lucky <x>: <y>. Avoid: <z>."
The sins array MUST contain exactly 3 objects. Do not emit fewer.
Voice:
- Sharp but loving — group-chat energy, not insult-comic. Roast habits a thoughtful friend would call out.
- Sentence case for titles. Smart quotes ( " " ), en-dashes ( – ), em-dashes ( — ). No exclamation marks. No emojis.
- Specific, not generic. Every observation must be grounded in something the digests actually contain.
Hard rules:
1. Roast the USER, not the agent. The user cannot run code; only the agent can. Wrong: "Parsed JSON with a regex twice." (that's the agent). Right: "Asked the agent to parse JSON with a regex twice." / "Demanded a regex over a JSON parser, against advice."
2. EVERY sins[].meta MUST be a verbatim top_quote from one of the digests. Emit the raw text only — NO surrounding quote marks (the renderer wraps it). No paraphrasing, no rewording, no analysis. Just the user's own words. If no top_quote fits a sin you've drafted, pick a different sin that does have a fitting quote.
3. EVERY sins[].source MUST be the exact `session_id` value of the digest the quote came from. Copy it verbatim — do not shorten, rename, or invent.
4. The title is the roast sentence (no quotes inside it); the meta below it is the receipt — the user's own words that prove the sin. Title and meta must be different content, not paraphrases of each other.
5. No PII. No emails, no real names, no private repos. Public handles and public dataset names are fine.
6. No identity punching. Roast process and habits — not who the user is. Off-limits: appearance, nationality, gender, politics, illness. Fair game: ignoring docs, refactor addiction, regex misuse, vibes-driven coding, asking the same thing six ways, premature optimisation, late-night commits.
Procedure:
1. Skim the digests for recurring patterns (repeated questions, premature optimisation, doc avoidance, tone, tool misuse, mood arc).
2. Pick ONE crisp archetype. Examples: The Premature Optimizer · The Vibes Driver · The Doc Avoider · The Refactor Romantic · The Confidence Auditor · The Apology Engineer · The TODO Composer. Invent freely.
3. Pick three sins the digests support. For each: write a roast sentence as the title, pick a verbatim top_quote that proves the sin and place it (raw, no quote marks) as the meta, and set source to that digest's session_id.
4. Tagline: 2-3 short sentences piling on the archetype with concrete examples. End on a punchline.
5. Horoscope: one absurd technical prediction grounded in a real user pattern. Close with "Lucky <something>: <x>. Avoid: <y>."
6. Validate lengths against budgets. Trim or pad before emitting.
7. Emit JSON only. No code fences. No commentary."""
def bulletin(
    client: InferenceClient,
    digests: list[dict],
    user: str,
    dataset_id: str,
) -> dict:
    """Ask the model for the bulletin content in a single JSON-mode call.

    Args:
        client: Inference client used for the chat completion.
        digests: Per-session digests; serialised as JSON into the prompt.
        user: Handle of the user being profiled.
        dataset_id: Hub dataset id the digests were extracted from.

    Returns:
        Whatever ``json.loads`` yields for the model's reply (``{}`` when the
        reply is empty). The shape is not validated here.

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON. Errors from the
            client call propagate unchanged.
    """
    header = f"user: {user}\ndataset: {dataset_id}\n\n"
    digests_json = json.dumps(digests, ensure_ascii=False, indent=2)
    # Restates the most frequently violated constraints right next to the data.
    reminder = (
        "Reminder: emit EXACTLY 3 sins. Each sin needs `title` (the roast), "
        "`meta` (a VERBATIM top_quote, raw text only — no surrounding quote "
        "marks; the renderer wraps them), and `source` (the session_id of the "
        "digest the quote was taken from, copied verbatim). "
        "Tagline ≤170 chars; forecast.body ≤340 chars."
    )
    user_prompt = f"{header}digests (JSON list):\n{digests_json}\n\n{reminder}"

    messages = [
        {"role": "system", "content": _BULLETIN_SYSTEM},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat_completion(
        messages=messages,
        response_format={"type": "json_object"},
        max_tokens=1500,
        temperature=0,
        extra_body=_NO_THINK,
    )
    content = completion.choices[0].message.content
    return json.loads(content or "{}")
def build_report(
    client: InferenceClient,
    digests: list[dict],
    user: str,
    dataset_id: str,
    stats: dict,
) -> dict:
    """Combine model output + computed stats into the full report dict for render.py.

    The model reply is treated as untrusted: every field is type-checked and
    replaced by a placeholder when missing or malformed, so a sloppy reply
    degrades the card instead of raising.

    Args:
        client: Inference client forwarded to ``bulletin``.
        digests: Per-session digests forwarded to ``bulletin``.
        user: Handle of the user; used as fallback for the ``user`` field and
            as the seed for the serial.
        dataset_id: Hub dataset id, copied into the report.
        stats: Dict with optional ``turns``, ``tools`` and ``span`` entries.

    Returns:
        A dict with keys ``user``, ``archetype`` (two strings), ``tagline``,
        ``sins`` (exactly three dicts with ``n``/``title``/``meta``/``source``),
        ``forecast`` (``headline``/``body``), ``dataset``, ``turns``,
        ``tools``, ``span``, ``generated`` and ``serial``.

    Raises:
        Whatever ``bulletin`` raises (client errors, invalid JSON).
    """
    data = bulletin(client, digests, user, dataset_id)
    # A valid JSON reply can still be a list, string, number or null.
    if not isinstance(data, dict):
        data = {}
    today = dt.date.today().strftime("%b %d, %Y")

    archetype = data.get("archetype")
    if not isinstance(archetype, list) or len(archetype) < 2:
        archetype = ["The", "Unreadable"]

    # Keep only well-formed sin objects, then pad to exactly three; an empty
    # dict falls through to the "—" placeholders below.
    raw_sins = data.get("sins")
    if not isinstance(raw_sins, list):
        raw_sins = []
    sins = [s for s in raw_sins if isinstance(s, dict)][:3]
    sins += [{} for _ in range(3 - len(sins))]

    forecast = data.get("forecast")
    if not isinstance(forecast, dict) or not forecast:
        forecast = {"headline": "The week ahead", "body": "The cards are quiet today."}

    return {
        "user": str(data.get("user") or user),
        "archetype": [str(archetype[0]), str(archetype[1])],
        "tagline": str(data.get("tagline") or ""),
        "sins": [
            {
                "n": str(s.get("n") or f"{i+1:02d}"),
                "title": str(s.get("title") or "—"),
                "meta": str(s.get("meta") or "—"),
                "source": str(s.get("source") or ""),
            }
            for i, s in enumerate(sins)
        ],
        "forecast": {
            "headline": str(forecast.get("headline") or "The week ahead"),
            "body": str(forecast.get("body") or ""),
        },
        "dataset": dataset_id,
        "turns": int(stats.get("turns") or 0),
        "tools": int(stats.get("tools") or 0),
        "span": str(stats.get("span") or ""),
        "generated": today,
        "serial": serial_for(user),
    }