Parse both trace formats: Claude-Code (sessions/<date>/) and pi-sessions (root-level message events with toolCall blocks and toolResult roles)
Browse files- analyze.py +6 -10
- dataset.py +32 -5
- extract.py +54 -9
analyze.py
CHANGED
|
@@ -8,6 +8,8 @@ from concurrent.futures import ThreadPoolExecutor
|
|
| 8 |
|
| 9 |
from huggingface_hub import InferenceClient
|
| 10 |
|
|
|
|
|
|
|
| 11 |
MODEL = "Qwen/Qwen3.6-35B-A3B"
|
| 12 |
|
| 13 |
_NO_THINK = {"chat_template_kwargs": {"enable_thinking": False}}
|
|
@@ -90,27 +92,21 @@ def _parse_ts(ts: str) -> dt.datetime | None:
|
|
| 90 |
|
| 91 |
|
| 92 |
def compute_stats(sessions: list[tuple[str, list[dict]]]) -> dict:
|
| 93 |
-
"""Count user turns, distinct tool names, and the first→last timestamp span.
|
|
|
|
| 94 |
turns = 0
|
| 95 |
tools: set[str] = set()
|
| 96 |
timestamps: list[dt.datetime] = []
|
| 97 |
for _path, events in sessions:
|
| 98 |
for ev in events:
|
| 99 |
-
if
|
| 100 |
turns += 1
|
| 101 |
ts = ev.get("timestamp")
|
| 102 |
if isinstance(ts, str):
|
| 103 |
parsed = _parse_ts(ts)
|
| 104 |
if parsed:
|
| 105 |
timestamps.append(parsed)
|
| 106 |
-
|
| 107 |
-
content = msg.get("content")
|
| 108 |
-
if isinstance(content, list):
|
| 109 |
-
for block in content:
|
| 110 |
-
if isinstance(block, dict) and block.get("type") == "tool_use":
|
| 111 |
-
name = block.get("name")
|
| 112 |
-
if isinstance(name, str) and name:
|
| 113 |
-
tools.add(name)
|
| 114 |
|
| 115 |
span = ""
|
| 116 |
if timestamps:
|
|
|
|
| 8 |
|
| 9 |
from huggingface_hub import InferenceClient
|
| 10 |
|
| 11 |
+
from extract import event_role, event_tool_names
|
| 12 |
+
|
| 13 |
MODEL = "Qwen/Qwen3.6-35B-A3B"
|
| 14 |
|
| 15 |
_NO_THINK = {"chat_template_kwargs": {"enable_thinking": False}}
|
|
|
|
| 92 |
|
| 93 |
|
| 94 |
def compute_stats(sessions: list[tuple[str, list[dict]]]) -> dict:
|
| 95 |
+
"""Count user turns, distinct tool names, and the first→last timestamp span.
|
| 96 |
+
Format-agnostic (Claude-Code style and pi-sessions style both handled)."""
|
| 97 |
turns = 0
|
| 98 |
tools: set[str] = set()
|
| 99 |
timestamps: list[dt.datetime] = []
|
| 100 |
for _path, events in sessions:
|
| 101 |
for ev in events:
|
| 102 |
+
if event_role(ev) == "user":
|
| 103 |
turns += 1
|
| 104 |
ts = ev.get("timestamp")
|
| 105 |
if isinstance(ts, str):
|
| 106 |
parsed = _parse_ts(ts)
|
| 107 |
if parsed:
|
| 108 |
timestamps.append(parsed)
|
| 109 |
+
tools.update(event_tool_names(ev))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
span = ""
|
| 112 |
if timestamps:
|
dataset.py
CHANGED
|
@@ -1,4 +1,10 @@
|
|
| 1 |
-
"""Hugging Face Hub I/O for agent-trace JSONL session files.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import json
|
| 4 |
import re
|
|
@@ -6,14 +12,35 @@ from pathlib import Path
|
|
| 6 |
|
| 7 |
from huggingface_hub import HfApi, hf_hub_download
|
| 8 |
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def list_sessions(repo_id: str) -> list[str]:
|
| 13 |
-
"""Return JSONL session paths from a dataset repo, newest
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
info = HfApi().dataset_info(repo_id)
|
| 15 |
-
|
| 16 |
-
paths
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
return paths
|
| 18 |
|
| 19 |
|
|
|
|
| 1 |
+
"""Hugging Face Hub I/O for agent-trace JSONL session files.
|
| 2 |
+
|
| 3 |
+
Accepts both layouts:
|
| 4 |
+
- `sessions/<YYYY-MM-DD>/<uuid>.jsonl` (e.g. `merve/ml-intern-sessions`)
|
| 5 |
+
- `<YYYY-MM-DDTHH-MM-SS>_<uuid>.jsonl` at the dataset root or under subdirs
|
| 6 |
+
(e.g. `julien-c/pi-sessions`)
|
| 7 |
+
"""
|
| 8 |
|
| 9 |
import json
|
| 10 |
import re
|
|
|
|
| 12 |
|
| 13 |
from huggingface_hub import HfApi, hf_hub_download
|
| 14 |
|
| 15 |
+
_DATE_RE = re.compile(r"(\d{4}-\d{2}-\d{2})")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _sort_key(path: str) -> tuple[str, str]:
    """Return a sortable key for a session path.

    The key is ``(date, path)`` where ``date`` is the first ``YYYY-MM-DD``
    substring that ``_DATE_RE`` finds anywhere in ``path``. Paths with no
    date get the placeholder ``"0000-00-00"`` so they compare lower than any
    dated path. The full path is the tie-breaker for entries sharing a date.
    """
    match = _DATE_RE.search(path)
    date = match.group(1) if match else "0000-00-00"
    return (date, path)
|
| 22 |
|
| 23 |
|
| 24 |
def list_sessions(repo_id: str) -> list[str]:
    """Return JSONL session paths from a dataset repo, newest first.

    Any `.jsonl` file in the repo is treated as a session candidate. Duplicates
    that share a basename (some repos mirror the same file under subdirs) are
    deduped, keeping the first occurrence in the order the Hub lists them.

    Args:
        repo_id: Dataset repository id passed to ``HfApi().dataset_info``.

    Returns:
        Repo-relative file paths, sorted descending by ``_sort_key``.
    """
    info = HfApi().dataset_info(repo_id)
    seen: set[str] = set()
    paths: list[str] = []
    # `siblings` is optional on the Hub info object; treat a missing listing
    # as an empty repo instead of failing on iteration over None.
    for sibling in info.siblings or []:
        name = sibling.rfilename
        if not name.endswith(".jsonl"):
            continue
        # Dedupe on the final path component only, so mirrored copies under
        # different directories collapse to one entry.
        base = name.rsplit("/", 1)[-1]
        if base in seen:
            continue
        seen.add(base)
        paths.append(name)
    paths.sort(key=_sort_key, reverse=True)
    return paths
|
| 45 |
|
| 46 |
|
extract.py
CHANGED
|
@@ -1,8 +1,49 @@
|
|
| 1 |
-
"""Pure transforms on agent-trace event lists. No I/O.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from typing import Any
|
| 4 |
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
def _user_content_to_text(content: Any) -> str:
|
| 7 |
if isinstance(content, str):
|
| 8 |
return content
|
|
@@ -22,6 +63,7 @@ def _user_content_to_text(content: Any) -> str:
|
|
| 22 |
|
| 23 |
|
| 24 |
def _assistant_content_to_text(content: Any) -> str:
|
|
|
|
| 25 |
if isinstance(content, str):
|
| 26 |
return content
|
| 27 |
if isinstance(content, list):
|
|
@@ -36,16 +78,19 @@ def _assistant_content_to_text(content: Any) -> str:
|
|
| 36 |
def events_to_transcript(events: list[dict]) -> str:
|
| 37 |
lines: list[str] = []
|
| 38 |
for ev in events:
|
|
|
|
|
|
|
|
|
|
| 39 |
msg = ev.get("message") or {}
|
| 40 |
content = msg.get("content")
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
if
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
if
|
| 48 |
-
|
| 49 |
return "\n\n".join(lines)
|
| 50 |
|
| 51 |
|
|
|
|
| 1 |
+
"""Pure transforms on agent-trace event lists. No I/O.
|
| 2 |
+
|
| 3 |
+
Supports two on-disk formats:
|
| 4 |
+
1. Claude-Code style — `{type: "user"|"assistant", message: {role, content}}`.
|
| 5 |
+
Example dataset: `merve/ml-intern-sessions`.
|
| 6 |
+
2. pi-sessions style — `{type: "message", message: {role: "user"|"assistant"|"toolResult", content: [...]}}`.
|
| 7 |
+
Example dataset: `julien-c/pi-sessions`. Tool calls use `toolCall` blocks;
|
| 8 |
+
tool outputs come back as role=toolResult messages which we drop.
|
| 9 |
+
"""
|
| 10 |
|
| 11 |
from typing import Any
|
| 12 |
|
| 13 |
|
| 14 |
+
def event_role(ev: dict) -> str | None:
|
| 15 |
+
"""Normalised role of a trace event. Returns 'user' / 'assistant' or None
|
| 16 |
+
for non-content events (session metadata, tool results, model_change, etc.)."""
|
| 17 |
+
t = ev.get("type")
|
| 18 |
+
if t in ("user", "assistant"):
|
| 19 |
+
return t
|
| 20 |
+
if t == "message":
|
| 21 |
+
msg = ev.get("message") or {}
|
| 22 |
+
role = msg.get("role")
|
| 23 |
+
if role in ("user", "assistant"):
|
| 24 |
+
return role
|
| 25 |
+
return None
|
| 26 |
+
return None
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def event_tool_names(ev: dict) -> list[str]:
    """Extract the tool names invoked in one event, across both formats.

    Looks at ``ev["message"]["content"]``; when it is a list, every dict
    block whose ``type`` is ``"tool_use"`` or ``"toolCall"`` contributes its
    ``name`` if that is a non-empty string.

    Returns:
        Tool names in block order (duplicates preserved). Empty when the
        event has no dict ``message``, no list ``content``, or no matching
        blocks.
    """
    message = ev.get("message")
    # Tolerate a missing or malformed message instead of raising on `.get`.
    if not isinstance(message, dict):
        return []
    content = message.get("content")
    if not isinstance(content, list):
        return []
    names: list[str] = []
    for block in content:
        if not isinstance(block, dict):
            continue
        if block.get("type") not in ("tool_use", "toolCall"):
            continue
        name = block.get("name")
        if isinstance(name, str) and name:
            names.append(name)
    return names
|
| 45 |
+
|
| 46 |
+
|
| 47 |
def _user_content_to_text(content: Any) -> str:
|
| 48 |
if isinstance(content, str):
|
| 49 |
return content
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
def _assistant_content_to_text(content: Any) -> str:
|
| 66 |
+
"""Concatenate text blocks; drop thinking / tool_use / toolCall blocks."""
|
| 67 |
if isinstance(content, str):
|
| 68 |
return content
|
| 69 |
if isinstance(content, list):
|
|
|
|
| 78 |
def events_to_transcript(events: list[dict]) -> str:
    """Render user and assistant events as a plain-text transcript.

    Events whose normalised role (per ``event_role``) is neither ``"user"``
    nor ``"assistant"`` are skipped. Each remaining event's message content
    is converted to text by the role-specific helper and stripped; turns
    with no text left are dropped.

    Returns:
        Turns formatted as ``"User: ..."`` / ``"Assistant: ..."`` and joined
        by a blank line. Empty string when nothing qualifies.
    """
    lines: list[str] = []
    for ev in events:
        role = event_role(ev)
        if role not in ("user", "assistant"):
            continue
        message = ev.get("message")
        # A missing or malformed message is treated as having no content,
        # matching what the helpers receive for an empty message dict.
        content = message.get("content") if isinstance(message, dict) else None
        if role == "user":
            text = _user_content_to_text(content)
        else:
            text = _assistant_content_to_text(content)
        text = text.strip()
        if not text:
            continue
        label = "User" if role == "user" else "Assistant"
        lines.append(f"{label}: {text}")
    return "\n\n".join(lines)
|
| 95 |
|
| 96 |
|