merve HF Staff committed on
Commit
2f1a55a
·
verified ·
1 Parent(s): caefa89

Parse both trace formats: Claude-Code (sessions/<date>/) and pi-sessions (root-level message events with toolCall/toolResult roles)

Browse files
Files changed (3) hide show
  1. analyze.py +6 -10
  2. dataset.py +32 -5
  3. extract.py +54 -9
analyze.py CHANGED
@@ -8,6 +8,8 @@ from concurrent.futures import ThreadPoolExecutor
8
 
9
  from huggingface_hub import InferenceClient
10
 
 
 
11
  MODEL = "Qwen/Qwen3.6-35B-A3B"
12
 
13
  _NO_THINK = {"chat_template_kwargs": {"enable_thinking": False}}
@@ -90,27 +92,21 @@ def _parse_ts(ts: str) -> dt.datetime | None:
90
 
91
 
92
  def compute_stats(sessions: list[tuple[str, list[dict]]]) -> dict:
93
- """Count user turns, distinct tool names, and the first→last timestamp span."""
 
94
  turns = 0
95
  tools: set[str] = set()
96
  timestamps: list[dt.datetime] = []
97
  for _path, events in sessions:
98
  for ev in events:
99
- if ev.get("type") == "user":
100
  turns += 1
101
  ts = ev.get("timestamp")
102
  if isinstance(ts, str):
103
  parsed = _parse_ts(ts)
104
  if parsed:
105
  timestamps.append(parsed)
106
- msg = ev.get("message") or {}
107
- content = msg.get("content")
108
- if isinstance(content, list):
109
- for block in content:
110
- if isinstance(block, dict) and block.get("type") == "tool_use":
111
- name = block.get("name")
112
- if isinstance(name, str) and name:
113
- tools.add(name)
114
 
115
  span = ""
116
  if timestamps:
 
8
 
9
  from huggingface_hub import InferenceClient
10
 
11
+ from extract import event_role, event_tool_names
12
+
13
  MODEL = "Qwen/Qwen3.6-35B-A3B"
14
 
15
  _NO_THINK = {"chat_template_kwargs": {"enable_thinking": False}}
 
92
 
93
 
94
  def compute_stats(sessions: list[tuple[str, list[dict]]]) -> dict:
95
+ """Count user turns, distinct tool names, and the first→last timestamp span.
96
+ Format-agnostic (Claude-Code style and pi-sessions style both handled)."""
97
  turns = 0
98
  tools: set[str] = set()
99
  timestamps: list[dt.datetime] = []
100
  for _path, events in sessions:
101
  for ev in events:
102
+ if event_role(ev) == "user":
103
  turns += 1
104
  ts = ev.get("timestamp")
105
  if isinstance(ts, str):
106
  parsed = _parse_ts(ts)
107
  if parsed:
108
  timestamps.append(parsed)
109
+ tools.update(event_tool_names(ev))
 
 
 
 
 
 
 
110
 
111
  span = ""
112
  if timestamps:
dataset.py CHANGED
@@ -1,4 +1,10 @@
1
- """Hugging Face Hub I/O for agent-trace JSONL session files."""
 
 
 
 
 
 
2
 
3
  import json
4
  import re
@@ -6,14 +12,35 @@ from pathlib import Path
6
 
7
  from huggingface_hub import HfApi, hf_hub_download
8
 
9
- _SESSION_RE = re.compile(r"^sessions/(\d{4}-\d{2}-\d{2})/[^/]+\.jsonl$")
 
 
 
 
 
 
10
 
11
 
12
  def list_sessions(repo_id: str) -> list[str]:
13
- """Return JSONL session paths from a dataset repo, newest date first."""
 
 
 
 
 
14
  info = HfApi().dataset_info(repo_id)
15
- paths = [s.rfilename for s in info.siblings if _SESSION_RE.match(s.rfilename)]
16
- paths.sort(key=lambda p: _SESSION_RE.match(p).group(1), reverse=True)
 
 
 
 
 
 
 
 
 
 
17
  return paths
18
 
19
 
 
1
+ """Hugging Face Hub I/O for agent-trace JSONL session files.
2
+
3
+ Accepts both layouts:
4
+ - `sessions/<YYYY-MM-DD>/<uuid>.jsonl` (e.g. `merve/ml-intern-sessions`)
5
+ - `<YYYY-MM-DDTHH-MM-SS>_<uuid>.jsonl` at the dataset root or under subdirs
6
+ (e.g. `julien-c/pi-sessions`)
7
+ """
8
 
9
  import json
10
  import re
 
12
 
13
  from huggingface_hub import HfApi, hf_hub_download
14
 
15
+ _DATE_RE = re.compile(r"(\d{4}-\d{2}-\d{2})")
16
+
17
+
18
+ def _sort_key(path: str) -> tuple:
19
+ """Sortable key: (date prefix found anywhere in path, full path)."""
20
+ m = _DATE_RE.search(path)
21
+ return (m.group(1) if m else "0000-00-00", path)
22
 
23
 
24
  def list_sessions(repo_id: str) -> list[str]:
25
+ """Return JSONL session paths from a dataset repo, newest first.
26
+
27
+ Any `.jsonl` file in the repo is treated as a session candidate. Duplicates
28
+ that share a basename (some repos mirror the same file under subdirs) are
29
+ deduped, keeping the first occurrence.
30
+ """
31
  info = HfApi().dataset_info(repo_id)
32
+ seen: set[str] = set()
33
+ paths: list[str] = []
34
+ for s in info.siblings:
35
+ name = s.rfilename
36
+ if not name.endswith(".jsonl"):
37
+ continue
38
+ base = name.rsplit("/", 1)[-1]
39
+ if base in seen:
40
+ continue
41
+ seen.add(base)
42
+ paths.append(name)
43
+ paths.sort(key=_sort_key, reverse=True)
44
  return paths
45
 
46
 
extract.py CHANGED
@@ -1,8 +1,49 @@
1
- """Pure transforms on agent-trace event lists. No I/O."""
 
 
 
 
 
 
 
 
2
 
3
  from typing import Any
4
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  def _user_content_to_text(content: Any) -> str:
7
  if isinstance(content, str):
8
  return content
@@ -22,6 +63,7 @@ def _user_content_to_text(content: Any) -> str:
22
 
23
 
24
  def _assistant_content_to_text(content: Any) -> str:
 
25
  if isinstance(content, str):
26
  return content
27
  if isinstance(content, list):
@@ -36,16 +78,19 @@ def _assistant_content_to_text(content: Any) -> str:
36
  def events_to_transcript(events: list[dict]) -> str:
37
  lines: list[str] = []
38
  for ev in events:
 
 
 
39
  msg = ev.get("message") or {}
40
  content = msg.get("content")
41
- if ev.get("type") == "user":
42
- text = _user_content_to_text(content).strip()
43
- if text:
44
- lines.append(f"User: {text}")
45
- elif ev.get("type") == "assistant":
46
- text = _assistant_content_to_text(content).strip()
47
- if text:
48
- lines.append(f"Assistant: {text}")
49
  return "\n\n".join(lines)
50
 
51
 
 
1
+ """Pure transforms on agent-trace event lists. No I/O.
2
+
3
+ Supports two on-disk formats:
4
+ 1. Claude-Code style — `{type: "user"|"assistant", message: {role, content}}`.
5
+ Example dataset: `merve/ml-intern-sessions`.
6
+ 2. pi-sessions style — `{type: "message", message: {role: "user"|"assistant"|"toolResult", content: [...]}}`.
7
+ Example dataset: `julien-c/pi-sessions`. Tool calls use `toolCall` blocks;
8
+ tool outputs come back as role=toolResult messages which we drop.
9
+ """
10
 
11
  from typing import Any
12
 
13
 
14
+ def event_role(ev: dict) -> str | None:
15
+ """Normalised role of a trace event. Returns 'user' / 'assistant' or None
16
+ for non-content events (session metadata, tool results, model_change, etc.)."""
17
+ t = ev.get("type")
18
+ if t in ("user", "assistant"):
19
+ return t
20
+ if t == "message":
21
+ msg = ev.get("message") or {}
22
+ role = msg.get("role")
23
+ if role in ("user", "assistant"):
24
+ return role
25
+ return None
26
+ return None
27
+
28
+
29
+ def event_tool_names(ev: dict) -> list[str]:
30
+ """Extract tool names invoked in this event, across both formats."""
31
+ msg = ev.get("message") or {}
32
+ content = msg.get("content")
33
+ if not isinstance(content, list):
34
+ return []
35
+ out: list[str] = []
36
+ for block in content:
37
+ if not isinstance(block, dict):
38
+ continue
39
+ bt = block.get("type")
40
+ if bt in ("tool_use", "toolCall"):
41
+ name = block.get("name")
42
+ if isinstance(name, str) and name:
43
+ out.append(name)
44
+ return out
45
+
46
+
47
  def _user_content_to_text(content: Any) -> str:
48
  if isinstance(content, str):
49
  return content
 
63
 
64
 
65
  def _assistant_content_to_text(content: Any) -> str:
66
+ """Concatenate text blocks; drop thinking / tool_use / toolCall blocks."""
67
  if isinstance(content, str):
68
  return content
69
  if isinstance(content, list):
 
78
  def events_to_transcript(events: list[dict]) -> str:
79
  lines: list[str] = []
80
  for ev in events:
81
+ role = event_role(ev)
82
+ if role not in ("user", "assistant"):
83
+ continue
84
  msg = ev.get("message") or {}
85
  content = msg.get("content")
86
+ text = (
87
+ _user_content_to_text(content)
88
+ if role == "user"
89
+ else _assistant_content_to_text(content)
90
+ ).strip()
91
+ if text:
92
+ label = "User" if role == "user" else "Assistant"
93
+ lines.append(f"{label}: {text}")
94
  return "\n\n".join(lines)
95
 
96