# trace-reports / dataset.py
# Last commit 2f1a55a (verified) by merve (HF Staff):
# Parse both trace formats: Claude-Code (sessions/<date>/) and pi-sessions
# (root-level message events with toolCall/toolResult roles)
"""Hugging Face Hub I/O for agent-trace JSONL session files.
Accepts both layouts:
- `sessions/<YYYY-MM-DD>/<uuid>.jsonl` (e.g. `merve/ml-intern-sessions`)
- `<YYYY-MM-DDTHH-MM-SS>_<uuid>.jsonl` at the dataset root or under subdirs
(e.g. `julien-c/pi-sessions`)
"""
import json
import re
from pathlib import Path
from huggingface_hub import HfApi, hf_hub_download
_DATE_RE = re.compile(r"(\d{4}-\d{2}-\d{2})")
# Optional `THH-MM-SS` suffix that directly follows a date in timestamped
# filenames such as `2024-05-01T10-00-00_<uuid>.jsonl`.
_TIME_SUFFIX_RE = re.compile(r"T\d{2}-\d{2}-\d{2}")


def _sort_key(path: str) -> tuple:
    """Return a sortable key: (timestamp found in path, full path).

    The timestamp is the first `YYYY-MM-DD` date found anywhere in `path`,
    extended with its `THH-MM-SS` suffix when one immediately follows. Including
    the time keeps same-day sessions stored under different subdirectories in
    chronological order instead of ordering them by directory name.

    Paths without any date get the placeholder `"0000-00-00"`, so they sort
    before every dated path in ascending order. The full path is the final
    tie-breaker, which makes the ordering deterministic.
    """
    date_match = _DATE_RE.search(path)
    if date_match is None:
        return ("0000-00-00", path)
    stamp = date_match.group(1)
    # Only accept a time that starts exactly where the date ends.
    time_match = _TIME_SUFFIX_RE.match(path, date_match.end())
    if time_match is not None:
        stamp += time_match.group(0)
    return (stamp, path)
def list_sessions(repo_id: str) -> list[str]:
    """Return JSONL session paths from a dataset repo, newest first.

    Any `.jsonl` file in the repo is treated as a session candidate. Duplicates
    that share a basename (some repos mirror the same file under subdirs) are
    deduped, keeping the first occurrence in the order the Hub lists the files.

    Args:
        repo_id: Dataset repository id, e.g. `merve/ml-intern-sessions`.

    Returns:
        Repo-relative file paths sorted by `_sort_key` in descending order.
        Empty when the repo lists no `.jsonl` files.
    """
    info = HfApi().dataset_info(repo_id)
    seen: set[str] = set()
    paths: list[str] = []
    # `siblings` is declared Optional on the Hub info objects; treat a missing
    # file listing as "no files" rather than failing on iteration.
    for sibling in info.siblings or []:
        name = sibling.rfilename
        if not name.endswith(".jsonl"):
            continue
        # Dedupe on the file name alone so mirrored copies collapse to one.
        base = name.rsplit("/", 1)[-1]
        if base in seen:
            continue
        seen.add(base)
        paths.append(name)
    paths.sort(key=_sort_key, reverse=True)
    return paths
def fetch_sessions(repo_id: str, n: int) -> list[tuple[str, list[dict]]]:
    """Download up to `n` session files (newest first), parse JSONL into events.

    Args:
        repo_id: Dataset repository id to read sessions from.
        n: Maximum number of session files to download. Values <= 0 yield an
            empty result (a negative value is not treated as a slice offset).

    Returns:
        A list of (session_path, events) tuples in newest-first order. Sessions
        that fail to parse, or that contain no events, are skipped.
    """
    paths = list_sessions(repo_id)[: max(n, 0)]
    out: list[tuple[str, list[dict]]] = []
    for path in paths:
        local = hf_hub_download(repo_id, path, repo_type="dataset")
        events = _read_jsonl_events(local)
        if events:
            out.append((path, events))
    return out


def _read_jsonl_events(local_path: str) -> list[dict]:
    """Parse one local JSONL file into events; return [] if it cannot be parsed."""
    events: list[dict] = []
    try:
        # Decode explicitly as UTF-8 instead of the platform default encoding.
        with Path(local_path).open(encoding="utf-8") as handle:
            # Iterating the handle splits on real newlines only. Unlike
            # str.splitlines(), it does not break on U+2028/U+2029/U+0085,
            # which may appear unescaped inside JSON strings.
            for raw in handle:
                line = raw.strip()
                if line:
                    events.append(json.loads(line))
    except (json.JSONDecodeError, UnicodeDecodeError):
        # One bad record or undecodable byte invalidates the whole session.
        return []
    return events