"""Hugging Face Hub I/O for agent-trace JSONL session files. Accepts both layouts: - `sessions//.jsonl` (e.g. `merve/ml-intern-sessions`) - `_.jsonl` at the dataset root or under subdirs (e.g. `julien-c/pi-sessions`) """ import json import re from pathlib import Path from huggingface_hub import HfApi, hf_hub_download _DATE_RE = re.compile(r"(\d{4}-\d{2}-\d{2})") def _sort_key(path: str) -> tuple: """Sortable key: (date prefix found anywhere in path, full path).""" m = _DATE_RE.search(path) return (m.group(1) if m else "0000-00-00", path) def list_sessions(repo_id: str) -> list[str]: """Return JSONL session paths from a dataset repo, newest first. Any `.jsonl` file in the repo is treated as a session candidate. Duplicates that share a basename (some repos mirror the same file under subdirs) are deduped, keeping the first occurrence. """ info = HfApi().dataset_info(repo_id) seen: set[str] = set() paths: list[str] = [] for s in info.siblings: name = s.rfilename if not name.endswith(".jsonl"): continue base = name.rsplit("/", 1)[-1] if base in seen: continue seen.add(base) paths.append(name) paths.sort(key=_sort_key, reverse=True) return paths def fetch_sessions(repo_id: str, n: int) -> list[tuple[str, list[dict]]]: """Download up to `n` session files (newest first), parse JSONL into events. Returns a list of (session_path, events) tuples. Sessions that fail to parse are skipped. """ paths = list_sessions(repo_id)[:n] out: list[tuple[str, list[dict]]] = [] for path in paths: local = hf_hub_download(repo_id, path, repo_type="dataset") events: list[dict] = [] try: for line in Path(local).read_text().splitlines(): line = line.strip() if not line: continue events.append(json.loads(line)) except json.JSONDecodeError: continue if events: out.append((path, events)) return out