| """Hugging Face Hub I/O for agent-trace JSONL session files. |
| |
| Accepts both layouts: |
| - `sessions/<YYYY-MM-DD>/<uuid>.jsonl` (e.g. `merve/ml-intern-sessions`) |
| - `<YYYY-MM-DDTHH-MM-SS>_<uuid>.jsonl` at the dataset root or under subdirs |
| (e.g. `julien-c/pi-sessions`) |
| """ |
|
|
| import json |
| import re |
| from pathlib import Path |
|
|
| from huggingface_hub import HfApi, hf_hub_download |
|
|
| _DATE_RE = re.compile(r"(\d{4}-\d{2}-\d{2})") |
|
|
|
|
| def _sort_key(path: str) -> tuple: |
| """Sortable key: (date prefix found anywhere in path, full path).""" |
| m = _DATE_RE.search(path) |
| return (m.group(1) if m else "0000-00-00", path) |
|
|
|
|
def list_sessions(repo_id: str) -> list[str]:
    """Return JSONL session paths from a dataset repo, newest first.

    Any `.jsonl` file in the repo is treated as a session candidate. Duplicates
    that share a basename (some repos mirror the same file under subdirs) are
    deduped, keeping the first occurrence in the Hub's listing order.

    Args:
        repo_id: Dataset repo identifier, e.g. `merve/ml-intern-sessions`.

    Returns:
        Repo-relative paths sorted by `_sort_key` in descending order. Empty
        when the repo holds no `.jsonl` files or the Hub response carries no
        file listing.
    """
    info = HfApi().dataset_info(repo_id)
    seen: set[str] = set()
    paths: list[str] = []
    # `siblings` is Optional on DatasetInfo; treat a missing listing as empty
    # instead of failing with a TypeError on iteration.
    for sibling in info.siblings or []:
        name = sibling.rfilename
        if not name.endswith(".jsonl"):
            continue
        base = name.rsplit("/", 1)[-1]
        if base in seen:
            continue
        seen.add(base)
        paths.append(name)
    paths.sort(key=_sort_key, reverse=True)
    return paths
|
|
|
|
def _read_events(local_path: str) -> list[dict]:
    """Parse one local JSONL file into its events, ignoring blank lines."""
    events: list[dict] = []
    # Iterate the file object rather than calling `str.splitlines()`: the latter
    # also breaks on U+2028/U+2029/U+0085, which JSON allows unescaped inside
    # strings, and would cut a valid record in half.
    with Path(local_path).open(encoding="utf-8") as handle:
        for raw in handle:
            raw = raw.strip()
            if raw:
                events.append(json.loads(raw))
    return events


def fetch_sessions(repo_id: str, n: int) -> list[tuple[str, list[dict]]]:
    """Download up to `n` session files (newest first), parse JSONL into events.

    Args:
        repo_id: Dataset repo identifier on the Hugging Face Hub.
        n: Maximum number of session files to download. Zero or a negative
            value yields an empty result.

    Returns:
        A list of (session_path, events) tuples. Sessions that fail to parse
        (invalid JSON on any line, or bytes that are not valid UTF-8) and
        sessions with no events are skipped, so fewer than `n` entries may be
        returned.
    """
    # Clamp so a negative `n` does not turn into an "all but the last" slice.
    paths = list_sessions(repo_id)[: max(n, 0)]
    out: list[tuple[str, list[dict]]] = []
    for path in paths:
        local = hf_hub_download(repo_id, path, repo_type="dataset")
        try:
            events = _read_events(local)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # One bad line invalidates the whole session; skip it.
            continue
        if events:
            out.append((path, events))
    return out
|
|