File size: 2,159 Bytes
2f1a55a
 
 
 
 
 
 
1252cb9
 
 
 
 
 
 
2f1a55a
 
 
 
 
 
 
1252cb9
 
 
2f1a55a
 
 
 
 
 
1252cb9
2f1a55a
 
 
 
 
 
 
 
 
 
 
 
1252cb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Hugging Face Hub I/O for agent-trace JSONL session files.

Accepts both layouts:
- `sessions/<YYYY-MM-DD>/<uuid>.jsonl` (e.g. `merve/ml-intern-sessions`)
- `<YYYY-MM-DDTHH-MM-SS>_<uuid>.jsonl` at the dataset root or under subdirs
  (e.g. `julien-c/pi-sessions`)
"""

import json
import re
from pathlib import Path

from huggingface_hub import HfApi, hf_hub_download

# Captures the first `DDDD-DD-DD` digit group (a `YYYY-MM-DD`-shaped date)
# found in a path; `_sort_key` uses it to order sessions chronologically.
_DATE_RE = re.compile(r"(\d{4}-\d{2}-\d{2})")


def _sort_key(path: str) -> tuple:
    """Build an ordering key of the form (date, path) for a session path.

    The date is the first `YYYY-MM-DD`-shaped substring found anywhere in
    `path`. Paths without one get the placeholder "0000-00-00", which sorts
    before any real date. The full path is the tie-breaker.
    """
    match = _DATE_RE.search(path)
    if match is None:
        date = "0000-00-00"
    else:
        date = match.group(1)
    return (date, path)


def list_sessions(repo_id: str) -> list[str]:
    """List the `.jsonl` files of a dataset repo, most recent first.

    Every file whose name ends in `.jsonl` counts as a session candidate.
    When several files share the same basename (some repos mirror a file
    under more than one subdirectory), only the first one encountered in the
    repo listing is kept.

    Args:
        repo_id: Dataset repository identifier on the Hugging Face Hub.

    Returns:
        Repo-relative file paths, sorted in descending `_sort_key` order.
    """
    siblings = HfApi().dataset_info(repo_id).siblings
    # Dicts preserve insertion order and `setdefault` never overwrites, so
    # this keeps exactly the first path seen for each basename.
    first_by_basename: dict[str, str] = {}
    for sibling in siblings:
        filename = sibling.rfilename
        if filename.endswith(".jsonl"):
            first_by_basename.setdefault(filename.rsplit("/", 1)[-1], filename)
    return sorted(first_by_basename.values(), key=_sort_key, reverse=True)


def _read_events(local_path: str) -> list[dict]:
    """Parse one local JSONL file into a list of decoded JSON values.

    Blank lines are ignored. Returns an empty list when the file is not valid
    UTF-8 or when any non-blank line is not valid JSON, so the caller can
    treat "unparseable" and "empty" the same way.
    """
    events: list[dict] = []
    try:
        # JSONL is UTF-8; do not depend on the platform's locale encoding.
        with Path(local_path).open(encoding="utf-8") as handle:
            # Iterate the file object rather than using `str.splitlines()`:
            # splitlines also breaks on U+2028, U+2029 and U+0085, which may
            # appear unescaped inside JSON strings and would cut a record in
            # half. File iteration only splits on \n, \r and \r\n.
            for raw_line in handle:
                line = raw_line.strip()
                if line:
                    events.append(json.loads(line))
    except (json.JSONDecodeError, UnicodeDecodeError):
        # One bad line invalidates the whole session file.
        return []
    return events


def fetch_sessions(repo_id: str, n: int) -> list[tuple[str, list[dict]]]:
    """Download up to `n` session files (newest first), parse JSONL into events.

    The newest `n` paths are selected before downloading, so the result may
    hold fewer than `n` entries when some files are skipped.

    Args:
        repo_id: Dataset repository identifier on the Hugging Face Hub.
        n: Maximum number of session files to download.

    Returns:
        A list of (session_path, events) tuples in newest-first order.
        Sessions that fail to decode or parse, and sessions that contain no
        events, are skipped.

    Raises:
        Whatever `list_sessions` or `hf_hub_download` raise; listing and
        download errors are not caught here.
    """
    out: list[tuple[str, list[dict]]] = []
    for path in list_sessions(repo_id)[:n]:
        local = hf_hub_download(repo_id, path, repo_type="dataset")
        events = _read_events(local)
        if events:
            out.append((path, events))
    return out