File size: 1,839 Bytes
a610111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5fc740
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
hf_cr_index.py — Shared HuggingFace dataset index helpers.

Used by both build_cr_index.py and orchestrate_cr.py.
Depends only on huggingface_hub (pip install huggingface_hub).
"""

import json
from pathlib import Path


def load_hf_index(hf_token: str, hf_repo: str) -> list[dict]:
    """
    Download cr_index.jsonl from the HF dataset and return parsed records.

    Args:
        hf_token: Token forwarded to hf_hub_download.
        hf_repo: Id of the dataset repo that holds cr_index.jsonl.

    Returns:
        One parsed JSON value per non-blank line of the file, in file order.
        An empty list if the repo has no cr_index.jsonl yet.

    Raises:
        LocalEntryNotFoundError: the file could not be fetched and is not in
            the local cache (e.g. the network is unavailable). This is NOT
            treated as "index does not exist": returning [] here could lead a
            caller to push a near-empty index over the real one.
        json.JSONDecodeError: a non-blank line is not valid JSON.
    """
    from huggingface_hub import hf_hub_download
    from huggingface_hub.errors import EntryNotFoundError, LocalEntryNotFoundError

    # Keep the try body down to the one call that can signal "file missing".
    try:
        path = hf_hub_download(
            repo_id=hf_repo,
            filename="cr_index.jsonl",
            repo_type="dataset",
            token=hf_token,
        )
    except LocalEntryNotFoundError:
        # Subclass of EntryNotFoundError, so it must be handled first: the
        # file may well exist on the Hub, we just could not reach it.
        raise
    except EntryNotFoundError:
        # The repo exists but cr_index.jsonl has not been pushed yet.
        return []

    text = Path(path).read_text(encoding="utf-8")
    # Skip blank lines so a trailing newline or empty file parses cleanly.
    return [json.loads(line) for line in text.splitlines() if line.strip()]


def push_hf_index(records: list[dict], hf_token: str, hf_repo: str) -> None:
    """
    Push all records as cr_index.jsonl to the HF dataset repo.
    Creates the repo if it does not exist.

    Args:
        records: Records to write, one JSON document per line, in order.
        hf_token: Token forwarded to the Hub API calls.
        hf_repo: Id of the dataset repo to create (if needed) and upload to.

    Raises:
        TypeError: a record is not JSON-serializable. Raised before any Hub
            call is made, so nothing is created or uploaded in that case.
    """
    from huggingface_hub import HfApi

    # Serialize up front: a bad record then fails before any remote side
    # effect, and no temporary file is left behind on disk.
    payload = "\n".join(json.dumps(r, ensure_ascii=False) for r in records)

    api = HfApi()
    api.create_repo(
        repo_id=hf_repo,
        repo_type="dataset",
        exist_ok=True,
        token=hf_token,
    )
    # upload_file accepts raw bytes, so the content goes up directly as UTF-8
    # with "\n" separators on every platform (a text-mode temp file would
    # rewrite them as "\r\n" on Windows).
    api.upload_file(
        path_or_fileobj=payload.encode("utf-8"),
        path_in_repo="cr_index.jsonl",
        repo_id=hf_repo,
        repo_type="dataset",
        token=hf_token,
    )