"""
hf_cr_index.py — Shared HuggingFace dataset index helpers.
Used by both build_cr_index.py and orchestrate_cr.py.
Depends only on huggingface_hub (pip install huggingface_hub).
"""
import json
from pathlib import Path
def load_hf_index(hf_token: str, hf_repo: str) -> list[dict]:
    """
    Download cr_index.jsonl from the HF dataset and return parsed records.

    Returns an empty list only when the Hub reports that cr_index.jsonl is
    not present in the repo. A failure to reach the Hub (with no cached
    copy available) is re-raised instead of being reported as an empty
    index, so a caller that later pushes cannot overwrite an existing
    remote index with a truncated one.

    Args:
        hf_token: Access token forwarded to hf_hub_download.
        hf_repo: Dataset repo id that holds cr_index.jsonl.

    Returns:
        One parsed JSON value per non-blank line of the file, in file order.

    Raises:
        LocalEntryNotFoundError: The file could not be fetched from the Hub
            and is not in the local cache (e.g. network unavailable).
        json.JSONDecodeError: A non-blank line is not valid JSON.
        Any other error from hf_hub_download propagates unchanged.
    """
    from huggingface_hub import hf_hub_download
    from huggingface_hub.errors import EntryNotFoundError, LocalEntryNotFoundError

    # Keep the try body to the download call only, so that errors from
    # reading or parsing the file can never be mistaken for "missing".
    try:
        path = hf_hub_download(
            repo_id=hf_repo,
            filename="cr_index.jsonl",
            repo_type="dataset",
            token=hf_token,
        )
    except LocalEntryNotFoundError:
        # Subclass of EntryNotFoundError meaning "Hub unreachable and nothing
        # cached" -- that is not evidence the index is absent, so it must be
        # handled before the generic clause below.
        raise
    except EntryNotFoundError:
        # The Hub answered and the file is not in the repo yet.
        return []

    text = Path(path).read_text(encoding="utf-8")
    # Blank / whitespace-only lines are skipped rather than parsed.
    return [json.loads(line) for line in text.splitlines() if line.strip()]
def push_hf_index(records: list[dict], hf_token: str, hf_repo: str) -> None:
    """
    Push all records as cr_index.jsonl to the HF dataset repo.
    Creates the repo if it does not exist.

    The records are serialized to one JSON document per line (UTF-8,
    non-ASCII characters kept unescaped, LF separators, no trailing
    newline) and uploaded from memory, replacing any existing
    cr_index.jsonl in the repo.

    Args:
        records: Records to write, one per line, in the given order.
        hf_token: Access token used for both repo creation and upload.
        hf_repo: Dataset repo id to upload into.

    Raises:
        TypeError / ValueError: A record is not JSON-serializable. This is
            raised before any request is made to the Hub.
        Any error from create_repo / upload_file propagates unchanged.
    """
    from huggingface_hub import HfApi

    # Serialize before touching the Hub: a bad record then fails fast with
    # no remote side effects. Building bytes directly (instead of writing a
    # text-mode temp file) also keeps line endings LF on every platform and
    # leaves no temporary file behind if serialization fails.
    payload = "\n".join(
        json.dumps(record, ensure_ascii=False) for record in records
    ).encode("utf-8")

    api = HfApi()
    api.create_repo(
        repo_id=hf_repo,
        repo_type="dataset",
        exist_ok=True,  # no error when the repo is already there
        token=hf_token,
    )
    # upload_file accepts raw bytes as the file content.
    api.upload_file(
        path_or_fileobj=payload,
        path_in_repo="cr_index.jsonl",
        repo_id=hf_repo,
        repo_type="dataset",
        token=hf_token,
    )
|