""" hf_cr_index.py — Shared HuggingFace dataset index helpers. Used by both build_cr_index.py and orchestrate_cr.py. Depends only on huggingface_hub (pip install huggingface_hub). """ import json from pathlib import Path def load_hf_index(hf_token: str, hf_repo: str) -> list[dict]: """ Download cr_index.jsonl from the HF dataset and return parsed records. Returns an empty list if the file does not exist yet. """ from huggingface_hub import hf_hub_download from huggingface_hub.errors import EntryNotFoundError try: path = hf_hub_download( repo_id=hf_repo, filename="cr_index.jsonl", repo_type="dataset", token=hf_token, ) text = Path(path).read_text(encoding="utf-8") return [json.loads(line) for line in text.splitlines() if line.strip()] except EntryNotFoundError: return [] def push_hf_index(records: list[dict], hf_token: str, hf_repo: str) -> None: """ Push all records as cr_index.jsonl to the HF dataset repo. Creates the repo if it does not exist. """ from huggingface_hub import HfApi api = HfApi() api.create_repo( repo_id=hf_repo, repo_type="dataset", exist_ok=True, token=hf_token, ) import tempfile with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', encoding='utf-8', delete=False) as _f: _f.write("\n".join(json.dumps(r, ensure_ascii=False) for r in records)) _tmp_path = _f.name try: api.upload_file( path_or_fileobj=_tmp_path, path_in_repo="cr_index.jsonl", repo_id=hf_repo, repo_type="dataset", token=hf_token, ) finally: Path(_tmp_path).unlink(missing_ok=True)