Spaces:
Sleeping
Sleeping
| """ | |
| hf_cr_index.py — Shared HuggingFace dataset index helpers. | |
| Used by both build_cr_index.py and orchestrate_cr.py. | |
| Depends only on huggingface_hub (pip install huggingface_hub). | |
| """ | |
| import json | |
| from pathlib import Path | |
def load_hf_index(hf_token: str, hf_repo: str) -> list[dict]:
    """
    Download cr_index.jsonl from the HF dataset and return parsed records.

    Args:
        hf_token: Token passed to ``hf_hub_download`` for authentication.
        hf_repo: Repo id of the dataset that holds ``cr_index.jsonl``.

    Returns:
        One parsed JSON value per non-blank line of the file, in file order.
        An empty list if the file does not exist yet.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        Any error from ``hf_hub_download`` other than ``EntryNotFoundError``
        propagates unchanged.
    """
    from huggingface_hub import hf_hub_download
    from huggingface_hub.errors import EntryNotFoundError

    # Only the download can signal "no index yet", so the try covers
    # nothing else.
    try:
        path = hf_hub_download(
            repo_id=hf_repo,
            filename="cr_index.jsonl",
            repo_type="dataset",
            token=hf_token,
        )
    except EntryNotFoundError:
        return []

    # Iterate the file instead of using str.splitlines(): splitlines() also
    # breaks on U+0085, U+2028 and U+2029, which json.dumps(...,
    # ensure_ascii=False) writes unescaped inside string values. Splitting
    # there would cut a record in half. Text-mode file iteration only breaks
    # on real newlines.
    with Path(path).open(encoding="utf-8") as index_file:
        return [json.loads(line) for line in index_file if line.strip()]
def push_hf_index(records: list[dict], hf_token: str, hf_repo: str) -> None:
    """
    Push all records as cr_index.jsonl to the HF dataset repo.
    Creates the repo if it does not exist.

    Args:
        records: Records to write, one JSON document per line, in order.
        hf_token: Token passed to the Hub calls for authentication.
        hf_repo: Repo id of the target dataset.

    Raises:
        TypeError, ValueError: From ``json.dumps`` if a record cannot be
            serialized. Raised before any Hub call is made.
    """
    from huggingface_hub import HfApi

    # Serialize before touching the Hub so a bad record fails early, with no
    # repo created and nothing left behind on disk.
    payload = "\n".join(
        json.dumps(record, ensure_ascii=False) for record in records
    )

    api = HfApi()
    api.create_repo(
        repo_id=hf_repo,
        repo_type="dataset",
        exist_ok=True,
        token=hf_token,
    )
    # upload_file accepts raw bytes, so no temporary file is needed. This
    # also keeps the uploaded content identical on every platform (no
    # text-mode newline translation).
    api.upload_file(
        path_or_fileobj=payload.encode("utf-8"),
        path_in_repo="cr_index.jsonl",
        repo_id=hf_repo,
        repo_type="dataset",
        token=hf_token,
    )