Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import json | |
| import os | |
| from pathlib import Path | |
| from typing import Any | |
| from huggingface_hub import HfApi, hf_hub_download | |
| def load_remote_file( | |
| api: HfApi, | |
| repo_id: str, | |
| path_in_repo: str, | |
| local_dir: Path, | |
| *, | |
| revision: str | None = None, | |
| ) -> Path | None: | |
| del api | |
| try: | |
| downloaded = hf_hub_download( | |
| repo_id=repo_id, | |
| filename=path_in_repo, | |
| repo_type="dataset", | |
| revision=revision, | |
| local_dir=str(local_dir), | |
| token=os.getenv("HF_TOKEN"), | |
| ) | |
| except Exception: | |
| return None | |
| return Path(downloaded) | |
def load_remote_json_file(
    api: HfApi,
    repo_id: str,
    path_in_repo: str,
    local_dir: Path,
    *,
    revision: str | None = None,
) -> dict[str, Any] | None:
    """Fetch a JSON file from a dataset repo and return the parsed payload.

    Mirrors ``load_remote_file``'s best-effort behaviour: ``None`` when the
    underlying download did not succeed.
    """
    downloaded_path = load_remote_file(
        api,
        repo_id,
        path_in_repo,
        local_dir,
        revision=revision,
    )
    if downloaded_path is not None:
        text = downloaded_path.read_text(encoding="utf-8")
        return json.loads(text)
    return None
def list_remote_paths(api: HfApi, repo_id: str, *, revision: str | None = None) -> set[str]:
    """Return the set of file paths in a dataset repo.

    Best-effort: any lookup failure (missing repo, auth, network) yields an
    empty set rather than raising.
    """
    try:
        try:
            info = api.dataset_info(repo_id=repo_id, revision=revision, files_metadata=True)
        except TypeError:
            # Older huggingface_hub releases do not accept files_metadata;
            # retry without it.  Nesting the fallback inside the outer try
            # keeps failures on old clients best-effort too (previously they
            # propagated while new-client failures returned set()).
            info = api.dataset_info(repo_id=repo_id, revision=revision)
    except Exception:
        return set()
    return {sibling.rfilename for sibling in getattr(info, "siblings", [])}
def stable_snapshot_candidates(latest_payload: dict[str, Any] | None, filename: str) -> list[str]:
    """Build an ordered, de-duplicated list of repo paths to try for *filename*.

    Candidates are derived from the latest-pointer payload — explicit manifest
    paths first (when *filename* is the manifest), then snapshot-directory
    locations — and always end with the bare *filename* as a last resort.
    Without a payload, only the bare filename is returned.
    """
    if latest_payload is None:
        return [filename]

    candidates: list[str] = []
    manifest_path = str(latest_payload.get("manifest_path") or "").strip("/")
    snapshot_dir = str(latest_payload.get("snapshot_dir") or "").strip("/")
    latest_snapshot_id = str(latest_payload.get("latest_snapshot_id") or "").strip()

    if filename == "manifest.json" and manifest_path:
        candidates.append(manifest_path)
    if snapshot_dir and snapshot_dir not in {".", "/"}:
        # Fixed: the candidate must point at *filename* inside the snapshot
        # directory (the source had a corrupted literal here).
        candidates.append(f"{snapshot_dir}/{filename}")
    archived_manifest_path = str(latest_payload.get("archived_manifest_path") or "").strip("/")
    if filename == "manifest.json" and archived_manifest_path:
        candidates.append(archived_manifest_path)
    if manifest_path and "/" in manifest_path:
        manifest_dir = manifest_path.rsplit("/", 1)[0]
        candidates.append(f"{manifest_dir}/{filename}")
    if latest_snapshot_id:
        candidates.append(f"snapshots/{latest_snapshot_id}/{filename}")
    candidates.append(filename)

    deduped: list[str] = []
    seen: set[str] = set()
    for candidate in candidates:
        # str.lstrip("./") strips any mix of '.' and '/' characters and would
        # mangle dotfile names such as ".hidden"; remove only explicit "./"
        # prefixes instead.
        normalized = candidate
        while normalized.startswith("./"):
            normalized = normalized[2:]
        if not normalized or normalized in seen:
            continue
        seen.add(normalized)
        deduped.append(normalized)
    return deduped