from __future__ import annotations

import json
import os
from pathlib import Path
from typing import Any

from huggingface_hub import HfApi, hf_hub_download


def load_remote_file(
    api: HfApi,
    repo_id: str,
    path_in_repo: str,
    local_dir: Path,
    *,
    revision: str | None = None,
) -> Path | None:
    """Download one file from a dataset repo, returning ``None`` on any failure.

    Args:
        api: Accepted for signature parity with the other helpers; it is not
            used, because the download goes through ``hf_hub_download``.
        repo_id: Repository identifier passed to ``hf_hub_download``.
        path_in_repo: Path of the file inside the repository.
        local_dir: Directory handed to ``hf_hub_download`` as ``local_dir``.
        revision: Optional revision to download from.

    Returns:
        The local path reported by ``hf_hub_download``, or ``None`` if the
        download raised any exception.
    """
    del api  # intentionally unused
    try:
        downloaded = hf_hub_download(
            repo_id=repo_id,
            filename=path_in_repo,
            repo_type="dataset",
            revision=revision,
            local_dir=str(local_dir),
            token=os.getenv("HF_TOKEN"),
        )
    except Exception:
        # Best-effort lookup: callers probe several candidate paths, so every
        # failure is reported uniformly as "not available".
        return None
    return Path(downloaded)


def load_remote_json_file(
    api: HfApi,
    repo_id: str,
    path_in_repo: str,
    local_dir: Path,
    *,
    revision: str | None = None,
) -> dict[str, Any] | None:
    """Download a file from a dataset repo and parse it as UTF-8 JSON.

    Args:
        api: Forwarded to ``load_remote_file``.
        repo_id: Repository identifier.
        path_in_repo: Path of the JSON file inside the repository.
        local_dir: Directory the file is downloaded into.
        revision: Optional revision to download from.

    Returns:
        The decoded JSON document, or ``None`` when the file could not be
        downloaded.

    Raises:
        ValueError: If the downloaded file is not valid JSON or not valid
            UTF-8 (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are
            both subclasses of ``ValueError``).
    """
    downloaded = load_remote_file(
        api,
        repo_id,
        path_in_repo,
        local_dir,
        revision=revision,
    )
    if downloaded is None:
        return None
    return json.loads(downloaded.read_text(encoding="utf-8"))


def list_remote_paths(
    api: HfApi,
    repo_id: str,
    *,
    revision: str | None = None,
) -> set[str]:
    """Return the set of file paths listed for a dataset repo.

    Args:
        api: Client whose ``dataset_info`` method is queried.
        repo_id: Repository identifier.
        revision: Optional revision to inspect.

    Returns:
        The ``rfilename`` of every entry in the info object's ``siblings``.
        An empty set is returned when the lookup fails or when no siblings
        are reported.
    """
    try:
        try:
            info = api.dataset_info(
                repo_id=repo_id,
                revision=revision,
                files_metadata=True,
            )
        except TypeError:
            # Retry without ``files_metadata`` -- presumably for clients whose
            # ``dataset_info`` does not accept that keyword. The retry stays
            # inside the outer ``try`` so its failures are handled exactly
            # like failures of the first attempt.
            info = api.dataset_info(repo_id=repo_id, revision=revision)
    except Exception:
        return set()
    # ``siblings`` may be missing or explicitly ``None``; treat both as empty.
    siblings = getattr(info, "siblings", None) or []
    return {sibling.rfilename for sibling in siblings}


def _strip_leading_relative_segments(path: str) -> str:
    """Drop leading empty, ``.`` and ``..`` segments from a ``/``-separated path."""
    segments = path.split("/")
    start = 0
    while start < len(segments) and segments[start] in {"", ".", ".."}:
        start += 1
    return "/".join(segments[start:])


def stable_snapshot_candidates(
    latest_payload: dict[str, Any] | None,
    filename: str,
) -> list[str]:
    """Build the ordered list of repo paths where ``filename`` may be found.

    Candidates are derived from the ``manifest_path``, ``snapshot_dir``,
    ``archived_manifest_path`` and ``latest_snapshot_id`` keys of
    ``latest_payload``; the bare ``filename`` is always the last candidate.

    Args:
        latest_payload: Mapping describing the latest snapshot, or ``None``.
        filename: Name of the file to locate.

    Returns:
        De-duplicated candidate paths in priority order. When
        ``latest_payload`` is ``None`` the result is ``[filename]`` as given.
    """
    if latest_payload is None:
        return [filename]

    manifest_path = str(latest_payload.get("manifest_path") or "").strip("/")
    snapshot_dir = str(latest_payload.get("snapshot_dir") or "").strip("/")
    archived_manifest_path = str(
        latest_payload.get("archived_manifest_path") or ""
    ).strip("/")
    latest_snapshot_id = str(latest_payload.get("latest_snapshot_id") or "").strip()
    wants_manifest = filename == "manifest.json"

    candidates: list[str] = []
    if wants_manifest and manifest_path:
        candidates.append(manifest_path)
    if snapshot_dir and snapshot_dir not in {".", "/"}:
        candidates.append(f"{snapshot_dir}/{filename}")
    if wants_manifest and archived_manifest_path:
        candidates.append(archived_manifest_path)
    if manifest_path and "/" in manifest_path:
        manifest_dir = manifest_path.rsplit("/", 1)[0]
        candidates.append(f"{manifest_dir}/{filename}")
    if latest_snapshot_id:
        candidates.append(f"snapshots/{latest_snapshot_id}/{filename}")
    candidates.append(filename)

    # Normalise by whole path segments. ``str.lstrip("./")`` strips a *set of
    # characters*, which would also eat the leading dot of names such as
    # ".gitattributes" or ".cache/manifest.json".
    normalized = (_strip_leading_relative_segments(path) for path in candidates)
    # ``dict.fromkeys`` removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(path for path in normalized if path))