# diffusers-pr-api/src/slop_farmer/data/hf_dataset_repo.py
# Author: evalstate (HF Staff)
# Commit: "Deploy Diffusers PR API" (dbf7313, verified)
from __future__ import annotations
import json
import os
from pathlib import Path
from typing import Any
from huggingface_hub import HfApi, hf_hub_download
def load_remote_file(
    api: HfApi,
    repo_id: str,
    path_in_repo: str,
    local_dir: Path,
    *,
    revision: str | None = None,
) -> Path | None:
    """Download one file from a Hub dataset repo, best-effort.

    Args:
        api: Unused; accepted so the signature lines up with the other
            helpers in this module that take an ``HfApi``.
        repo_id: Dataset repository to download from.
        path_in_repo: File path inside the repository.
        local_dir: Directory handed to ``hf_hub_download`` as ``local_dir``.
        revision: Optional revision forwarded to ``hf_hub_download``.

    Returns:
        The local path of the downloaded file, or ``None`` if the download
        raised any exception.
    """
    del api  # the download goes through the module-level hf_hub_download
    try:
        # The token is read from the HF_TOKEN environment variable on every
        # call; it is None when the variable is unset.
        request = {
            "repo_id": repo_id,
            "filename": path_in_repo,
            "repo_type": "dataset",
            "revision": revision,
            "local_dir": str(local_dir),
            "token": os.getenv("HF_TOKEN"),
        }
        local_path = hf_hub_download(**request)
    except Exception:
        # Deliberate best-effort: every failure is reported as "no file".
        return None
    else:
        return Path(local_path)
def load_remote_json_file(
    api: HfApi,
    repo_id: str,
    path_in_repo: str,
    local_dir: Path,
    *,
    revision: str | None = None,
) -> dict[str, Any] | None:
    """Download a file from a dataset repo and parse it as JSON.

    The arguments are forwarded unchanged to ``load_remote_file``.

    Returns:
        The decoded JSON document, or ``None`` when ``load_remote_file``
        could not fetch the file.

    Raises:
        Whatever ``Path.read_text`` or ``json.loads`` raise for a file that
        was downloaded but is unreadable or not valid JSON; those errors are
        not suppressed here.
    """
    local_path = load_remote_file(api, repo_id, path_in_repo, local_dir, revision=revision)
    if local_path is not None:
        raw_text = local_path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return None
def list_remote_paths(api: HfApi, repo_id: str, *, revision: str | None = None) -> set[str]:
    """Return the set of file paths present in a dataset repo, best-effort.

    Args:
        api: Client object exposing ``dataset_info``.
        repo_id: Dataset repository to inspect.
        revision: Optional revision forwarded to ``dataset_info``.

    Returns:
        The ``rfilename`` of every entry in the repo info's ``siblings``.
        An empty set is returned when the lookup fails for any reason or
        when the info carries no sibling list.
    """
    try:
        try:
            info = api.dataset_info(repo_id=repo_id, revision=revision, files_metadata=True)
        except TypeError:
            # Presumably a client whose dataset_info() does not accept
            # ``files_metadata``; retry without it. The retry sits inside the
            # outer ``try`` so that its failures are also reported as "no
            # paths" instead of escaping to the caller.
            info = api.dataset_info(repo_id=repo_id, revision=revision)
    except Exception:
        return set()
    # ``siblings`` may be absent or explicitly None; both mean "no files".
    siblings = getattr(info, "siblings", None) or ()
    return {sibling.rfilename for sibling in siblings}
def _strip_leading_dot_segments(path: str) -> str:
    """Drop leading empty, ``.`` and ``..`` segments from a ``/``-separated path."""
    segments = path.split("/")
    start = 0
    while start < len(segments) and segments[start] in ("", ".", ".."):
        start += 1
    return "/".join(segments[start:])


def stable_snapshot_candidates(latest_payload: dict[str, Any] | None, filename: str) -> list[str]:
    """List repo paths where ``filename`` may live, most specific first.

    Args:
        latest_payload: Mapping that may carry ``manifest_path``,
            ``snapshot_dir``, ``archived_manifest_path`` and
            ``latest_snapshot_id``; missing or falsy values are ignored.
            ``None`` means no payload is available.
        filename: Name of the file to locate.

    Returns:
        De-duplicated candidate paths in priority order. With no payload the
        result is just ``[filename]``. Otherwise the order is:

        1. ``manifest_path`` itself (only when looking for ``manifest.json``)
        2. ``<snapshot_dir>/<filename>``
        3. ``archived_manifest_path`` (only when looking for ``manifest.json``)
        4. ``<directory of manifest_path>/<filename>``
        5. ``snapshots/<latest_snapshot_id>/<filename>``
        6. ``filename`` at the repo root
    """
    if latest_payload is None:
        return [filename]

    manifest_path = str(latest_payload.get("manifest_path") or "").strip("/")
    snapshot_dir = str(latest_payload.get("snapshot_dir") or "").strip("/")
    latest_snapshot_id = str(latest_payload.get("latest_snapshot_id") or "").strip()
    archived_manifest_path = str(latest_payload.get("archived_manifest_path") or "").strip("/")
    wants_manifest = filename == "manifest.json"

    candidates: list[str] = []
    if wants_manifest and manifest_path:
        candidates.append(manifest_path)
    # A snapshot_dir of "." (or "/", already reduced to "" by strip) is the
    # repo root, which the final bare-filename candidate covers.
    if snapshot_dir and snapshot_dir not in {".", "/"}:
        candidates.append(f"{snapshot_dir}/{filename}")
    if wants_manifest and archived_manifest_path:
        candidates.append(archived_manifest_path)
    if manifest_path and "/" in manifest_path:
        manifest_dir = manifest_path.rsplit("/", 1)[0]
        candidates.append(f"{manifest_dir}/{filename}")
    if latest_snapshot_id:
        candidates.append(f"snapshots/{latest_snapshot_id}/{filename}")
    candidates.append(filename)

    deduped: list[str] = []
    seen: set[str] = set()
    for candidate in candidates:
        # Normalize by whole path segments. A character-set strip such as
        # lstrip("./") would also eat the leading dot of names like
        # ".gitattributes" or ".cache/x".
        normalized = _strip_leading_dot_segments(candidate)
        if normalized and normalized not in seen:
            seen.add(normalized)
            deduped.append(normalized)
    return deduped