NathanRoll's picture
Keep dataset mirror private by default
7c0c53a verified
from __future__ import annotations
import os
import shutil
import tempfile
from pathlib import Path
# Accepted spellings for boolean environment flags. Callers lowercase the raw
# environment value before testing membership, so only lowercase forms appear.
FALSE_VALUES = {
    "0",
    "false",
    "no",
    "off",
}
TRUE_VALUES = {
    "1",
    "true",
    "yes",
    "on",
}
def _dataset_repo() -> str:
return os.environ.get("PACKING_DATASET_REPO") or os.environ.get("HF_DATASET_REPO") or ""
def _dataset_required() -> bool:
    """Return whether configured Dataset sync/hydration is required to succeed.

    The deployed Space should treat the Dataset as the source of truth. Local
    development can opt back into bundled data with PACKING_ALLOW_BUNDLED_DATA=1
    or PACKING_REQUIRE_DATASET=0.

    Flag values are compared case-insensitively and with surrounding
    whitespace removed, so a padded value such as "0\n" still opts out. When
    neither opt-out applies, the Dataset is required exactly when a repo id is
    configured.
    """
    require_flag = os.environ.get("PACKING_REQUIRE_DATASET", "").strip().lower()
    if require_flag in FALSE_VALUES:
        return False

    bundled_flag = os.environ.get("PACKING_ALLOW_BUNDLED_DATA", "").strip().lower()
    if bundled_flag in TRUE_VALUES:
        return False

    return bool(_dataset_repo())
def maybe_sync_dataset(data_dir: Path) -> str:
    """Mirror verified benchmark data to a Hugging Face Dataset repo if configured.

    Set PACKING_DATASET_REPO or HF_DATASET_REPO to a dataset repo id such as
    "username/packing-benchmark-data". HF_TOKEN should be configured as a
    Space secret with write access. If no repo id is configured, this is a
    no-op.

    The mirror is created private unless PACKING_DATASET_PRIVATE is explicitly
    set to one of FALSE_VALUES; an empty, padded or unrecognised value keeps
    the repo private rather than silently publishing it.

    Args:
        data_dir: Local folder whose contents are uploaded to the repo root
            (previews and Python bytecode caches are excluded).

    Returns:
        A human-readable status string describing what happened.

    Raises:
        RuntimeError: If the sync cannot be performed and `_dataset_required()`
            reports that the Dataset is required.
    """
    repo_id = _dataset_repo()
    if not repo_id:
        return "dataset sync disabled"

    try:
        from huggingface_hub import HfApi
    except Exception as exc:  # pragma: no cover - only active in deployed Space
        if _dataset_required():
            raise RuntimeError(f"dataset sync unavailable for {repo_id}: {exc}") from exc
        return f"dataset sync unavailable: {exc}"

    token = os.environ.get("HF_TOKEN")
    # Fail closed: only an explicit "false"-style value makes the mirror public.
    private_flag = os.environ.get("PACKING_DATASET_PRIVATE", "1").strip().lower()
    private = private_flag not in FALSE_VALUES

    try:
        api = HfApi(token=token)
        # NOTE(review): with exist_ok=True, `private` presumably only takes
        # effect when the repo is first created - confirm that an existing
        # repo's visibility is left unchanged.
        api.create_repo(repo_id=repo_id, repo_type="dataset", private=private, exist_ok=True)
        api.upload_folder(
            repo_id=repo_id,
            repo_type="dataset",
            folder_path=str(data_dir),
            path_in_repo=".",
            ignore_patterns=["previews/*", "__pycache__/*", "*.pyc"],
        )
    except Exception as exc:  # pragma: no cover - network dependent
        if _dataset_required():
            raise RuntimeError(f"dataset sync failed for {repo_id}: {exc}") from exc
        return f"dataset sync failed: {exc}"

    return f"synced to dataset {repo_id}"
def maybe_hydrate_from_dataset(data_dir: Path) -> str:
    """Load benchmark data from the configured Dataset mirror on startup.

    The dataset snapshot is downloaded into a temporary directory and each
    top-level entry is then copied into `data_dir`, replacing any existing
    entry of the same name. Entries in `data_dir` that are absent from the
    snapshot are left alone. If no repo id is configured, this is a no-op.

    Args:
        data_dir: Destination folder; created (with parents) if missing.

    Returns:
        A human-readable status string describing what happened.

    Raises:
        RuntimeError: If hydration cannot be performed and
            `_dataset_required()` reports that the Dataset is required.
    """
    repo_id = _dataset_repo()
    if not repo_id:
        return "dataset hydrate disabled"

    try:
        from huggingface_hub import snapshot_download
    except Exception as exc:  # pragma: no cover - only active in deployed Space
        if _dataset_required():
            raise RuntimeError(f"dataset hydrate unavailable for {repo_id}: {exc}") from exc
        return f"dataset hydrate unavailable: {exc}"

    token = os.environ.get("HF_TOKEN")
    # Download bookkeeping written next to the files by huggingface_hub, not
    # benchmark data: ".huggingface" in older releases and ".cache/huggingface"
    # in newer ones.
    metadata_dirs = {".huggingface", ".cache"}

    try:
        with tempfile.TemporaryDirectory() as tmp:
            snapshot = Path(
                snapshot_download(
                    repo_id=repo_id,
                    repo_type="dataset",
                    token=token,
                    local_dir=tmp,
                    ignore_patterns=["previews/*", "__pycache__/*", "*.pyc"],
                )
            )
            data_dir.mkdir(parents=True, exist_ok=True)
            for source in snapshot.iterdir():
                if source.name in metadata_dirs:
                    continue
                target = data_dir / source.name
                # Replace instead of merging, and clear a target of the
                # "wrong" type first: rmtree fails on a plain file, and copy2
                # onto an existing directory would nest the file inside it.
                if target.is_dir() and not target.is_symlink():
                    shutil.rmtree(target)
                elif target.is_symlink() or (source.is_dir() and target.exists()):
                    target.unlink()
                if source.is_dir():
                    shutil.copytree(source, target)
                else:
                    shutil.copy2(source, target)
    except Exception as exc:  # pragma: no cover - network dependent
        if _dataset_required():
            raise RuntimeError(f"dataset hydrate failed for {repo_id}: {exc}") from exc
        return f"dataset hydrate failed; using bundled data: {exc}"

    return f"hydrated from dataset {repo_id}"