"""Optional mirroring of benchmark data to and from a Hugging Face Dataset repo."""

from __future__ import annotations

import os
import shutil
import tempfile
from pathlib import Path

FALSE_VALUES = {"0", "false", "no", "off"}
TRUE_VALUES = {"1", "true", "yes", "on"}

# Paths that are neither uploaded to nor downloaded from the Dataset repo.
_IGNORE_PATTERNS = ("previews/*", "__pycache__/*", "*.pyc")

# Top-level folders that huggingface_hub may create inside ``local_dir`` for its
# own download bookkeeping (``.huggingface`` in older releases,
# ``.cache/huggingface`` in newer ones).  They are not benchmark data.
_SNAPSHOT_METADATA_DIRS = frozenset({".huggingface", ".cache"})


def _env_value(name: str, default: str = "") -> str:
    """Return environment variable *name* normalised for flag comparison.

    The value is stripped and lower-cased so that e.g. ``" Off "`` matches the
    entries of ``FALSE_VALUES``.
    """
    return os.environ.get(name, default).strip().lower()


def _dataset_repo() -> str:
    """Return the configured dataset repo id, or ``""`` when none is set.

    ``PACKING_DATASET_REPO`` takes precedence over ``HF_DATASET_REPO``.  Empty
    or whitespace-only values are treated as unset.
    """
    for name in ("PACKING_DATASET_REPO", "HF_DATASET_REPO"):
        value = os.environ.get(name, "").strip()
        if value:
            return value
    return ""


def _dataset_required() -> bool:
    """Whether configured Dataset sync/hydration is required to succeed.

    The deployed Space should treat the Dataset as the source of truth. Local
    development can opt back into bundled data with PACKING_ALLOW_BUNDLED_DATA=1
    or PACKING_REQUIRE_DATASET=0.
    """
    if _env_value("PACKING_REQUIRE_DATASET") in FALSE_VALUES:
        return False
    if _env_value("PACKING_ALLOW_BUNDLED_DATA") in TRUE_VALUES:
        return False
    return bool(_dataset_repo())


def _clear_path(path: Path) -> None:
    """Delete *path*, whether it is a file, a symlink or a directory tree."""
    if path.is_dir() and not path.is_symlink():
        shutil.rmtree(path)
    else:
        # Regular files and symlinks (including links to directories, which
        # shutil.rmtree refuses to handle).
        path.unlink()


def _copy_snapshot(snapshot: Path, data_dir: Path) -> None:
    """Copy every top-level entry of *snapshot* into *data_dir*.

    Existing directories are replaced wholesale rather than merged.  An entry
    whose type changed between runs (file <-> directory) is replaced as well,
    instead of raising or being copied into the stale directory.  Hub metadata
    folders listed in ``_SNAPSHOT_METADATA_DIRS`` are skipped.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    for source in snapshot.iterdir():
        if source.name in _SNAPSHOT_METADATA_DIRS:
            continue
        target = data_dir / source.name
        if source.is_dir():
            # is_symlink() also catches dangling links, for which exists()
            # reports False but copytree would still fail.
            if target.exists() or target.is_symlink():
                _clear_path(target)
            shutil.copytree(source, target)
        else:
            # copy2 overwrites an existing file in place, but would copy
            # *into* an existing directory, so remove that first.
            if target.is_dir():
                _clear_path(target)
            shutil.copy2(source, target)


def maybe_sync_dataset(data_dir: Path) -> str:
    """Mirror verified benchmark data to a Hugging Face Dataset repo if configured.

    Set PACKING_DATASET_REPO or HF_DATASET_REPO to a dataset repo id such as
    "username/packing-benchmark-data". HF_TOKEN should be configured as a Space
    secret with write access. If unset, this is a no-op.

    Args:
        data_dir: Local folder whose contents are uploaded to the repo root.

    Returns:
        A human-readable status message.

    Raises:
        RuntimeError: If the sync cannot be performed and ``_dataset_required()``
            says the Dataset is mandatory.
    """
    repo_id = _dataset_repo()
    if not repo_id:
        return "dataset sync disabled"

    try:
        from huggingface_hub import HfApi
    except Exception as exc:  # pragma: no cover - only active in deployed Space
        if _dataset_required():
            raise RuntimeError(f"dataset sync unavailable for {repo_id}: {exc}") from exc
        return f"dataset sync unavailable: {exc}"

    token = os.environ.get("HF_TOKEN")
    # Repos are created private unless PACKING_DATASET_PRIVATE is set to a
    # value outside TRUE_VALUES.
    private = _env_value("PACKING_DATASET_PRIVATE", "1") in TRUE_VALUES
    try:
        api = HfApi(token=token)
        api.create_repo(repo_id=repo_id, repo_type="dataset", private=private, exist_ok=True)
        api.upload_folder(
            repo_id=repo_id,
            repo_type="dataset",
            folder_path=str(data_dir),
            path_in_repo=".",
            ignore_patterns=list(_IGNORE_PATTERNS),
        )
    except Exception as exc:  # pragma: no cover - network dependent
        if _dataset_required():
            raise RuntimeError(f"dataset sync failed for {repo_id}: {exc}") from exc
        return f"dataset sync failed: {exc}"
    return f"synced to dataset {repo_id}"


def maybe_hydrate_from_dataset(data_dir: Path) -> str:
    """Load benchmark data from the configured Dataset mirror on startup.

    The repo is downloaded into a temporary directory and its top-level
    entries are then copied over *data_dir*.

    Args:
        data_dir: Local data folder to populate; created if missing.

    Returns:
        A human-readable status message.

    Raises:
        RuntimeError: If hydration cannot be performed and
            ``_dataset_required()`` says the Dataset is mandatory.
    """
    repo_id = _dataset_repo()
    if not repo_id:
        return "dataset hydrate disabled"

    try:
        from huggingface_hub import snapshot_download
    except Exception as exc:  # pragma: no cover - only active in deployed Space
        if _dataset_required():
            raise RuntimeError(f"dataset hydrate unavailable for {repo_id}: {exc}") from exc
        return f"dataset hydrate unavailable: {exc}"

    token = os.environ.get("HF_TOKEN")
    try:
        with tempfile.TemporaryDirectory() as tmp:
            snapshot = Path(
                snapshot_download(
                    repo_id=repo_id,
                    repo_type="dataset",
                    token=token,
                    local_dir=tmp,
                    ignore_patterns=list(_IGNORE_PATTERNS),
                )
            )
            _copy_snapshot(snapshot, data_dir)
    except Exception as exc:  # pragma: no cover - network dependent
        if _dataset_required():
            raise RuntimeError(f"dataset hydrate failed for {repo_id}: {exc}") from exc
        return f"dataset hydrate failed; using bundled data: {exc}"
    return f"hydrated from dataset {repo_id}"