Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import os | |
| import shutil | |
| import tempfile | |
| from pathlib import Path | |
# Lower-case spellings accepted when an environment variable is read as a
# boolean flag; callers lower-case the raw value before the membership test.
TRUE_VALUES = {"1", "true", "yes", "on"}
FALSE_VALUES = {"0", "false", "no", "off"}
def _dataset_repo() -> str:
    """Return the configured Hugging Face Dataset repo id, or "" if none is set.

    PACKING_DATASET_REPO takes precedence over HF_DATASET_REPO. Surrounding
    whitespace is stripped so a value pasted with a trailing newline still
    resolves to a usable repo id, and a blank or whitespace-only value falls
    through to the next variable instead of being used as a repo id.
    """
    for variable in ("PACKING_DATASET_REPO", "HF_DATASET_REPO"):
        repo_id = os.environ.get(variable, "").strip()
        if repo_id:
            return repo_id
    return ""
def _dataset_required() -> bool:
    """Whether configured Dataset sync/hydration is required to succeed.

    The deployed Space should treat the Dataset as the source of truth. Local
    development can opt back into bundled data with PACKING_ALLOW_BUNDLED_DATA=1
    or PACKING_REQUIRE_DATASET=0.

    Flag values are compared case-insensitively and with surrounding
    whitespace removed, so a value such as "0\\n" is still honoured. When
    neither opt-out is active, the Dataset is required exactly when a repo
    id is configured.
    """
    require_flag = os.environ.get("PACKING_REQUIRE_DATASET", "").strip().lower()
    if require_flag in FALSE_VALUES:
        return False
    bundled_flag = os.environ.get("PACKING_ALLOW_BUNDLED_DATA", "").strip().lower()
    if bundled_flag in TRUE_VALUES:
        return False
    return bool(_dataset_repo())
def maybe_sync_dataset(data_dir: Path) -> str:
    """Mirror verified benchmark data to a Hugging Face Dataset repo if configured.

    Set PACKING_DATASET_REPO or HF_DATASET_REPO to a dataset repo id such as
    "username/packing-benchmark-data". HF_TOKEN should be configured as a
    Space secret with write access. If unset, this is a no-op.

    PACKING_DATASET_PRIVATE controls the visibility passed to ``create_repo``;
    it defaults to private when unset or blank.

    Returns:
        A short human-readable status string describing what happened.

    Raises:
        RuntimeError: if the Dataset is required (see ``_dataset_required``)
            and huggingface_hub cannot be imported or the upload fails.
    """
    repo_id = _dataset_repo()
    if not repo_id:
        return "dataset sync disabled"

    try:
        from huggingface_hub import HfApi
    except Exception as exc:  # pragma: no cover - only active in deployed Space
        if _dataset_required():
            raise RuntimeError(f"dataset sync unavailable for {repo_id}: {exc}") from exc
        return f"dataset sync unavailable: {exc}"

    token = os.environ.get("HF_TOKEN")
    # A variable that is defined but blank must behave like an unset one;
    # otherwise an empty value would fail the TRUE_VALUES test and the repo
    # would be created public by accident.
    private_flag = os.environ.get("PACKING_DATASET_PRIVATE", "").strip().lower() or "1"
    private = private_flag in TRUE_VALUES

    try:
        api = HfApi(token=token)
        api.create_repo(repo_id=repo_id, repo_type="dataset", private=private, exist_ok=True)
        api.upload_folder(
            repo_id=repo_id,
            repo_type="dataset",
            folder_path=str(data_dir),
            path_in_repo=".",
            ignore_patterns=["previews/*", "__pycache__/*", "*.pyc"],
        )
    except Exception as exc:  # pragma: no cover - network dependent
        if _dataset_required():
            raise RuntimeError(f"dataset sync failed for {repo_id}: {exc}") from exc
        return f"dataset sync failed: {exc}"
    return f"synced to dataset {repo_id}"
# Top-level snapshot entries that hold hub download bookkeeping rather than
# benchmark data. ".huggingface" is the name the original code skipped;
# ".cache" is where newer huggingface_hub releases appear to keep local_dir
# metadata (".cache/huggingface") - NOTE(review): confirm against the pinned
# huggingface_hub version.
_HUB_METADATA_DIRS = frozenset({".huggingface", ".cache"})


def _replace_path(source: Path, target: Path) -> None:
    """Copy *source* to *target*, replacing whatever currently exists there."""
    source_is_dir = source.is_dir()
    if target.is_dir() and not target.is_symlink():
        # A real directory is always removed first: either to get a clean
        # mirror of the incoming directory, or so an incoming file replaces
        # it instead of being copied *into* it.
        shutil.rmtree(target)
    elif source_is_dir and (target.is_symlink() or target.exists()):
        # copytree refuses to write over an existing file or link.
        target.unlink()

    if source_is_dir:
        shutil.copytree(source, target)
    else:
        shutil.copy2(source, target)


def maybe_hydrate_from_dataset(data_dir: Path) -> str:
    """Load benchmark data from the configured Dataset mirror on startup.

    The dataset snapshot is downloaded into a temporary directory and each
    top-level entry is then copied into ``data_dir``, replacing any entry of
    the same name. Entries in ``data_dir`` that are absent from the snapshot
    are left untouched.

    Returns:
        A short human-readable status string describing what happened.

    Raises:
        RuntimeError: if the Dataset is required (see ``_dataset_required``)
            and huggingface_hub cannot be imported or the download/copy fails.
    """
    repo_id = _dataset_repo()
    if not repo_id:
        return "dataset hydrate disabled"

    try:
        from huggingface_hub import snapshot_download
    except Exception as exc:  # pragma: no cover - only active in deployed Space
        if _dataset_required():
            raise RuntimeError(f"dataset hydrate unavailable for {repo_id}: {exc}") from exc
        return f"dataset hydrate unavailable: {exc}"

    token = os.environ.get("HF_TOKEN")
    try:
        with tempfile.TemporaryDirectory() as tmp:
            snapshot = Path(
                snapshot_download(
                    repo_id=repo_id,
                    repo_type="dataset",
                    token=token,
                    local_dir=tmp,
                    ignore_patterns=["previews/*", "__pycache__/*", "*.pyc"],
                )
            )
            data_dir.mkdir(parents=True, exist_ok=True)
            for source in snapshot.iterdir():
                if source.name in _HUB_METADATA_DIRS:
                    continue
                _replace_path(source, data_dir / source.name)
    except Exception as exc:  # pragma: no cover - network dependent
        if _dataset_required():
            raise RuntimeError(f"dataset hydrate failed for {repo_id}: {exc}") from exc
        return f"dataset hydrate failed; using bundled data: {exc}"
    return f"hydrated from dataset {repo_id}"