# Source: Hugging Face Space file view (Space status: Sleeping; file size: 4,126 Bytes).
from __future__ import annotations
import os
import shutil
import tempfile
from pathlib import Path
# Spellings that switch an environment flag off. Callers lower-case the raw
# environment value before testing membership, so only lower-case forms appear.
FALSE_VALUES = {"0", "false", "no", "off"}
# Spellings that switch an environment flag on (compared after lower-casing).
TRUE_VALUES = {"1", "true", "yes", "on"}
def _dataset_repo() -> str:
    """Return the configured Dataset repo id, or "" when none is configured.

    PACKING_DATASET_REPO takes precedence; HF_DATASET_REPO is consulted only
    when the former is unset or empty.
    """
    for variable in ("PACKING_DATASET_REPO", "HF_DATASET_REPO"):
        configured = os.environ.get(variable)
        if configured:
            return configured
    return ""
def _dataset_required() -> bool:
    """Report whether Dataset sync/hydration failures must be treated as fatal.

    A deployed Space treats the Dataset as the source of truth, so once a
    repo is configured the answer is True. Local development can fall back
    to bundled data by setting PACKING_REQUIRE_DATASET=0 or
    PACKING_ALLOW_BUNDLED_DATA=1, either of which makes the answer False.
    """
    require_flag = os.environ.get("PACKING_REQUIRE_DATASET", "").lower()
    allow_bundled_flag = os.environ.get("PACKING_ALLOW_BUNDLED_DATA", "").lower()
    opted_out = require_flag in FALSE_VALUES or allow_bundled_flag in TRUE_VALUES
    return False if opted_out else bool(_dataset_repo())
def maybe_sync_dataset(data_dir: Path) -> str:
    """Push the verified benchmark data to the configured Hugging Face Dataset.

    The destination is taken from PACKING_DATASET_REPO or HF_DATASET_REPO
    (a dataset repo id such as "username/packing-benchmark-data"), and
    HF_TOKEN should be a Space secret with write access. When no repo is
    configured the call does nothing.

    Returns a short human-readable status string. Raises RuntimeError instead
    of returning a failure status when the Dataset is required.
    """
    target_repo = _dataset_repo()
    if not target_repo:
        return "dataset sync disabled"

    try:
        from huggingface_hub import HfApi
    except Exception as import_error:  # pragma: no cover - only active in deployed Space
        if not _dataset_required():
            return f"dataset sync unavailable: {import_error}"
        raise RuntimeError(
            f"dataset sync unavailable for {target_repo}: {import_error}"
        ) from import_error

    # The mirror is private unless PACKING_DATASET_PRIVATE says otherwise.
    make_private = os.environ.get("PACKING_DATASET_PRIVATE", "1").lower() in TRUE_VALUES
    skipped_patterns = ["previews/*", "__pycache__/*", "*.pyc"]

    try:
        client = HfApi(token=os.environ.get("HF_TOKEN"))
        client.create_repo(
            repo_id=target_repo,
            repo_type="dataset",
            private=make_private,
            exist_ok=True,
        )
        client.upload_folder(
            repo_id=target_repo,
            repo_type="dataset",
            folder_path=str(data_dir),
            path_in_repo=".",
            ignore_patterns=skipped_patterns,
        )
    except Exception as sync_error:  # pragma: no cover - network dependent
        if not _dataset_required():
            return f"dataset sync failed: {sync_error}"
        raise RuntimeError(
            f"dataset sync failed for {target_repo}: {sync_error}"
        ) from sync_error
    else:
        return f"synced to dataset {target_repo}"
# Top-level entries that snapshot_download may create inside ``local_dir`` for
# its own bookkeeping. Older huggingface_hub releases used ".huggingface";
# newer ones keep download metadata under ".cache/huggingface". Neither is
# benchmark data, so neither should be copied into the data directory.
_SNAPSHOT_METADATA_NAMES = frozenset({".huggingface", ".cache"})


def _mirror_snapshot(snapshot: Path, data_dir: Path) -> None:
    """Copy each top-level entry of *snapshot* into *data_dir*, replacing stale copies."""
    data_dir.mkdir(parents=True, exist_ok=True)
    for source in snapshot.iterdir():
        if source.name in _SNAPSHOT_METADATA_NAMES:
            continue
        target = data_dir / source.name
        # A symlink to a directory must be unlinked; rmtree refuses symlinks.
        target_is_real_dir = target.is_dir() and not target.is_symlink()
        if source.is_dir():
            # Replace whatever is there so files deleted upstream do not linger.
            if target_is_real_dir:
                shutil.rmtree(target)
            elif target.exists() or target.is_symlink():
                target.unlink()
            shutil.copytree(source, target)
        else:
            # copy2 onto an existing directory would nest the file inside it
            # instead of replacing it, so clear a conflicting directory first.
            if target_is_real_dir:
                shutil.rmtree(target)
            shutil.copy2(source, target)


def maybe_hydrate_from_dataset(data_dir: Path) -> str:
    """Load benchmark data from the configured Dataset mirror on startup.

    The Dataset named by PACKING_DATASET_REPO / HF_DATASET_REPO is downloaded
    into a temporary directory and its top-level entries are copied into
    *data_dir*: directories are replaced wholesale, files are overwritten.
    Hub bookkeeping folders (".huggingface", ".cache") are not copied.

    Args:
        data_dir: Local directory that receives the data; created if missing.

    Returns:
        A short human-readable status string describing what happened.

    Raises:
        RuntimeError: If hydration cannot be performed or fails while the
            Dataset is required (see ``_dataset_required``).
    """
    repo_id = _dataset_repo()
    if not repo_id:
        return "dataset hydrate disabled"
    try:
        from huggingface_hub import snapshot_download
    except Exception as exc:  # pragma: no cover - only active in deployed Space
        if _dataset_required():
            raise RuntimeError(f"dataset hydrate unavailable for {repo_id}: {exc}") from exc
        return f"dataset hydrate unavailable: {exc}"
    token = os.environ.get("HF_TOKEN")
    try:
        # The snapshot lives only inside the temporary directory, so the copy
        # into data_dir has to finish before the context manager exits.
        with tempfile.TemporaryDirectory() as tmp:
            snapshot = Path(
                snapshot_download(
                    repo_id=repo_id,
                    repo_type="dataset",
                    token=token,
                    local_dir=tmp,
                    ignore_patterns=["previews/*", "__pycache__/*", "*.pyc"],
                )
            )
            _mirror_snapshot(snapshot, data_dir)
    except Exception as exc:  # pragma: no cover - network dependent
        if _dataset_required():
            raise RuntimeError(f"dataset hydrate failed for {repo_id}: {exc}") from exc
        return f"dataset hydrate failed; using bundled data: {exc}"
    return f"hydrated from dataset {repo_id}"
|