Spaces:

NathanRoll
/

packing-benchmark

Sleeping

App Files Files Community

packing-benchmark / packing_benchmark /hub_sync.py

NathanRoll

Keep dataset mirror private by default

7c0c53a verified 18 days ago

raw

history blame contribute delete

4.13 kB

	from __future__ import annotations

	import os
	import shutil
	import tempfile
	from pathlib import Path


	# Lower-cased environment-variable spellings that this module reads as "off".
	FALSE_VALUES = {"0", "false", "no", "off"}
	# Lower-cased environment-variable spellings that this module reads as "on".
	TRUE_VALUES = {"1", "true", "yes", "on"}


	def _dataset_repo() -> str:
	    """Return the configured dataset repo id, or an empty string if none is set.

	    PACKING_DATASET_REPO takes precedence over HF_DATASET_REPO. A variable
	    that is unset or empty falls through to the next candidate.
	    """
	    for variable in ("PACKING_DATASET_REPO", "HF_DATASET_REPO"):
	        configured = os.environ.get(variable)
	        if configured:
	            return configured
	    return ""


	def _dataset_required() -> bool:
	    """Report whether a Dataset sync/hydration failure must be treated as fatal.

	    The deployed Space should treat the Dataset as the source of truth, so a
	    configured repo is required by default. Local development can opt back
	    into bundled data with PACKING_ALLOW_BUNDLED_DATA=1 or
	    PACKING_REQUIRE_DATASET=0. With no repo configured nothing is required.
	    """
	    require_flag = os.environ.get("PACKING_REQUIRE_DATASET", "").lower()
	    bundled_flag = os.environ.get("PACKING_ALLOW_BUNDLED_DATA", "").lower()

	    # Either explicit opt-out wins over the presence of a configured repo.
	    opted_out = require_flag in FALSE_VALUES or bundled_flag in TRUE_VALUES
	    return False if opted_out else bool(_dataset_repo())


	def maybe_sync_dataset(data_dir: Path) -> str:
	    """Mirror verified benchmark data to a Hugging Face Dataset repo if configured.

	    Set PACKING_DATASET_REPO or HF_DATASET_REPO to a dataset repo id such as
	    "username/packing-benchmark-data". HF_TOKEN should be configured as a
	    Space secret with write access. If no repo is configured, this is a no-op.

	    The repo is created private unless PACKING_DATASET_PRIVATE is explicitly
	    set to one of FALSE_VALUES ("0", "false", "no", "off"). Empty, padded or
	    unrecognised values keep the mirror private.

	    Args:
	        data_dir: Local folder whose contents are uploaded to the repo root.

	    Returns:
	        A human-readable status string describing what happened.

	    Raises:
	        RuntimeError: If the sync cannot be performed and _dataset_required()
	            reports that the Dataset is mandatory.
	    """
	    repo_id = _dataset_repo()
	    if not repo_id:
	        return "dataset sync disabled"

	    # Imported lazily so the module still loads where huggingface_hub is absent.
	    try:
	        from huggingface_hub import HfApi
	    except Exception as exc:  # pragma: no cover - only active in deployed Space
	        if _dataset_required():
	            raise RuntimeError(f"dataset sync unavailable for {repo_id}: {exc}") from exc
	        return f"dataset sync unavailable: {exc}"

	    token = os.environ.get("HF_TOKEN")
	    # Fail closed: only an explicit opt-out makes the mirror public, so a blank
	    # or mistyped value can never expose the data by accident.
	    private_flag = os.environ.get("PACKING_DATASET_PRIVATE", "1").strip().lower()
	    private = private_flag not in FALSE_VALUES
	    try:
	        api = HfApi(token=token)
	        # exist_ok=True: an already-existing repo is reused rather than an error.
	        api.create_repo(repo_id=repo_id, repo_type="dataset", private=private, exist_ok=True)
	        api.upload_folder(
	            repo_id=repo_id,
	            repo_type="dataset",
	            folder_path=str(data_dir),
	            path_in_repo=".",
	            ignore_patterns=["previews/", "__pycache__/", "*.pyc"],
	        )
	    except Exception as exc:  # pragma: no cover - network dependent
	        if _dataset_required():
	            raise RuntimeError(f"dataset sync failed for {repo_id}: {exc}") from exc
	        return f"dataset sync failed: {exc}"
	    return f"synced to dataset {repo_id}"


	def _remove_existing(path: Path) -> None:
	    """Delete whatever is at ``path``: a directory tree, a file, or a symlink."""
	    if path.is_dir() and not path.is_symlink():
	        shutil.rmtree(path)
	    elif path.exists() or path.is_symlink():
	        # Covers regular files and symlinks (including broken ones); rmtree
	        # refuses to operate on a symlink, so links are unlinked instead.
	        path.unlink()


	def _replace_entry(source: Path, target: Path) -> None:
	    """Copy one snapshot entry to ``target``, replacing a conflicting entry."""
	    if source.is_dir():
	        # Directories are replaced wholesale so stale files do not linger.
	        _remove_existing(target)
	        shutil.copytree(source, target)
	    else:
	        # A directory in the way would make copy2 write *into* it; remove it.
	        if target.is_dir():
	            _remove_existing(target)
	        shutil.copy2(source, target)


	def maybe_hydrate_from_dataset(data_dir: Path) -> str:
	    """Load benchmark data from the configured Dataset mirror on startup.

	    The dataset snapshot is downloaded into a temporary directory and each
	    top-level entry is then copied into ``data_dir``. Directories replace any
	    existing entry of the same name; files overwrite. Entries in ``data_dir``
	    that are absent from the snapshot are left untouched. Download metadata
	    folders written by huggingface_hub are not copied.

	    Args:
	        data_dir: Local data folder to populate (created if missing).

	    Returns:
	        A human-readable status string describing what happened.

	    Raises:
	        RuntimeError: If hydration cannot be performed and
	            _dataset_required() reports that the Dataset is mandatory.
	    """
	    repo_id = _dataset_repo()
	    if not repo_id:
	        return "dataset hydrate disabled"

	    # Imported lazily so the module still loads where huggingface_hub is absent.
	    try:
	        from huggingface_hub import snapshot_download
	    except Exception as exc:  # pragma: no cover - only active in deployed Space
	        if _dataset_required():
	            raise RuntimeError(f"dataset hydrate unavailable for {repo_id}: {exc}") from exc
	        return f"dataset hydrate unavailable: {exc}"

	    token = os.environ.get("HF_TOKEN")
	    # huggingface_hub keeps local_dir bookkeeping in ".huggingface" (older
	    # releases) or ".cache/huggingface" (current releases); neither is data.
	    metadata_names = {".huggingface", ".cache"}
	    try:
	        with tempfile.TemporaryDirectory() as tmp:
	            snapshot = Path(
	                snapshot_download(
	                    repo_id=repo_id,
	                    repo_type="dataset",
	                    token=token,
	                    local_dir=tmp,
	                    ignore_patterns=["previews/", "__pycache__/", "*.pyc"],
	                )
	            )
	            data_dir.mkdir(parents=True, exist_ok=True)
	            # Copy while the temporary directory still exists.
	            for source in snapshot.iterdir():
	                if source.name in metadata_names:
	                    continue
	                _replace_entry(source, data_dir / source.name)
	    except Exception as exc:  # pragma: no cover - network dependent
	        if _dataset_required():
	            raise RuntimeError(f"dataset hydrate failed for {repo_id}: {exc}") from exc
	        return f"dataset hydrate failed; using bundled data: {exc}"
	    return f"hydrated from dataset {repo_id}"