Spaces:

NathanRoll
/

packing-benchmark

Sleeping

File size: 4,126 Bytes

from __future__ import annotations

import os
import shutil
import tempfile
from pathlib import Path


# Spellings accepted for boolean environment flags.  All entries are
# lower-case; the code in this module lower-cases the raw environment value
# before testing membership, so "TRUE" and "Off" are recognised as well.
FALSE_VALUES = {"0", "false", "no", "off"}
TRUE_VALUES = {"1", "true", "yes", "on"}


def _dataset_repo() -> str:
    """Return the configured Dataset repo id, or "" when none is configured.

    PACKING_DATASET_REPO takes precedence over HF_DATASET_REPO.  A variable
    that is set but empty counts as unset.
    """

    for variable in ("PACKING_DATASET_REPO", "HF_DATASET_REPO"):
        configured = os.environ.get(variable)
        if configured:
            return configured
    return ""


def _dataset_required() -> bool:
    """Report whether Dataset sync/hydration failures must be treated as fatal.

    A configured Dataset repo is required by default, so the deployed Space
    uses the Dataset as its source of truth.  Two environment flags relax
    that for local development:

    * PACKING_REQUIRE_DATASET set to a false spelling (0/false/no/off), or
    * PACKING_ALLOW_BUNDLED_DATA set to a true spelling (1/true/yes/on).

    With neither opt-out present the result is True exactly when a Dataset
    repo is configured.
    """

    require_flag = os.environ.get("PACKING_REQUIRE_DATASET", "").lower()
    bundled_flag = os.environ.get("PACKING_ALLOW_BUNDLED_DATA", "").lower()
    opted_out = require_flag in FALSE_VALUES or bundled_flag in TRUE_VALUES
    # The repo lookup only matters when nobody opted out.
    return (not opted_out) and bool(_dataset_repo())


def maybe_sync_dataset(data_dir: Path) -> str:
    """Mirror verified benchmark data to a Hugging Face Dataset repo if configured.

    Set PACKING_DATASET_REPO or HF_DATASET_REPO to a dataset repo id such as
    "username/packing-benchmark-data".  HF_TOKEN should be configured as a
    Space secret with write access.  If unset, this is a no-op.

    The repo is created if it does not exist yet.  It is created private
    unless PACKING_DATASET_PRIVATE is explicitly set to a false spelling
    (0/false/no/off); unrecognised or empty values keep it private.

    The contents of ``data_dir`` are uploaded to the repo root, excluding
    preview images and Python bytecode caches.

    Returns a short human-readable status message.

    Raises RuntimeError when the Dataset is required (see
    ``_dataset_required``) and ``huggingface_hub`` cannot be imported or the
    upload fails; otherwise failures are reported in the returned message.
    """

    repo_id = _dataset_repo()
    if not repo_id:
        return "dataset sync disabled"

    try:
        from huggingface_hub import HfApi
    except Exception as exc:  # pragma: no cover - only active in deployed Space
        if _dataset_required():
            raise RuntimeError(f"dataset sync unavailable for {repo_id}: {exc}") from exc
        return f"dataset sync unavailable: {exc}"

    token = os.environ.get("HF_TOKEN")
    # Fail closed: a typo, stray whitespace or empty value in the privacy flag
    # must not publish the data.  Only an explicit false spelling opts out of
    # a private repo, mirroring how the other flags fall back to their strict
    # default on unrecognised input.
    privacy_flag = os.environ.get("PACKING_DATASET_PRIVATE", "1").strip().lower()
    private = privacy_flag not in FALSE_VALUES
    try:
        api = HfApi(token=token)
        # exist_ok=True makes repeated syncs against an existing repo succeed.
        api.create_repo(repo_id=repo_id, repo_type="dataset", private=private, exist_ok=True)
        api.upload_folder(
            repo_id=repo_id,
            repo_type="dataset",
            folder_path=str(data_dir),
            path_in_repo=".",
            ignore_patterns=["previews/*", "__pycache__/*", "*.pyc"],
        )
    except Exception as exc:  # pragma: no cover - network dependent
        if _dataset_required():
            raise RuntimeError(f"dataset sync failed for {repo_id}: {exc}") from exc
        return f"dataset sync failed: {exc}"
    return f"synced to dataset {repo_id}"


# Top-level snapshot entries that hold Hub client bookkeeping rather than
# dataset content.  ".cache" is where newer huggingface_hub releases keep
# their per-file download metadata when ``local_dir`` is used.
_SNAPSHOT_METADATA_NAMES = frozenset({".huggingface", ".cache"})


def _is_real_dir(path: Path) -> bool:
    """Return True for an actual directory, False for symlinks and files."""
    return path.is_dir() and not path.is_symlink()


def _copy_snapshot_entries(snapshot: Path, data_dir: Path) -> None:
    """Copy every top-level content entry of ``snapshot`` into ``data_dir``.

    A snapshot directory replaces whatever sits at the same name in
    ``data_dir``; a snapshot file overwrites a same-named file and replaces a
    same-named directory.  Entries of ``data_dir`` that the snapshot does not
    contain are left alone.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    for source in snapshot.iterdir():
        if source.name in _SNAPSHOT_METADATA_NAMES:
            continue
        target = data_dir / source.name
        if source.is_dir():
            # copytree needs the destination to be absent; clear whatever is
            # there, including a plain file or symlink that rmtree would reject.
            if _is_real_dir(target):
                shutil.rmtree(target)
            elif target.exists() or target.is_symlink():
                target.unlink()
            shutil.copytree(source, target)
        else:
            # Without this, copy2 would drop the file *inside* an existing
            # directory of the same name instead of replacing it.
            if _is_real_dir(target):
                shutil.rmtree(target)
            shutil.copy2(source, target)


def maybe_hydrate_from_dataset(data_dir: Path) -> str:
    """Load benchmark data from the configured Dataset mirror on startup.

    Downloads a snapshot of the dataset repo named by PACKING_DATASET_REPO or
    HF_DATASET_REPO (authenticating with HF_TOKEN when set) into a temporary
    directory, then copies its top-level entries into ``data_dir``, which is
    created if missing.  Preview images, Python bytecode caches and the Hub
    client's own metadata folders are not copied.  If no repo is configured,
    this is a no-op.

    Returns a short human-readable status message.

    Raises RuntimeError when the Dataset is required (see
    ``_dataset_required``) and ``huggingface_hub`` cannot be imported or the
    download/copy fails; otherwise failures are reported in the returned
    message and the existing (bundled) data is left for the caller to use.
    """

    repo_id = _dataset_repo()
    if not repo_id:
        return "dataset hydrate disabled"

    try:
        from huggingface_hub import snapshot_download
    except Exception as exc:  # pragma: no cover - only active in deployed Space
        if _dataset_required():
            raise RuntimeError(f"dataset hydrate unavailable for {repo_id}: {exc}") from exc
        return f"dataset hydrate unavailable: {exc}"

    token = os.environ.get("HF_TOKEN")
    try:
        # Download into a scratch directory first so a failed download never
        # touches data_dir.
        with tempfile.TemporaryDirectory() as tmp:
            snapshot = Path(
                snapshot_download(
                    repo_id=repo_id,
                    repo_type="dataset",
                    token=token,
                    local_dir=tmp,
                    ignore_patterns=["previews/*", "__pycache__/*", "*.pyc"],
                )
            )
            _copy_snapshot_entries(snapshot, data_dir)
    except Exception as exc:  # pragma: no cover - network dependent
        if _dataset_required():
            raise RuntimeError(f"dataset hydrate failed for {repo_id}: {exc}") from exc
        return f"dataset hydrate failed; using bundled data: {exc}"
    return f"hydrated from dataset {repo_id}"