#!/usr/bin/env python3
"""
HF Dataset Persistence Verification Script
============================================

Verifies the HuggingFace Dataset persistence layer configuration.
Can be imported as a module or run standalone for testing.

Usage:
    python scripts/verify_dataset.py

    # Or import from app.py:
    from scripts.verify_dataset import verify_dataset, DatasetStatus
    status = verify_dataset()
    if status.ready:
        print("Persistence is ready!")
"""

import os
import sys
from pathlib import Path
from typing import NamedTuple, Optional
from dataclasses import dataclass

# Add parent directory to path for imports when run standalone
if __name__ == "__main__":
    sys.path.insert(0, str(Path(__file__).parent.parent))

# Set timeout BEFORE importing huggingface_hub
os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "300")
os.environ.setdefault("HF_HUB_UPLOAD_TIMEOUT", "600")

# Suppress huggingface_hub progress bars and verbose logs
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
os.environ.setdefault("HF_HUB_VERBOSITY", "warning")

import logging

_logging = logging.getLogger("huggingface_hub")
_logging.setLevel(logging.WARNING)


@dataclass
class DatasetStatus:
    """Status of HF Dataset persistence configuration.

    Attributes:
        ready: True when the dataset exists or will be auto-created.
        has_token: True when the HF_TOKEN environment variable is set.
        has_repo: True when a dataset repo ID could be determined.
        auto_create: Parsed value of the AUTO_CREATE_DATASET variable.
        repo_exists: True when the repo lookup on the Hub succeeded.
        write_access: True when the token authenticated successfully.
            NOTE(review): this is inferred from a successful ``whoami()``
            call only; no write operation is actually attempted.
        repo_id: The explicit or derived dataset repo ID ("" if unknown).
        message: Human-readable one-line summary.
        emoji: Status marker printed in front of ``message``.
    """

    ready: bool
    has_token: bool
    has_repo: bool
    auto_create: bool
    repo_exists: bool
    write_access: bool
    repo_id: str
    message: str
    emoji: str


def get_dataset_repo_id() -> tuple[str, bool]:
    """
    Determine the dataset repository ID.

    Resolution order:
      1. ``OPENCLAW_DATASET_REPO`` (explicit).
      2. ``SPACE_ID`` with a ``-data`` suffix appended.
      3. ``<username>/HuggingClaw-data``, where the username is looked up
         on the Hub with ``HF_TOKEN``.

    Returns:
        (repo_id, was_auto_derived) tuple. ``repo_id`` is "" when nothing
        could be determined.
    """
    # Try explicit env var first
    repo_id = os.environ.get("OPENCLAW_DATASET_REPO", "")
    if repo_id:
        return repo_id, False

    # Auto-derive from SPACE_ID (HF Spaces built-in)
    space_id = os.environ.get("SPACE_ID", "")
    if space_id:
        # SPACE_ID = "username/SpaceName" -> derive "username/SpaceName-data"
        return f"{space_id}-data", True

    # Fallback: derive from HF_TOKEN username (local Docker)
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        try:
            from huggingface_hub import HfApi

            api = HfApi(token=hf_token)
            username = api.whoami()["name"]
            return f"{username}/HuggingClaw-data", True
        except Exception:
            # Deliberate best-effort: any failure (missing package, network
            # error, bad token) falls through to "could not determine".
            pass

    return "", False


def verify_dataset() -> DatasetStatus:
    """
    Verify HF Dataset persistence configuration.

    Checks, in order: that HF_TOKEN is set, that a dataset repo ID can be
    determined, that the token authenticates against the Hub, and whether
    the dataset repo exists (or may be auto-created).

    Returns:
        DatasetStatus object with verification results. Hub errors are
        reported through the status; they are not raised.
    """
    hf_token = os.environ.get("HF_TOKEN")
    auto_create = os.environ.get("AUTO_CREATE_DATASET", "false").lower() in (
        "true",
        "1",
        "yes",
    )
    repo_id, was_auto_derived = get_dataset_repo_id()

    # Check 1: HF_TOKEN
    if not hf_token:
        return DatasetStatus(
            ready=False,
            has_token=False,
            # The repo ID can be known without a token (explicit env var or
            # SPACE_ID), so report it truthfully instead of always False;
            # otherwise has_repo contradicts the repo_id returned below.
            has_repo=bool(repo_id),
            auto_create=auto_create,
            repo_exists=False,
            write_access=False,
            repo_id=repo_id,
            message="HF_TOKEN not set - persistence disabled",
            emoji="⚠️",
        )

    # Check 2: Repository ID
    if not repo_id:
        return DatasetStatus(
            ready=False,
            has_token=True,
            has_repo=False,
            auto_create=auto_create,
            repo_exists=False,
            write_access=False,
            repo_id="",
            message="Could not determine dataset repo (no SPACE_ID or OPENCLAW_DATASET_REPO)",
            emoji="❌",
        )

    # Check 3: Verify connection and check if repo exists
    from huggingface_hub import HfApi

    api = HfApi(token=hf_token)

    try:
        # Test authentication; the result itself is not needed, only the
        # fact that the call did not raise.
        api.whoami()

        # Check if repo exists.
        # NOTE(review): every failure here is treated as "repo does not
        # exist", including transient network errors.
        try:
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            repo_exists = True
        except Exception:
            repo_exists = False

        # If repo doesn't exist and auto_create is false, warn
        if not repo_exists and not auto_create:
            return DatasetStatus(
                ready=False,
                has_token=True,
                has_repo=True,
                auto_create=False,
                repo_exists=False,
                write_access=True,
                repo_id=repo_id,
                message=f"AUTO_CREATE_DATASET=false - dataset '{repo_id}' must exist manually",
                emoji="⚠️",
            )

        # If repo doesn't exist but auto_create is true, we can create it
        if not repo_exists and auto_create:
            return DatasetStatus(
                ready=True,
                has_token=True,
                has_repo=True,
                auto_create=True,
                repo_exists=False,
                write_access=True,
                repo_id=repo_id,
                message=f"Will auto-create dataset '{repo_id}' on first sync",
                emoji="✅",
            )

        # All checks passed
        source_info = " (auto-derived)" if was_auto_derived else ""
        return DatasetStatus(
            ready=True,
            has_token=True,
            has_repo=True,
            auto_create=auto_create,
            repo_exists=True,
            write_access=True,
            repo_id=repo_id,
            message=f"Persistence fully configured - dataset: {repo_id}{source_info}",
            emoji="✅",
        )

    except Exception as e:
        # Connection failed
        error_msg = str(e)
        if "401" in error_msg or "authentication" in error_msg.lower():
            return DatasetStatus(
                ready=False,
                has_token=True,
                has_repo=True,
                auto_create=auto_create,
                repo_exists=False,
                write_access=False,
                repo_id=repo_id,
                message="Authentication failed - check HF_TOKEN has write permissions",
                emoji="❌",
            )

        return DatasetStatus(
            ready=False,
            has_token=True,
            has_repo=True,
            auto_create=auto_create,
            repo_exists=False,
            write_access=False,
            repo_id=repo_id,
            # Truncate so a long exception text stays on one console line.
            message=f"Connection failed: {error_msg[:100]}",
            emoji="❌",
        )


def print_status(status: DatasetStatus) -> None:
    """Print formatted status to console.

    Prints the summary line (emoji + message) followed by one bullet per
    configuration detail: token, repo, AUTO_CREATE_DATASET and repository
    status.
    """
    print(f"{status.emoji} {status.message}")

    # Print additional details
    details = []

    if not status.has_token:
        details.append(" • HF_TOKEN: NOT SET")
    else:
        details.append(f" • HF_TOKEN: {'✓ Set' if status.write_access else '✗ Invalid'}")

    if not status.has_repo:
        details.append(" • OPENCLAW_DATASET_REPO: NOT SET (will auto-derive)")
    else:
        details.append(f" • Dataset repo: {status.repo_id}")

    # Shows the raw environment value, not the parsed boolean.
    details.append(f" • AUTO_CREATE_DATASET: {os.environ.get('AUTO_CREATE_DATASET', 'false')}")

    if status.repo_exists:
        details.append(" • Repository status: ✓ Exists")
    elif status.auto_create and status.has_token:
        details.append(" • Repository status: Will be created on first sync")
    else:
        details.append(" • Repository status: ✗ Not found")

    for detail in details:
        print(detail)


def main() -> int:
    """
    Main entry point when run standalone.

    Returns:
        0 if persistence is ready, 1 otherwise
    """
    print("HF Dataset Persistence Verification")
    print("=" * 40)
    print()

    status = verify_dataset()
    print_status(status)

    print()
    if status.ready:
        print("Result: Persistence is READY")
        return 0
    else:
        print("Result: Persistence is NOT ready")
        return 1


if __name__ == "__main__":
    sys.exit(main())