Spaces:
Sleeping
Sleeping
Claude Code
Claude Code: Create a verification script at HF Dataset Persistence Verification
8445d86 | #!/usr/bin/env python3 | |
| """ | |
| HF Dataset Persistence Verification Script | |
| ============================================ | |
| Verifies the HuggingFace Dataset persistence layer configuration. | |
| Can be imported as a module or run standalone for testing. | |
| Usage: | |
| python scripts/verify_dataset.py | |
| # Or import from app.py: | |
| from scripts.verify_dataset import verify_dataset, DatasetStatus | |
| status = verify_dataset() | |
| if status.ready: | |
| print("Persistence is ready!") | |
| """ | |
| import os | |
| import sys | |
| from pathlib import Path | |
| from typing import NamedTuple, Optional | |
| from dataclasses import dataclass | |
# Standalone execution: put the directory above this script's folder on
# sys.path so project-level imports resolve.
if __name__ == "__main__":
    sys.path.insert(0, str(Path(__file__).parent.parent))

# These defaults are applied BEFORE huggingface_hub is imported (its imports
# are deferred into the functions below). setdefault() leaves any value the
# caller already exported untouched.
for _env_name, _env_default in (
    # Transfer timeouts
    ("HF_HUB_DOWNLOAD_TIMEOUT", "300"),
    ("HF_HUB_UPLOAD_TIMEOUT", "600"),
    # Suppress huggingface_hub progress bars and verbose logs
    ("HF_HUB_DISABLE_PROGRESS_BARS", "1"),
    ("HF_HUB_VERBOSITY", "warning"),
):
    os.environ.setdefault(_env_name, _env_default)
del _env_name, _env_default

import logging

# Cap the library's own logger at WARNING.
_logging = logging.getLogger("huggingface_hub")
_logging.setLevel(logging.WARNING)
@dataclass
class DatasetStatus:
    """Status of HF Dataset persistence configuration.

    Instances are built with keyword arguments (see ``verify_dataset``), so
    the class must be a dataclass: the decorator generates the ``__init__``,
    ``__repr__`` and ``__eq__`` from the annotated fields below.
    """

    # True when persistence can be used; main() maps this to exit code 0.
    ready: bool
    # Whether HF_TOKEN was present in the environment.
    has_token: bool
    # Whether a dataset repository ID could be determined.
    has_repo: bool
    # Parsed value of the AUTO_CREATE_DATASET environment variable.
    auto_create: bool
    # Whether the dataset repository was found on the Hub.
    repo_exists: bool
    # Reported write access for the token.
    write_access: bool
    # Dataset repository ID ("" when none could be determined).
    repo_id: str
    # One-line human-readable summary, printed by print_status().
    message: str
    # Prefix symbol printed before the message.
    emoji: str
def get_dataset_repo_id() -> tuple[str, bool]:
    """
    Determine the dataset repository ID.

    Resolution order:
      1. ``OPENCLAW_DATASET_REPO`` - used verbatim when set and non-empty.
      2. ``SPACE_ID`` - "username/SpaceName" becomes "username/SpaceName-data".
      3. ``HF_TOKEN`` - the account name reported by ``whoami()`` is used to
         build "<username>/HuggingClaw-data". This step is best-effort: any
         failure is logged at debug level and treated as "no repo".

    Returns:
        (repo_id, was_auto_derived) tuple; ("", False) when nothing could be
        determined.
    """
    # Try explicit env var first
    explicit_repo = os.environ.get("OPENCLAW_DATASET_REPO", "")
    if explicit_repo:
        return explicit_repo, False

    # Auto-derive from SPACE_ID (HF Spaces built-in)
    space_id = os.environ.get("SPACE_ID", "")
    if space_id:
        return f"{space_id}-data", True

    # Fallback: derive from HF_TOKEN username (local Docker)
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        try:
            # Imported lazily so the module loads without huggingface_hub
            # and so the HF_HUB_* defaults are set before the import.
            from huggingface_hub import HfApi

            username = HfApi(token=hf_token).whoami()["name"]
        except Exception as exc:
            # Deliberately non-fatal, but leave a trace instead of silently
            # discarding the reason the fallback failed.
            logging.getLogger(__name__).debug(
                "Could not derive dataset repo from HF_TOKEN: %s", exc
            )
        else:
            return f"{username}/HuggingClaw-data", True

    return "", False
def verify_dataset() -> DatasetStatus:
    """
    Verify HF Dataset persistence configuration.

    Checks, in order:
      1. ``HF_TOKEN`` is set.
      2. A dataset repo ID can be determined (``get_dataset_repo_id``).
      3. The token authenticates (``whoami``) and the dataset repo exists,
         or ``AUTO_CREATE_DATASET`` allows it to be created later.

    Only a "repository not found" answer from the Hub is treated as a
    missing repo; any other failure while querying it (network error,
    permission error, ...) is reported as a connection/authentication
    failure rather than as "will auto-create".

    Returns:
        DatasetStatus object with verification results
    """
    hf_token = os.environ.get("HF_TOKEN")
    auto_create = os.environ.get("AUTO_CREATE_DATASET", "false").lower() in ("true", "1", "yes")
    repo_id, was_auto_derived = get_dataset_repo_id()

    def _status(ready, message, emoji, *, has_token=True, has_repo=True,
                repo_exists=False, write_access=False):
        # Build a DatasetStatus, filling in the values shared by every outcome.
        return DatasetStatus(
            ready=ready,
            has_token=has_token,
            has_repo=has_repo,
            auto_create=auto_create,
            repo_exists=repo_exists,
            write_access=write_access,
            repo_id=repo_id,
            message=message,
            emoji=emoji,
        )

    # Check 1: HF_TOKEN
    if not hf_token:
        return _status(
            False,
            "HF_TOKEN not set - persistence disabled",
            "⚠️",
            has_token=False,
            # A repo ID may still be known (env var / SPACE_ID) without a token.
            has_repo=bool(repo_id),
        )

    # Check 2: Repository ID
    if not repo_id:
        return _status(
            False,
            "Could not determine dataset repo (no SPACE_ID or OPENCLAW_DATASET_REPO)",
            "❌",
            has_repo=False,
        )

    # Check 3: Verify connection and check if repo exists
    from huggingface_hub import HfApi
    from huggingface_hub.utils import RepositoryNotFoundError

    api = HfApi(token=hf_token)
    try:
        # Test authentication; raises if the token is rejected.
        api.whoami()

        # Check if repo exists. Only a genuine "not found" counts as missing;
        # other errors propagate to the handler below.
        try:
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            repo_exists = True
        except RepositoryNotFoundError:
            repo_exists = False

        # NOTE: write access is not probed separately here; it is reported
        # as True once authentication has succeeded.
        if not repo_exists:
            if not auto_create:
                # Repo is missing and we are not allowed to create it.
                return _status(
                    False,
                    f"AUTO_CREATE_DATASET=false - dataset '{repo_id}' must exist manually",
                    "⚠️",
                    write_access=True,
                )
            # Repo is missing but will be created later.
            return _status(
                True,
                f"Will auto-create dataset '{repo_id}' on first sync",
                "✅",
                write_access=True,
            )

        # All checks passed
        source_info = " (auto-derived)" if was_auto_derived else ""
        return _status(
            True,
            f"Persistence fully configured - dataset: {repo_id}{source_info}",
            "✅",
            repo_exists=True,
            write_access=True,
        )
    except Exception as e:
        # Connection or authentication failed
        error_msg = str(e)
        if "401" in error_msg or "authentication" in error_msg.lower():
            return _status(
                False,
                "Authentication failed - check HF_TOKEN has write permissions",
                "❌",
            )
        return _status(False, f"Connection failed: {error_msg[:100]}", "❌")
def print_status(status: DatasetStatus) -> None:
    """Write the headline and a four-line detail summary of *status* to stdout.

    The detail lines cover, in order: the token, the dataset repo, the raw
    AUTO_CREATE_DATASET environment value, and the repository state.
    """
    print(f"{status.emoji} {status.message}")

    # Token line: a present token is labelled by the reported write access.
    if status.has_token:
        token_line = f" • HF_TOKEN: {'✓ Set' if status.write_access else '✗ Invalid'}"
    else:
        token_line = " • HF_TOKEN: NOT SET"

    # Repo line
    repo_line = (
        f" • Dataset repo: {status.repo_id}"
        if status.has_repo
        else " • OPENCLAW_DATASET_REPO: NOT SET (will auto-derive)"
    )

    # Shows the environment value as-is, not the parsed boolean.
    auto_create_line = f" • AUTO_CREATE_DATASET: {os.environ.get('AUTO_CREATE_DATASET', 'false')}"

    # Repository state
    if status.repo_exists:
        state_line = " • Repository status: ✓ Exists"
    elif status.auto_create and status.has_token:
        state_line = " • Repository status: Will be created on first sync"
    else:
        state_line = " • Repository status: ✗ Not found"

    for line in (token_line, repo_line, auto_create_line, state_line):
        print(line)
def main() -> int:
    """
    Standalone entry point: run the verification and report the outcome.

    Prints a banner, the status produced by ``verify_dataset`` (formatted by
    ``print_status``), and a final result line.

    Returns:
        0 if persistence is ready, 1 otherwise
    """
    # Banner: title, rule, blank line.
    print("HF Dataset Persistence Verification", "=" * 40, "", sep="\n")

    outcome = verify_dataset()
    print_status(outcome)
    print()

    if not outcome.ready:
        print("Result: Persistence is NOT ready")
        return 1

    print("Result: Persistence is READY")
    return 0
# Script entry: the process exit code is whatever main() returns.
if __name__ == "__main__":
    raise SystemExit(main())