HuggingClaw-Cain / scripts /verify_dataset.py
Claude Code
Claude Code: Create a verification script at HF Dataset Persistence Verification
8445d86
#!/usr/bin/env python3
"""
HF Dataset Persistence Verification Script
============================================
Verifies the HuggingFace Dataset persistence layer configuration.
Can be imported as a module or run standalone for testing.
Usage:
python scripts/verify_dataset.py
# Or import from app.py:
from scripts.verify_dataset import verify_dataset, DatasetStatus
status = verify_dataset()
if status.ready:
print("Persistence is ready!")
"""
import os
import sys
from pathlib import Path
from typing import NamedTuple, Optional
from dataclasses import dataclass
# Add parent directory to path for imports when run standalone
# (Path(__file__).parent.parent is the directory above this script's folder).
if __name__ == "__main__":
    sys.path.insert(0, str(Path(__file__).parent.parent))
# Set timeout BEFORE importing huggingface_hub
# setdefault() only fills in a value when the variable is not already set,
# so anything exported by the surrounding environment takes precedence.
# NOTE(review): the values look like seconds - confirm against the
# huggingface_hub environment-variable documentation.
os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "300")
os.environ.setdefault("HF_HUB_UPLOAD_TIMEOUT", "600")
# Suppress huggingface_hub progress bars and verbose logs
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
os.environ.setdefault("HF_HUB_VERBOSITY", "warning")
import logging
# Also raise the "huggingface_hub" logger threshold to WARNING directly,
# independent of the environment variables above.
_logging = logging.getLogger("huggingface_hub")
_logging.setLevel(logging.WARNING)
@dataclass
class DatasetStatus:
    """Status of HF Dataset persistence configuration.

    Produced by verify_dataset() and rendered by print_status(). Field order
    is part of the constructor signature, so do not reorder.
    """
    # Overall verdict: main() exits 0 when this is True, 1 otherwise.
    ready: bool
    # Whether the HF_TOKEN environment variable was set.
    has_token: bool
    # Whether a dataset repo id counts as configured (see verify_dataset()
    # for the exact rule applied in each branch).
    has_repo: bool
    # AUTO_CREATE_DATASET parsed as a boolean ("true", "1", "yes" => True).
    auto_create: bool
    # Whether the dataset repository lookup on the Hub succeeded.
    repo_exists: bool
    # NOTE(review): verify_dataset() sets this to True once authentication
    # succeeds; no explicit write-permission probe is visible in this file.
    write_access: bool
    # Resolved "namespace/name" dataset repo id, or "" when none was found.
    repo_id: str
    # Human-readable one-line summary of the verification result.
    message: str
    # Status glyph printed in front of the message by print_status().
    emoji: str
def get_dataset_repo_id() -> tuple[str, bool]:
    """
    Determine the dataset repository ID.

    Resolution order (first match wins):
      1. OPENCLAW_DATASET_REPO - explicit repo id, used as-is.
      2. SPACE_ID              - "<SPACE_ID>-data" is derived from it.
      3. HF_TOKEN              - the token's username is looked up via the
                                 Hub and "<username>/HuggingClaw-data" is
                                 derived from it.

    Environment values are stripped of surrounding whitespace; a value that
    is empty after stripping is treated as unset.

    Returns:
        (repo_id, was_auto_derived) tuple. ("", False) is returned when no
        repo id could be determined. The token-based fallback is best-effort:
        any failure is logged and results in ("", False) rather than raising.
    """
    # Try explicit env var first
    explicit_repo = os.environ.get("OPENCLAW_DATASET_REPO", "").strip()
    if explicit_repo:
        return explicit_repo, False

    # Auto-derive from SPACE_ID (HF Spaces built-in)
    space_id = os.environ.get("SPACE_ID", "").strip()
    if space_id:
        # SPACE_ID = "username/SpaceName" -> derive "username/SpaceName-data"
        return f"{space_id}-data", True

    # Fallback: derive from HF_TOKEN username (local Docker)
    hf_token = os.environ.get("HF_TOKEN", "").strip()
    if hf_token:
        try:
            # Imported lazily so the module-level env defaults are in place
            # before huggingface_hub is loaded.
            from huggingface_hub import HfApi

            username = HfApi(token=hf_token).whoami()["name"]
        except Exception as exc:
            # Best-effort fallback: keep going, but leave a trace of why the
            # derivation failed instead of swallowing the error silently.
            logging.getLogger(__name__).warning(
                "Could not derive dataset repo from HF_TOKEN: %s", exc
            )
        else:
            # Guard against an empty name, which would yield "/HuggingClaw-data".
            if username:
                return f"{username}/HuggingClaw-data", True

    return "", False
def verify_dataset() -> DatasetStatus:
    """
    Verify HF Dataset persistence configuration.

    Checks, in order:
      1. HF_TOKEN is set (non-empty after stripping whitespace).
      2. A dataset repo id can be resolved via get_dataset_repo_id().
      3. The token authenticates against the Hub (whoami) and the dataset
         repository is looked up.

    Only a "repository not found" answer from the Hub is treated as a
    missing repository. Any other lookup failure (network error, server
    error, permission error) is reported as an authentication or connection
    failure, so an unreachable Hub is never mistaken for "repo can be
    auto-created".

    Returns:
        DatasetStatus object with verification results

    Raises:
        ImportError: if huggingface_hub is not installed. This is only
            reached once both a token and a repo id are available.
    """
    hf_token = (os.environ.get("HF_TOKEN") or "").strip()
    auto_create = os.environ.get("AUTO_CREATE_DATASET", "false").lower() in ("true", "1", "yes")
    repo_id, was_auto_derived = get_dataset_repo_id()

    def _status(
        ready: bool,
        message: str,
        emoji: str,
        *,
        has_token: bool = True,
        has_repo: bool = True,
        repo_exists: bool = False,
        write_access: bool = False,
    ) -> DatasetStatus:
        # Single construction point: auto_create and repo_id are the same
        # for every outcome, only the flags and the message differ.
        return DatasetStatus(
            ready=ready,
            has_token=has_token,
            has_repo=has_repo,
            auto_create=auto_create,
            repo_exists=repo_exists,
            write_access=write_access,
            repo_id=repo_id,
            message=message,
            emoji=emoji,
        )

    # Check 1: HF_TOKEN
    if not hf_token:
        # has_repo reflects whether a repo id was resolved, so the status
        # stays consistent with the repo_id it carries.
        return _status(
            False,
            "HF_TOKEN not set - persistence disabled",
            "⚠️",
            has_token=False,
            has_repo=bool(repo_id),
        )

    # Check 2: Repository ID
    if not repo_id:
        return _status(
            False,
            "Could not determine dataset repo (no SPACE_ID or OPENCLAW_DATASET_REPO)",
            "❌",
            has_repo=False,
        )

    # Check 3: Verify connection and check if repo exists
    # Imported lazily so the module-level env defaults are applied first.
    from huggingface_hub import HfApi
    from huggingface_hub.utils import RepositoryNotFoundError

    api = HfApi(token=hf_token)
    try:
        # Test authentication
        api.whoami()
        # Check if repo exists; only a genuine "not found" means it is missing
        try:
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            repo_exists = True
        except RepositoryNotFoundError:
            repo_exists = False
    except Exception as e:
        # Authentication or connection failed
        error_msg = str(e)
        if "401" in error_msg or "authentication" in error_msg.lower():
            return _status(
                False,
                "Authentication failed - check HF_TOKEN has write permissions",
                "❌",
            )
        return _status(False, f"Connection failed: {error_msg[:100]}", "❌")

    # NOTE: write_access is reported as True once authentication succeeds;
    # no separate write-permission probe is performed here.
    if repo_exists:
        # All checks passed
        source_info = " (auto-derived)" if was_auto_derived else ""
        return _status(
            True,
            f"Persistence fully configured - dataset: {repo_id}{source_info}",
            "✅",
            repo_exists=True,
            write_access=True,
        )

    if auto_create:
        # Repo doesn't exist but auto_create is true, so it can be created
        return _status(
            True,
            f"Will auto-create dataset '{repo_id}' on first sync",
            "✅",
            write_access=True,
        )

    # Repo doesn't exist and auto_create is false: warn
    return _status(
        False,
        f"AUTO_CREATE_DATASET=false - dataset '{repo_id}' must exist manually",
        "⚠️",
        write_access=True,
    )
def print_status(status: DatasetStatus) -> None:
    """Write the status headline and a bullet list of details to stdout.

    The headline is "<emoji> <message>". It is followed by one line each for
    the token, the dataset repo, the raw AUTO_CREATE_DATASET environment
    value (defaulting to 'false'), and the repository state.
    """
    print(f"{status.emoji} {status.message}")

    # Token line: distinguishes "missing" from "present but not usable"
    if status.has_token:
        token_line = f" • HF_TOKEN: {'✓ Set' if status.write_access else '✗ Invalid'}"
    else:
        token_line = " • HF_TOKEN: NOT SET"

    # Repo line: show the resolved id when there is one
    repo_line = (
        f" • Dataset repo: {status.repo_id}"
        if status.has_repo
        else " • OPENCLAW_DATASET_REPO: NOT SET (will auto-derive)"
    )

    # Shows the raw environment value, not the parsed boolean
    auto_create_line = f" • AUTO_CREATE_DATASET: {os.environ.get('AUTO_CREATE_DATASET', 'false')}"

    # Repository state: exists / will be created / missing
    if status.repo_exists:
        state_line = " • Repository status: ✓ Exists"
    elif status.auto_create and status.has_token:
        state_line = " • Repository status: Will be created on first sync"
    else:
        state_line = " • Repository status: ✗ Not found"

    print(token_line, repo_line, auto_create_line, state_line, sep="\n")
def main() -> int:
    """
    Standalone entry point: run the verification and report it on stdout.

    Prints a banner, the formatted status from print_status(), and a final
    result line.

    Returns:
        0 if persistence is ready, 1 otherwise
    """
    # Banner: title, underline, blank separator line
    for banner_line in ("HF Dataset Persistence Verification", "=" * 40, ""):
        print(banner_line)

    result = verify_dataset()
    print_status(result)
    print()

    # Map the readiness flag to its summary line and process exit code
    outcomes = {
        True: ("Result: Persistence is READY", 0),
        False: ("Result: Persistence is NOT ready", 1),
    }
    summary, exit_code = outcomes[bool(result.ready)]
    print(summary)
    return exit_code
# Script entry point: run main() and use its return value (0 = ready,
# 1 = not ready) as the process exit code.
if __name__ == "__main__":
    sys.exit(main())