""" HuggingFace Hub Configuration and Helper Functions This module provides configuration and utilities for loading data from HuggingFace Hub. The data is cached locally after first download, so subsequent accesses are fast. """ import os from functools import lru_cache from pathlib import Path from huggingface_hub import snapshot_download # ============================================================================= # Configuration # ============================================================================= # HuggingFace Dataset repository ID HF_REPO_ID = os.environ.get("HF_REPO_ID", "Real-TSF/TIME") HF_OUTPUT_REPO_ID = os.environ.get("HF_OUTPUT_REPO_ID", "Real-TSF/TIME-Output") # HuggingFace token (set via environment variable for security) # In HuggingFace Space, set this in Settings -> Repository secrets HF_TOKEN = os.environ.get("HF_TOKEN", None) # Whether to use HuggingFace Hub (True) or local files (False) # Set to False for local development with local data USE_HF_HUB = os.environ.get("USE_HF_HUB", "true").lower() == "true" # Local cache directory for HF Hub downloads HF_CACHE_DIR = os.environ.get("HF_CACHE_DIR", None) # None uses default ~/.cache/huggingface # Local data paths (used when USE_HF_HUB=false) # Set these environment variables to specify custom local paths LOCAL_RESULTS_PATH = os.environ.get("LOCAL_RESULTS_PATH", None) # Path to output/results LOCAL_FEATURES_PATH = os.environ.get("LOCAL_FEATURES_PATH", None) # Path to output/features LOCAL_CONFIG_PATH = os.environ.get("LOCAL_CONFIG_PATH", None) # Path to config directory LOCAL_DATASETS_PATH = os.environ.get("LOCAL_DATASETS_PATH", None) # Path to data/hf_dataset # ============================================================================= # Helper Functions # ============================================================================= @lru_cache(maxsize=1) def download_results_snapshot() -> Path: """ Download the results directory from HuggingFace Hub. Uses caching - only downloads once, then returns cached path. Returns: Path: Local path to the downloaded results directory """ if not USE_HF_HUB: # Return local path for development # Priority: 1) LOCAL_RESULTS_PATH env var if LOCAL_RESULTS_PATH: local_path = Path(LOCAL_RESULTS_PATH) else: local_path = Path("../output/results") if not local_path.exists(): print(f"⚠️ Warning: Local results path does not exist: {local_path}") return local_path print(f"📥 Downloading results from HuggingFace Hub: {HF_OUTPUT_REPO_ID}") local_dir = snapshot_download( repo_id=HF_OUTPUT_REPO_ID, repo_type="dataset", token=HF_TOKEN, allow_patterns=["results/**"], cache_dir=HF_CACHE_DIR, ) results_path = Path(local_dir) / "results" print(f"✅ Results cached at: {results_path}") return results_path @lru_cache(maxsize=1) def download_datasets_snapshot() -> Path: """ Download the hf_dataset directory from HuggingFace Hub. Uses caching - only downloads once, then returns cached path. Returns: Path: Local path to the downloaded hf_dataset directory """ if not USE_HF_HUB: # Return local path for development # Priority: 1) LOCAL_DATASETS_PATH env var, 2) ../data/hf_dataset if LOCAL_DATASETS_PATH: local_path = Path(LOCAL_DATASETS_PATH) else: local_path = Path("../data/hf_dataset") if not local_path.exists(): print(f"⚠️ Warning: Local datasets path does not exist: {local_path}") return local_path print(f"📥 Downloading datasets from HuggingFace Hub: {HF_REPO_ID}") local_dir = snapshot_download( repo_id=HF_REPO_ID, repo_type="dataset", token=HF_TOKEN, allow_patterns=None, cache_dir=HF_CACHE_DIR, ) datasets_path = Path(local_dir) print(f"✅ Datasets cached at: {datasets_path}") return datasets_path def download_config_snapshot() -> Path: """ Get the config directory from the installed timebench package. The config (datasets.yaml) is bundled with the timebench package, so no download is needed - we just use the installed package's config. Returns: Path: Local path to the config directory """ # Try to get config from installed timebench package try: from timebench.evaluation.data import DEFAULT_CONFIG_PATH config_path = DEFAULT_CONFIG_PATH.parent # Get the config directory if config_path.exists(): # print(f"📁 Using config from timebench package: {config_path}") return config_path except ImportError: print(f"❌ ImportError: {ImportError}, using local config") pass # Fallback: Local development path # Priority: 1) LOCAL_CONFIG_PATH env var, 2) ../config, if LOCAL_CONFIG_PATH: local_path = Path(LOCAL_CONFIG_PATH) if local_path.exists(): print(f"📁 Using local config: {local_path}") return local_path raise FileNotFoundError( "Config directory not found. Please ensure timebench is installed, " "set USE_HF_HUB=false for local development, " "or set LOCAL_CONFIG_PATH environment variable to point to your config directory." ) def get_results_root() -> Path: """Get the root path for results (handles both HF Hub and local).""" return download_results_snapshot() def get_datasets_root() -> Path: """Get the root path for hf_dataset (handles both HF Hub and local).""" return download_datasets_snapshot() def get_config_root() -> Path: """Get the root path for config (handles both HF Hub and local).""" return download_config_snapshot() def get_features_root() -> Path: """ Get the root path for features (handles both HF Hub and local). Features are stored at output/features/{dataset}/{freq}/test.csv Returns: Path: Local path to the features directory """ if not USE_HF_HUB: # Return local path for development # Priority: 1) LOCAL_FEATURES_PATH env var, 2) ../output/features, 3) /home/eee/qzz/TIME/output/features if LOCAL_FEATURES_PATH: local_path = Path(LOCAL_FEATURES_PATH) else: local_path = Path("../output/features") if not local_path.exists(): local_path = Path("/home/eee/qzz/TIME/output/features") if not local_path.exists(): print(f"⚠️ Warning: Local features path does not exist: {local_path}") return local_path # For HF Hub, features are in the same repo as results print(f"📥 Downloading features from HuggingFace Hub: {HF_OUTPUT_REPO_ID}") local_dir = snapshot_download( repo_id=HF_OUTPUT_REPO_ID, repo_type="dataset", token=HF_TOKEN, allow_patterns=["features/**"], cache_dir=HF_CACHE_DIR, ) features_path = Path(local_dir) / "features" print(f"✅ Features cached at: {features_path}") return features_path def clear_cache(): """Clear the LRU cache to force re-download on next access.""" download_results_snapshot.cache_clear() download_datasets_snapshot.cache_clear() # ============================================================================= # Initialization - Download data at module import # ============================================================================= def initialize_data(): """ Initialize by downloading all necessary data. Call this at app startup to pre-download data. """ print("🚀 Initializing TIME Leaderboard data...") # Download results (required for leaderboard) results_root = get_results_root() print(f" Results: {results_root}") # Download config (required for dataset settings) config_root = get_config_root() print(f" Config: {config_root}") # Note: Datasets are downloaded on-demand when visualization is needed # to reduce initial load time print("✅ Initialization complete!") return results_root, config_root