Spaces:
Running
Running
| """ | |
| HuggingFace Hub Configuration and Helper Functions | |
| This module provides configuration and utilities for loading data from HuggingFace Hub. | |
| The data is cached locally after first download, so subsequent accesses are fast. | |
| """ | |
| import os | |
| from functools import lru_cache | |
| from pathlib import Path | |
| from huggingface_hub import snapshot_download | |
| # ============================================================================= | |
| # Configuration | |
| # ============================================================================= | |
# HuggingFace dataset repository IDs (overridable through the environment).
HF_REPO_ID = os.environ.get("HF_REPO_ID", "Real-TSF/TIME")
HF_OUTPUT_REPO_ID = os.environ.get("HF_OUTPUT_REPO_ID", "Real-TSF/TIME-Output")

# HuggingFace token (set via environment variable for security).
# In a HuggingFace Space, set this in Settings -> Repository secrets.
# `or None` makes an empty value (e.g. `HF_TOKEN=`) behave like "not set"
# instead of handing an empty string to the Hub client.
HF_TOKEN = os.environ.get("HF_TOKEN") or None

# Whether to use HuggingFace Hub (True) or local files (False).
# Set USE_HF_HUB=false for local development with local data.
# Only the value "true" (case-insensitive, surrounding whitespace ignored)
# enables the Hub; any other value selects local files.
USE_HF_HUB = os.environ.get("USE_HF_HUB", "true").strip().lower() == "true"

# Cache directory handed to snapshot_download; None lets the library pick its
# default location (~/.cache/huggingface). Empty values count as unset.
HF_CACHE_DIR = os.environ.get("HF_CACHE_DIR") or None

# Local data paths (used when USE_HF_HUB=false).
# Set these environment variables to specify custom local paths; an empty
# value is treated the same as an unset variable (None).
LOCAL_RESULTS_PATH = os.environ.get("LOCAL_RESULTS_PATH") or None  # Path to output/results
LOCAL_FEATURES_PATH = os.environ.get("LOCAL_FEATURES_PATH") or None  # Path to output/features
LOCAL_CONFIG_PATH = os.environ.get("LOCAL_CONFIG_PATH") or None  # Path to config directory
LOCAL_DATASETS_PATH = os.environ.get("LOCAL_DATASETS_PATH") or None  # Path to data/hf_dataset
| # ============================================================================= | |
| # Helper Functions | |
| # ============================================================================= | |
def download_results_snapshot() -> Path:
    """
    Resolve the local directory that holds the leaderboard results.

    Hub mode (USE_HF_HUB is true): fetches the ``results/**`` files of the
    HF_OUTPUT_REPO_ID dataset repository with ``snapshot_download``, using
    HF_TOKEN for authentication and HF_CACHE_DIR as the cache directory, and
    returns the ``results`` sub-directory of the snapshot.

    Local mode (USE_HF_HUB is false): nothing is downloaded. The path is
    LOCAL_RESULTS_PATH when set, otherwise ``../output/results``. A missing
    directory only produces a printed warning; the path is returned anyway.

    Returns:
        Path: Local path to the results directory
    """
    if USE_HF_HUB:
        print(f"π₯ Downloading results from HuggingFace Hub: {HF_OUTPUT_REPO_ID}")
        snapshot_dir = snapshot_download(
            repo_id=HF_OUTPUT_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            allow_patterns=["results/**"],
            cache_dir=HF_CACHE_DIR,
        )
        results_path = Path(snapshot_dir) / "results"
        print(f"β Results cached at: {results_path}")
        return results_path

    # Local development. Priority: 1) LOCAL_RESULTS_PATH env var,
    # 2) ../output/results (a relative path, so it depends on the working directory).
    local_path = Path(LOCAL_RESULTS_PATH or "../output/results")
    if not local_path.exists():
        print(f"β οΈ Warning: Local results path does not exist: {local_path}")
    return local_path
def download_datasets_snapshot() -> Path:
    """
    Resolve the local directory that holds the hf_dataset data.

    Hub mode (USE_HF_HUB is true): fetches the whole HF_REPO_ID dataset
    repository (no file filter) with ``snapshot_download``, using HF_TOKEN for
    authentication and HF_CACHE_DIR as the cache directory, and returns the
    snapshot root.

    Local mode (USE_HF_HUB is false): nothing is downloaded. The path is
    LOCAL_DATASETS_PATH when set, otherwise ``../data/hf_dataset``. A missing
    directory only produces a printed warning; the path is returned anyway.

    Returns:
        Path: Local path to the hf_dataset directory
    """
    if USE_HF_HUB:
        print(f"π₯ Downloading datasets from HuggingFace Hub: {HF_REPO_ID}")
        snapshot_dir = snapshot_download(
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            allow_patterns=None,  # no filter: take every file in the repo
            cache_dir=HF_CACHE_DIR,
        )
        datasets_path = Path(snapshot_dir)
        print(f"β Datasets cached at: {datasets_path}")
        return datasets_path

    # Local development. Priority: 1) LOCAL_DATASETS_PATH env var,
    # 2) ../data/hf_dataset (a relative path, so it depends on the working directory).
    local_path = Path(LOCAL_DATASETS_PATH or "../data/hf_dataset")
    if not local_path.exists():
        print(f"β οΈ Warning: Local datasets path does not exist: {local_path}")
    return local_path
def download_config_snapshot() -> Path:
    """
    Get the config directory, preferring the installed timebench package.

    Despite the name, nothing is downloaded. The lookup order is:

    1. The parent directory of ``timebench.evaluation.data.DEFAULT_CONFIG_PATH``,
       if timebench can be imported and that directory exists.
    2. LOCAL_CONFIG_PATH, if it is set and the directory exists.

    Returns:
        Path: Local path to the config directory

    Raises:
        FileNotFoundError: If neither location yields an existing directory.
    """
    # Try to get config from the installed timebench package.
    try:
        from timebench.evaluation.data import DEFAULT_CONFIG_PATH
    except ImportError as exc:
        # Report the actual import failure (the message of the caught
        # exception), not the ImportError class itself.
        print(f"β ImportError: {exc}, using local config")
    else:
        config_path = DEFAULT_CONFIG_PATH.parent  # Directory containing the config file
        if config_path.exists():
            return config_path

    # Fallback for local development: the LOCAL_CONFIG_PATH env var.
    if LOCAL_CONFIG_PATH:
        local_path = Path(LOCAL_CONFIG_PATH)
        if local_path.exists():
            print(f"π Using local config: {local_path}")
            return local_path

    raise FileNotFoundError(
        "Config directory not found. Please ensure timebench is installed, "
        "set USE_HF_HUB=false for local development, "
        "or set LOCAL_CONFIG_PATH environment variable to point to your config directory."
    )
def get_results_root() -> Path:
    """Return the results root directory (HF Hub or local), as resolved by download_results_snapshot()."""
    results_root = download_results_snapshot()
    return results_root
def get_datasets_root() -> Path:
    """Return the hf_dataset root directory (HF Hub or local), as resolved by download_datasets_snapshot()."""
    datasets_root = download_datasets_snapshot()
    return datasets_root
def get_config_root() -> Path:
    """Return the config root directory, as resolved by download_config_snapshot()."""
    config_root = download_config_snapshot()
    return config_root
def get_features_root() -> Path:
    """
    Resolve the local directory that holds the features (HF Hub or local).

    Features are stored at output/features/{dataset}/{freq}/test.csv

    Hub mode (USE_HF_HUB is true): fetches the ``features/**`` files of the
    HF_OUTPUT_REPO_ID dataset repository (the same repo as the results) with
    ``snapshot_download`` and returns the ``features`` sub-directory.

    Local mode (USE_HF_HUB is false): nothing is downloaded. A missing
    directory only produces a printed warning; the path is returned anyway.

    Returns:
        Path: Local path to the features directory
    """
    if USE_HF_HUB:
        # For HF Hub, features are in the same repo as results
        print(f"π₯ Downloading features from HuggingFace Hub: {HF_OUTPUT_REPO_ID}")
        snapshot_dir = snapshot_download(
            repo_id=HF_OUTPUT_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            allow_patterns=["features/**"],
            cache_dir=HF_CACHE_DIR,
        )
        features_path = Path(snapshot_dir) / "features"
        print(f"β Features cached at: {features_path}")
        return features_path

    # Local development. Priority: 1) LOCAL_FEATURES_PATH env var,
    # 2) ../output/features, 3) /home/eee/qzz/TIME/output/features
    if LOCAL_FEATURES_PATH:
        local_path = Path(LOCAL_FEATURES_PATH)
    else:
        local_path = Path("../output/features")
        if not local_path.exists():
            # NOTE(review): machine-specific absolute fallback; it only
            # resolves on a host that has this exact directory.
            local_path = Path("/home/eee/qzz/TIME/output/features")
    if not local_path.exists():
        print(f"β οΈ Warning: Local features path does not exist: {local_path}")
    return local_path
def clear_cache():
    """
    Clear any in-process LRU cache held by the snapshot download helpers.

    ``cache_clear`` only exists on functions wrapped with
    ``functools.lru_cache``. The helpers are looked up defensively so this
    function is a harmless no-op (instead of raising AttributeError) when a
    helper is a plain, undecorated function.

    Only in-memory memoization is reset here; files already stored on disk by
    the Hub client are not removed.
    """
    for loader in (download_results_snapshot, download_datasets_snapshot):
        reset = getattr(loader, "cache_clear", None)
        if callable(reset):
            reset()
# =============================================================================
# Initialization - pre-download data at app startup (call initialize_data())
# =============================================================================
def initialize_data():
    """
    Resolve the data needed to start the app and report where it lives.

    Call this at app startup to pre-download data. Results (required for the
    leaderboard) are resolved first, then the config (required for dataset
    settings). Datasets are intentionally not fetched here; they are
    downloaded on demand when visualization is needed, to reduce initial
    load time.

    Returns:
        tuple: ``(results_root, config_root)``
    """
    print("π Initializing TIME Leaderboard data...")

    # Resolve and report one root at a time so each status line is printed
    # right after the corresponding lookup finishes.
    startup_steps = (
        ("Results", get_results_root),
        ("Config", get_config_root),
    )
    resolved = []
    for label, resolve in startup_steps:
        root = resolve()
        print(f" {label}: {root}")
        resolved.append(root)

    print("β Initialization complete!")
    results_root, config_root = resolved
    return results_root, config_root