# Source: TIME-leaderboard / src/hf_config.py
# Last commit by zqiao11: "Fit new hf Dataset" (3e5a1a0)
"""
HuggingFace Hub Configuration and Helper Functions
This module provides configuration and utilities for loading data from HuggingFace Hub.
The data is cached locally after first download, so subsequent accesses are fast.
"""
import os
from functools import lru_cache
from pathlib import Path
from huggingface_hub import snapshot_download
# =============================================================================
# Configuration
# =============================================================================
# Dataset repositories on the HuggingFace Hub; each can be overridden through
# an environment variable of the same name.
HF_REPO_ID = os.getenv("HF_REPO_ID", "Real-TSF/TIME")
HF_OUTPUT_REPO_ID = os.getenv("HF_OUTPUT_REPO_ID", "Real-TSF/TIME-Output")

# Access token, read from the environment so it never lives in the source.
# On a HuggingFace Space, define it under Settings -> Repository secrets.
HF_TOKEN = os.getenv("HF_TOKEN")

# Data source switch: True reads from the HuggingFace Hub, False reads local
# files (for local development). Only the value "true" (any letter case)
# enables the Hub; every other value disables it.
USE_HF_HUB = "true" == os.getenv("USE_HF_HUB", "true").lower()

# Cache directory handed to Hub downloads.
# None falls back to the default ~/.cache/huggingface.
HF_CACHE_DIR = os.getenv("HF_CACHE_DIR")

# Optional local data locations, consulted when USE_HF_HUB is false.
# Each stays None unless the matching environment variable is set.
LOCAL_RESULTS_PATH = os.getenv("LOCAL_RESULTS_PATH")    # Path to output/results
LOCAL_FEATURES_PATH = os.getenv("LOCAL_FEATURES_PATH")  # Path to output/features
LOCAL_CONFIG_PATH = os.getenv("LOCAL_CONFIG_PATH")      # Path to config directory
LOCAL_DATASETS_PATH = os.getenv("LOCAL_DATASETS_PATH")  # Path to data/hf_dataset
# =============================================================================
# Helper Functions
# =============================================================================
@lru_cache(maxsize=1)
def download_results_snapshot() -> Path:
    """
    Return the local results directory, downloading it from the Hub if needed.

    When ``USE_HF_HUB`` is false nothing is downloaded: the path from
    ``LOCAL_RESULTS_PATH`` is used if set, otherwise ``../output/results``
    (resolved against the current working directory). A warning is printed
    if that directory does not exist, but the path is returned regardless.

    When ``USE_HF_HUB`` is true, the ``results/**`` files of
    ``HF_OUTPUT_REPO_ID`` are fetched with ``snapshot_download`` and the
    ``results`` sub-directory of the snapshot is returned.

    The return value is memoized with ``lru_cache``, so the body runs once
    per process until ``download_results_snapshot.cache_clear()`` is called.

    Returns:
        Path: Local path to the results directory.
    """
    if not USE_HF_HUB:
        # Local development.
        # Priority: 1) LOCAL_RESULTS_PATH env var, 2) ../output/results
        local_path = Path(LOCAL_RESULTS_PATH or "../output/results")
        if not local_path.exists():
            print(f"⚠️ Warning: Local results path does not exist: {local_path}")
        return local_path

    print(f"📥 Downloading results from HuggingFace Hub: {HF_OUTPUT_REPO_ID}")
    local_dir = snapshot_download(
        repo_id=HF_OUTPUT_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        # Restrict the snapshot to the results tree.
        allow_patterns=["results/**"],
        cache_dir=HF_CACHE_DIR,
    )
    results_path = Path(local_dir) / "results"
    print(f"✅ Results cached at: {results_path}")
    return results_path
@lru_cache(maxsize=1)
def download_datasets_snapshot() -> Path:
    """
    Return the local hf_dataset directory, downloading it from the Hub if needed.

    When ``USE_HF_HUB`` is false nothing is downloaded: the path from
    ``LOCAL_DATASETS_PATH`` is used if set, otherwise ``../data/hf_dataset``
    (resolved against the current working directory). A warning is printed
    if that directory does not exist, but the path is returned regardless.

    When ``USE_HF_HUB`` is true, ``HF_REPO_ID`` is fetched with
    ``snapshot_download`` (no ``allow_patterns`` filter) and the snapshot
    root is returned.

    The return value is memoized with ``lru_cache``, so the body runs once
    per process until ``download_datasets_snapshot.cache_clear()`` is called.

    Returns:
        Path: Local path to the hf_dataset directory.
    """
    if not USE_HF_HUB:
        # Local development.
        # Priority: 1) LOCAL_DATASETS_PATH env var, 2) ../data/hf_dataset
        local_path = Path(LOCAL_DATASETS_PATH or "../data/hf_dataset")
        if not local_path.exists():
            print(f"⚠️ Warning: Local datasets path does not exist: {local_path}")
        return local_path

    print(f"📥 Downloading datasets from HuggingFace Hub: {HF_REPO_ID}")
    local_dir = snapshot_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        allow_patterns=None,
        cache_dir=HF_CACHE_DIR,
    )
    datasets_path = Path(local_dir)
    print(f"✅ Datasets cached at: {datasets_path}")
    return datasets_path
def download_config_snapshot() -> Path:
    """
    Locate the config directory; nothing is actually downloaded.

    Lookup order:
      1. The parent directory of ``DEFAULT_CONFIG_PATH`` from the installed
         ``timebench`` package, if the package imports and that directory
         exists.
      2. ``LOCAL_CONFIG_PATH`` (environment variable), if set and existing.

    Returns:
        Path: Local path to the config directory.

    Raises:
        FileNotFoundError: If neither location yields an existing directory.
    """
    # Imported lazily so this module still loads when timebench is absent.
    try:
        from timebench.evaluation.data import DEFAULT_CONFIG_PATH
    except ImportError as exc:
        # Report the actual import failure, not the exception class itself.
        print(f"❌ ImportError: {exc}, using local config")
    else:
        config_path = DEFAULT_CONFIG_PATH.parent  # directory holding the config file
        if config_path.exists():
            return config_path

    # Fallback for local development: only LOCAL_CONFIG_PATH is consulted.
    if LOCAL_CONFIG_PATH:
        local_path = Path(LOCAL_CONFIG_PATH)
        if local_path.exists():
            print(f"📁 Using local config: {local_path}")
            return local_path

    raise FileNotFoundError(
        "Config directory not found. Please ensure timebench is installed, "
        "or set LOCAL_CONFIG_PATH environment variable to point to your config directory."
    )
def get_results_root() -> Path:
    """Return the results root directory (Hub snapshot or local path)."""
    results_root = download_results_snapshot()
    return results_root
def get_datasets_root() -> Path:
    """Return the hf_dataset root directory (Hub snapshot or local path)."""
    datasets_root = download_datasets_snapshot()
    return datasets_root
def get_config_root() -> Path:
    """Return the config root directory as resolved by download_config_snapshot()."""
    config_root = download_config_snapshot()
    return config_root
def get_features_root() -> Path:
    """
    Return the local features directory, downloading it from the Hub if needed.

    Features are stored at output/features/{dataset}/{freq}/test.csv

    When ``USE_HF_HUB`` is false nothing is downloaded: the path from
    ``LOCAL_FEATURES_PATH`` is used if set; otherwise ``../output/features``
    and then a hard-coded absolute fallback are tried. A warning is printed
    if the chosen directory does not exist, but the path is returned
    regardless.

    When ``USE_HF_HUB`` is true, the ``features/**`` files of
    ``HF_OUTPUT_REPO_ID`` are fetched with ``snapshot_download``. Unlike the
    results/datasets helpers, this function is not memoized, so every call
    goes through ``snapshot_download`` again.

    Returns:
        Path: Local path to the features directory.
    """
    if not USE_HF_HUB:
        # Local development.
        # Priority: 1) LOCAL_FEATURES_PATH env var, 2) ../output/features,
        #           3) /home/eee/qzz/TIME/output/features
        if LOCAL_FEATURES_PATH:
            local_path = Path(LOCAL_FEATURES_PATH)
        else:
            local_path = Path("../output/features")
            if not local_path.exists():
                # NOTE(review): machine-specific absolute path; prefer setting
                # LOCAL_FEATURES_PATH. Kept so existing setups keep working.
                local_path = Path("/home/eee/qzz/TIME/output/features")
        if not local_path.exists():
            print(f"⚠️ Warning: Local features path does not exist: {local_path}")
        return local_path

    # On the Hub, features live in the same repo as results.
    print(f"📥 Downloading features from HuggingFace Hub: {HF_OUTPUT_REPO_ID}")
    local_dir = snapshot_download(
        repo_id=HF_OUTPUT_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        # Restrict the snapshot to the features tree.
        allow_patterns=["features/**"],
        cache_dir=HF_CACHE_DIR,
    )
    features_path = Path(local_dir) / "features"
    print(f"✅ Features cached at: {features_path}")
    return features_path
def clear_cache():
    """Drop the memoized snapshot paths so the next access runs the download again."""
    for cached_fn in (download_results_snapshot, download_datasets_snapshot):
        cached_fn.cache_clear()
# =============================================================================
# Initialization - Pre-download data at app startup (call initialize_data() explicitly)
# =============================================================================
def initialize_data():
    """
    Resolve the results and config directories up front.

    Call this at app startup to pre-download data. Datasets are deliberately
    not fetched here; they are downloaded on demand when visualization needs
    them, to reduce initial load time.

    Returns:
        tuple: ``(results_root, config_root)`` as returned by
        ``get_results_root()`` and ``get_config_root()``.

    Raises:
        FileNotFoundError: Propagated from ``get_config_root()`` when no
            config directory can be found.
    """
    print("🚀 Initializing TIME Leaderboard data...")

    # Results are required for the leaderboard.
    results_root = get_results_root()
    print(f" Results: {results_root}")

    # Config is required for dataset settings.
    config_root = get_config_root()
    print(f" Config: {config_root}")

    print("✅ Initialization complete!")
    return results_root, config_root