Spaces:

Real-TSF
/

TIME-leaderboard

Running

App Files Files Community

TIME-leaderboard / src /hf_config.py

zqiao11

Fit new hf Dataset

3e5a1a0 17 days ago

raw

history blame contribute delete

8.18 kB

	"""
	HuggingFace Hub Configuration and Helper Functions

	This module provides configuration and utilities for loading data from HuggingFace Hub.
	The data is cached locally after first download, so subsequent accesses are fast.
	"""

	import os
	from functools import lru_cache
	from pathlib import Path

	from huggingface_hub import snapshot_download

	# =============================================================================
	# Configuration
	# =============================================================================

	# Dataset repositories on the HuggingFace Hub. Each ID can be overridden
	# through the environment variable of the same name.
	HF_REPO_ID = os.getenv("HF_REPO_ID", "Real-TSF/TIME")
	HF_OUTPUT_REPO_ID = os.getenv("HF_OUTPUT_REPO_ID", "Real-TSF/TIME-Output")

	# Access token, read from the environment so it never lives in the code.
	# On a HuggingFace Space, define it under Settings -> Repository secrets.
	HF_TOKEN = os.getenv("HF_TOKEN")

	# Data source switch: HuggingFace Hub when True, local files when False.
	# Only the string "true" (in any letter case) enables the Hub; every other
	# value selects local data, which is intended for local development.
	USE_HF_HUB = os.getenv("USE_HF_HUB", "true").lower() == "true"

	# Cache directory for Hub downloads; None uses the default ~/.cache/huggingface.
	HF_CACHE_DIR = os.getenv("HF_CACHE_DIR")

	# Optional local data locations, consulted when USE_HF_HUB is false.
	# Leave a variable unset (None) to let the helper functions pick their default.
	LOCAL_RESULTS_PATH = os.getenv("LOCAL_RESULTS_PATH")  # output/results
	LOCAL_FEATURES_PATH = os.getenv("LOCAL_FEATURES_PATH")  # output/features
	LOCAL_CONFIG_PATH = os.getenv("LOCAL_CONFIG_PATH")  # config directory
	LOCAL_DATASETS_PATH = os.getenv("LOCAL_DATASETS_PATH")  # data/hf_dataset

	# =============================================================================
	# Helper Functions
	# =============================================================================

	@lru_cache(maxsize=1)
	def download_results_snapshot() -> Path:
	    """
	    Resolve the directory that holds the leaderboard results.

	    With USE_HF_HUB enabled, the "results/" subtree of the HF_OUTPUT_REPO_ID
	    dataset repository is fetched with snapshot_download and the "results"
	    directory inside that snapshot is returned.

	    With USE_HF_HUB disabled, nothing is downloaded: the directory named by
	    LOCAL_RESULTS_PATH is returned, or "../output/results" when that variable
	    is unset. A warning is printed if the chosen directory does not exist,
	    but the path is returned regardless.

	    The return value is memoized by lru_cache, so the body runs only once
	    until the cache is cleared.

	    Returns:
	        Path: Local path to the results directory
	    """
	    if USE_HF_HUB:
	        print(f"📥 Downloading results from HuggingFace Hub: {HF_OUTPUT_REPO_ID}")

	        # Restrict the snapshot to the results subtree of the output repo.
	        snapshot_root = snapshot_download(
	            repo_id=HF_OUTPUT_REPO_ID,
	            repo_type="dataset",
	            token=HF_TOKEN,
	            allow_patterns=["results/**"],
	            cache_dir=HF_CACHE_DIR,
	        )

	        cached_results = Path(snapshot_root) / "results"
	        print(f"✅ Results cached at: {cached_results}")
	        return cached_results

	    # Local development: the env var wins, otherwise use the relative default.
	    dev_results = Path(LOCAL_RESULTS_PATH or "../output/results")
	    if not dev_results.exists():
	        print(f"⚠️ Warning: Local results path does not exist: {dev_results}")
	    return dev_results


	@lru_cache(maxsize=1)
	def download_datasets_snapshot() -> Path:
	    """
	    Resolve the directory that holds the hf_dataset data.

	    With USE_HF_HUB enabled, the HF_REPO_ID dataset repository is fetched
	    with snapshot_download (no file pattern filter) and the snapshot root
	    itself is returned.

	    With USE_HF_HUB disabled, nothing is downloaded: the directory named by
	    LOCAL_DATASETS_PATH is returned, or "../data/hf_dataset" when that
	    variable is unset. A warning is printed if the chosen directory does not
	    exist, but the path is returned regardless.

	    The return value is memoized by lru_cache, so the body runs only once
	    until the cache is cleared.

	    Returns:
	        Path: Local path to the hf_dataset directory
	    """
	    if USE_HF_HUB:
	        print(f"📥 Downloading datasets from HuggingFace Hub: {HF_REPO_ID}")

	        snapshot_root = snapshot_download(
	            repo_id=HF_REPO_ID,
	            repo_type="dataset",
	            token=HF_TOKEN,
	            allow_patterns=None,  # no filter
	            cache_dir=HF_CACHE_DIR,
	        )

	        cached_datasets = Path(snapshot_root)
	        print(f"✅ Datasets cached at: {cached_datasets}")
	        return cached_datasets

	    # Local development: the env var wins, otherwise use the relative default.
	    dev_datasets = Path(LOCAL_DATASETS_PATH or "../data/hf_dataset")
	    if not dev_datasets.exists():
	        print(f"⚠️ Warning: Local datasets path does not exist: {dev_datasets}")
	    return dev_datasets


	def download_config_snapshot() -> Path:
	    """
	    Locate the config directory; nothing is downloaded.

	    Lookup order:
	      1. The parent directory of timebench's DEFAULT_CONFIG_PATH, when the
	         timebench package can be imported and that directory exists.
	      2. The directory named by the LOCAL_CONFIG_PATH environment variable,
	         when it is set and exists.

	    Returns:
	        Path: Local path to the config directory

	    Raises:
	        FileNotFoundError: If neither location yields an existing directory.
	    """
	    try:
	        from timebench.evaluation.data import DEFAULT_CONFIG_PATH
	    except ImportError as exc:
	        # Report the actual import failure rather than the exception class.
	        print(f"❌ ImportError: {exc}, using local config")
	    else:
	        config_path = DEFAULT_CONFIG_PATH.parent  # directory containing the config file
	        if config_path.exists():
	            return config_path

	    # Fallback for local development. The existence check lives inside the
	    # guard so that an unset LOCAL_CONFIG_PATH falls through to the
	    # FileNotFoundError below instead of reading an unassigned variable.
	    if LOCAL_CONFIG_PATH:
	        local_path = Path(LOCAL_CONFIG_PATH)
	        if local_path.exists():
	            print(f"📁 Using local config: {local_path}")
	            return local_path

	    raise FileNotFoundError(
	        "Config directory not found. Please ensure timebench is installed, "
	        "set USE_HF_HUB=false for local development, "
	        "or set LOCAL_CONFIG_PATH environment variable to point to your config directory."
	    )


	def get_results_root() -> Path:
	    """
	    Return the root directory of the results.

	    Thin alias for download_results_snapshot(), which chooses between the
	    HuggingFace Hub snapshot and a local directory.
	    """
	    results_dir = download_results_snapshot()
	    return results_dir


	def get_datasets_root() -> Path:
	    """
	    Return the root directory of the hf_dataset data.

	    Thin alias for download_datasets_snapshot(), which chooses between the
	    HuggingFace Hub snapshot and a local directory.
	    """
	    datasets_dir = download_datasets_snapshot()
	    return datasets_dir


	def get_config_root() -> Path:
	    """
	    Return the root directory of the config.

	    Thin alias for download_config_snapshot(); any exception raised there
	    propagates to the caller unchanged.
	    """
	    config_dir = download_config_snapshot()
	    return config_dir


	def get_features_root() -> Path:
	    """
	    Resolve the directory that holds the features.

	    Features are stored at output/features/{dataset}/{freq}/test.csv

	    With USE_HF_HUB enabled, the "features/" subtree of HF_OUTPUT_REPO_ID
	    (the same repository that holds the results) is fetched with
	    snapshot_download and the "features" directory inside it is returned.

	    With USE_HF_HUB disabled, nothing is downloaded. LOCAL_FEATURES_PATH is
	    used when set; otherwise "../output/features" is tried first and
	    "/home/eee/qzz/TIME/output/features" second. A warning is printed when
	    the finally chosen directory does not exist, but it is returned anyway.

	    Unlike the results/datasets helpers, this function is not memoized, so
	    every call repeats the lookup.

	    Returns:
	        Path: Local path to the features directory
	    """
	    if USE_HF_HUB:
	        print(f"📥 Downloading features from HuggingFace Hub: {HF_OUTPUT_REPO_ID}")

	        # Restrict the snapshot to the features subtree of the output repo.
	        snapshot_root = snapshot_download(
	            repo_id=HF_OUTPUT_REPO_ID,
	            repo_type="dataset",
	            token=HF_TOKEN,
	            allow_patterns=["features/**"],
	            cache_dir=HF_CACHE_DIR,
	        )

	        cached_features = Path(snapshot_root) / "features"
	        print(f"✅ Features cached at: {cached_features}")
	        return cached_features

	    # Local development: an explicit env var is the only candidate; without
	    # it, the built-in defaults are tried in priority order.
	    if LOCAL_FEATURES_PATH:
	        candidates = [LOCAL_FEATURES_PATH]
	    else:
	        candidates = ["../output/features", "/home/eee/qzz/TIME/output/features"]

	    for candidate in candidates:
	        features_dir = Path(candidate)
	        if features_dir.exists():
	            break
	    else:
	        # No candidate exists: warn about the last one tried and return it.
	        print(f"⚠️ Warning: Local features path does not exist: {features_dir}")
	    return features_dir


	def clear_cache():
	    """
	    Reset the memoized snapshot paths.

	    Drops the lru_cache entries of download_results_snapshot and
	    download_datasets_snapshot, so the next call to either one runs its
	    body again instead of returning the remembered path.
	    """
	    for cached_loader in (download_results_snapshot, download_datasets_snapshot):
	        cached_loader.cache_clear()


	# =============================================================================
	# Initialization - Pre-download data at app startup
	# =============================================================================

	def initialize_data():
	    """
	    Resolve the data the leaderboard needs before it starts serving.

	    Meant to be called once at app startup. Looks up the results root
	    first and the config root second, printing each path as it becomes
	    available. Datasets are not touched here; they are fetched on demand
	    when a visualization needs them, which keeps the initial load short.

	    Returns:
	        tuple: (results root, config root), both as Path objects
	    """
	    print("🚀 Initializing TIME Leaderboard data...")

	    # Results feed the leaderboard itself.
	    results_dir = get_results_root()
	    print(f" Results: {results_dir}")

	    # Config provides the dataset settings.
	    config_dir = get_config_root()
	    print(f" Config: {config_dir}")

	    print("✅ Initialization complete!")
	    return results_dir, config_dir