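# NOTE: the next two comment lines replace page-header residue from the hosting
# site (committer "zqiao11", commit message "Initial release", revision 0b97f6a).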
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
import pandas as pd
# Import HuggingFace Hub configuration
from src.hf_config import get_results_root, get_config_root, get_features_root, initialize_data
from src.utils import (
get_all_datasets_results, get_all_domains_and_freq, get_all_variates_results,
get_dataset_choices, get_dataset_display_map, compute_ranks,
load_features, load_all_features, binarize_features
)
# =============================================================================
# Initialize data from HuggingFace Hub (or local for development)
# =============================================================================
print("🚀 Starting TIME Leaderboard initialization...")

# Download/cache results and config from HuggingFace Hub.
# initialize_data() yields a pair that is unpacked into the results root and
# the config root, in that order.
RESULTS_ROOT, CONFIG_ROOT = initialize_data()

# Get features root (local or HF)
FEATURES_ROOT = get_features_root()
# Get list of all models from results directory: each immediate sub-directory
# of RESULTS_ROOT contributes its name. The list stays empty when the results
# root does not exist.
ALL_MODELS = []
if RESULTS_ROOT.exists():
    ALL_MODELS = [p.name for p in RESULTS_ROOT.iterdir() if p.is_dir()]
    # NOTE(review): indentation was lost in the reviewed copy; this summary is
    # assumed to be printed only when the results root exists - confirm.
    print(f"📊 Found {len(ALL_MODELS)} models: {ALL_MODELS}")
# ---------------------------------------------------
# Get dataset choices from TIME results (with smart display names).
# get_dataset_choices() returns three values; judging by the names they are
# bound to, these are the choices themselves plus display->id and id->display
# lookups (TODO confirm against src.utils).
DATASET_CHOICES, DATASET_DISPLAY_TO_ID, DATASET_ID_TO_DISPLAY = get_dataset_choices(
    str(RESULTS_ROOT)
)
print(f"📁 Found {len(DATASET_CHOICES)} dataset configurations")
# === Load data once at startup ===
DATASETS_DF = get_all_datasets_results(root_dir=str(RESULTS_ROOT))
if not DATASETS_DF.empty:
    # Use dataset_id (dataset/freq) for ranking to correctly handle multi-freq
    # datasets.
    # Rows: each row is one independent experiment
    # (num_model x num_dataset_id x num_horizons).
    DATASETS_DF = compute_ranks(DATASETS_DF, groupby_cols=["dataset_id", "horizon"])
    # NOTE(review): indentation was lost in the reviewed copy; this message is
    # assumed to be printed only when results were loaded - confirm.
    print(f"✅ Loaded {len(DATASETS_DF)} dataset results")
# === Load variate-level results for pattern-based leaderboard ===
print("📊 Loading variate-level results...")
VARIATES_DF = get_all_variates_results(root_dir=str(RESULTS_ROOT))
if not VARIATES_DF.empty:
    # Compute ranks per (dataset_id, series_name, variate_name, horizon)
    VARIATES_DF = compute_ranks(
        VARIATES_DF,
        groupby_cols=["dataset_id", "series_name", "variate_name", "horizon"],
    )
    print(f"✅ Loaded {len(VARIATES_DF)} variate-level results")
else:
    # An empty frame is kept as-is; only a warning is emitted.
    print("⚠️ No variate-level results found")
# === Load features for pattern-based filtering ===
print("📊 Loading features...")
# Only the "test" split is requested from the features root.
FEATURES_DF = load_all_features(features_root=str(FEATURES_ROOT), split="test")
if not FEATURES_DF.empty:
    print(f"✅ Loaded {len(FEATURES_DF)} variate features")
else:
    # An empty frame is kept as-is; only a warning is emitted.
    print("⚠️ No features found")
# Columns to exclude from binarization (handed to binarize_features as its
# `exclude` argument). Assembled from small groups so related columns stay
# together; the resulting list order matches the group order below.
BINARIZE_EXCLUDE = list(
    # Key-like columns (presumably row identifiers, going by their names)
    ("dataset_id", "series_name", "variate_name", "unique_id")
    # Summary columns
    + ("mean", "std", "length")
    # Period columns and their strength counterparts
    + ("period1", "period2", "period3")
    + ("p_strength1", "p_strength2", "p_strength3")
    + ("missing_rate",)
    # Meta features are already 0/1, handle separately
    + ("is_random_walk", "has_spike_presence")
)
# Binarize numeric features by median.
# FEATURES_BOOL_DF defaults to an empty DataFrame so it is always defined,
# even when no features were loaded.
FEATURES_BOOL_DF = pd.DataFrame()
if not FEATURES_DF.empty:
    FEATURES_BOOL_DF = binarize_features(FEATURES_DF, exclude=BINARIZE_EXCLUDE)
    # NOTE(review): indentation was lost in the reviewed copy; this message is
    # assumed to be printed only when binarization actually ran - confirm.
    print(f"✅ Binarized features for {len(FEATURES_BOOL_DF)} variates")
# Overall-table column list: the three base columns are always present; the
# two rank columns are appended only when dataset-level results were loaded.
OVERALL_TABLE_COLUMNS = ["model", "MASE", "CRPS"]
if not DATASETS_DF.empty:
    OVERALL_TABLE_COLUMNS += ["MASE_rank", "CRPS_rank"]

# Horizon labels, in the order short -> medium -> long.
ALL_HORIZONS = ["short", "medium", "long"]
# Pattern mapping: UI pattern name -> feature column name.
# The mapping is declared per pattern family and merged below; the merge order
# (trend, seasonal, residual, meta) fixes the key order of PATTERN_MAP.
_TREND_PATTERNS = {
    "T_strength": "trend_strength",
    "T_linearity": "linearity",
    "T_curvature": "curvature",
}
_SEASONAL_PATTERNS = {
    "S_strength": "seasonal_strength",
    "S_complexity": "seasonal_entropy",
    "S_corr": "seasonal_corr",
}
_RESIDUAL_PATTERNS = {
    "R_diff1_ACF1": "e_diff1_acf1",
    "R_ACF1": "e_acf1",
}
_META_PATTERNS = {
    # Note: stationarity = NOT is_random_walk
    "stationarity": "is_random_walk",
    "outlier_presence": "has_spike_presence",
    # High entropy = low predictability/high noise
    "complexity": "x_entropy",
}

PATTERN_MAP = {
    **_TREND_PATTERNS,
    **_SEASONAL_PATTERNS,
    **_RESIDUAL_PATTERNS,
    **_META_PATTERNS,
}
# ---------------------------------------------------
# Your leaderboard name (HTML heading markup)
TITLE = """<h1 align="center" id="space-title"> It's TIME</h1>"""

# What does your leaderboard evaluate? (Markdown; the string keeps its leading
# and trailing newline.)
INTRODUCTION_TEXT = """
TIME introduces a unified benchmark for time series probabilistic forecasting that supports evaluation at **multiple granularities**, ranging from overall performance across datasets to dataset-level, variate-level, and even individual test windows (with visualization). Beyond conventional analysis, the benchmark enables **pattern-driven, cross-dataset benchmarking** by grouping variates with similar temporal features, where patterns are defined based on groups of tsfeatures that capture properties such as trend, seasonality, and stationarity, offering a more systematic understanding of model behavior. For data and results, please refer to 🤗 [dataset](https://huggingface.co/datasets/TIME-benchmark/TIME-1.0/tree/main).
"""
# NOTE(review): the two lines below look like a draft sentence that is not part
# of INTRODUCTION_TEXT; kept verbatim.
# An integrated archive further enriches the platform by providing structural tsfeatures and statistical descriptors of all variates,
# ensuring both comprehensive evaluation and transparent interpretability across diverse forecasting scenarios

print("✅ TIME Leaderboard initialization complete!")