File size: 5,423 Bytes
0b97f6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
import pandas as pd

# Import HuggingFace Hub configuration
from src.hf_config import get_results_root, get_config_root, get_features_root, initialize_data

from src.utils import (
    get_all_datasets_results, get_all_domains_and_freq, get_all_variates_results,
    get_dataset_choices, get_dataset_display_map, compute_ranks,
    load_features, load_all_features, binarize_features
)


# =============================================================================
# Initialize data from HuggingFace Hub (or local for development)
# =============================================================================
print("๐Ÿš€ Starting TIME Leaderboard initialization...")

# Download/cache results and config from HuggingFace Hub.
# NOTE(review): presumably falls back to local copies during development —
# confirm against src.hf_config.initialize_data.
RESULTS_ROOT, CONFIG_ROOT = initialize_data()

# Root directory holding per-variate tsfeatures (local or HF-cached).
FEATURES_ROOT = get_features_root()

# One sub-directory per evaluated model under RESULTS_ROOT.
# sorted() makes the model list (and the printed log line) deterministic:
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order.
ALL_MODELS = []
if RESULTS_ROOT.exists():
    ALL_MODELS = sorted(p.name for p in RESULTS_ROOT.iterdir() if p.is_dir())
    print(f"๐Ÿ“Š Found {len(ALL_MODELS)} models: {ALL_MODELS}")

# ---------------------------------------------------
# Get dataset choices from TIME results (with smart display names)
DATASET_CHOICES, DATASET_DISPLAY_TO_ID, DATASET_ID_TO_DISPLAY = get_dataset_choices(str(RESULTS_ROOT))
print(f"๐Ÿ“ Found {len(DATASET_CHOICES)} dataset configurations")

# === Load data once at startup ===
# Dataset-level results: one row per (model, dataset_id, horizon) experiment.
DATASETS_DF = get_all_datasets_results(root_dir=str(RESULTS_ROOT))
if not DATASETS_DF.empty:
    # Use dataset_id (dataset/freq) for ranking to correctly handle multi-freq datasets
    DATASETS_DF = compute_ranks(DATASETS_DF, groupby_cols=['dataset_id', "horizon"])  # Rows: each row is one independent experiment — num_model x num_dataset_id x num_horizons
    print(f"โœ… Loaded {len(DATASETS_DF)} dataset results")

# === Load variate-level results for pattern-based leaderboard ===
# Finer granularity than DATASETS_DF: results per individual variate,
# needed to aggregate across datasets by temporal pattern.
print("๐Ÿ“Š Loading variate-level results...")
VARIATES_DF = get_all_variates_results(root_dir=str(RESULTS_ROOT))
if not VARIATES_DF.empty:
    # Compute ranks per (dataset_id, series_name, variate_name, horizon)
    VARIATES_DF = compute_ranks(VARIATES_DF, groupby_cols=['dataset_id', 'series_name', 'variate_name', 'horizon'])
    print(f"โœ… Loaded {len(VARIATES_DF)} variate-level results")
else:
    print("โš ๏ธ No variate-level results found")

# === Load features for pattern-based filtering ===
# Per-variate tsfeatures on the test split; used below for binarization
# and for the PATTERN_MAP-driven grouping.
print("๐Ÿ“Š Loading features...")
FEATURES_DF = load_all_features(features_root=str(FEATURES_ROOT), split="test")
if not FEATURES_DF.empty:
    print(f"โœ… Loaded {len(FEATURES_DF)} variate features")
else:
    print("โš ๏ธ No features found")

# Feature columns that must not be median-binarized: identifiers, raw
# summary statistics, and features that are already boolean.
BINARIZE_EXCLUDE = [
    # Identifier columns
    'dataset_id', 'series_name', 'variate_name', 'unique_id',
    # Raw summary statistics
    'mean', 'std', 'length',
    # Detected seasonal periods and their strengths
    'period1', 'period2', 'period3',
    'p_strength1', 'p_strength2', 'p_strength3',
    'missing_rate',
    # Meta features are already 0/1, handle separately
    'is_random_walk', 'has_spike_presence',
]

# Median-binarize the numeric feature columns; keep an empty frame when no
# features were loaded so downstream code can test .empty uniformly.
if FEATURES_DF.empty:
    FEATURES_BOOL_DF = pd.DataFrame()
else:
    FEATURES_BOOL_DF = binarize_features(FEATURES_DF, exclude=BINARIZE_EXCLUDE)
    print(f"โœ… Binarized features for {len(FEATURES_BOOL_DF)} variates")


# Columns shown in the overall leaderboard table. Rank columns are only
# meaningful once dataset-level results have been loaded and ranked above.
OVERALL_TABLE_COLUMNS = ["model", "MASE", "CRPS"]
if not DATASETS_DF.empty:
    OVERALL_TABLE_COLUMNS = OVERALL_TABLE_COLUMNS + ["MASE_rank", "CRPS_rank"]


# Forecast horizon buckets used throughout the UI.
ALL_HORIZONS = ['short', 'medium', 'long']

# Pattern mapping: UI pattern name -> feature column name
# Keys are the pattern labels exposed in the UI; values are column names in
# FEATURES_DF / FEATURES_BOOL_DF produced by the tsfeatures pipeline.
PATTERN_MAP = {
    # Trend patterns
    "T_strength": "trend_strength",
    "T_linearity": "linearity",
    "T_curvature": "curvature",
    # Seasonal patterns
    "S_strength": "seasonal_strength",
    "S_complexity": "seasonal_entropy",
    "S_corr": "seasonal_corr",
    # Residual patterns (autocorrelation of residuals / differenced residuals)
    "R_diff1_ACF1": "e_diff1_acf1",
    "R_ACF1": "e_acf1",
    # Meta patterns
    "stationarity": "is_random_walk",  # Note: stationarity = NOT is_random_walk
    "outlier_presence": "has_spike_presence",
    "complexity": "x_entropy",  # High entropy = low predictability/high noise
}
# ---------------------------------------------------


# Your leaderboard name
# HTML heading rendered at the top of the page (presumably consumed by the
# UI layer elsewhere — confirm against the app entry point).
TITLE = """<h1 align="center" id="space-title"> It's TIME</h1>"""

# What does your leaderboard evaluate?
# Markdown blurb shown under the title; describes the benchmark's
# multi-granularity and pattern-driven evaluation.
INTRODUCTION_TEXT = """
TIME introduces a unified benchmark for time series probabilistic forecasting that supports evaluation at **multiple granularities**, ranging from overall performance across datasets to dataset-level, variate-level, and even individual test windows (with visualization). Beyond conventional analysis, the benchmark enables **pattern-driven, cross-dataset benchmarking** by grouping variates with similar temporal features, where patterns are defined based on groups of tsfeatures that capture properties such as trend, seasonality, and stationarity, offering a more systematic understanding of model behavior. For data and results, please refer to ๐Ÿค— [dataset](https://huggingface.co/datasets/TIME-benchmark/TIME-1.0/tree/main).
"""
# An integrated archive further enriches the platform by providing structural tsfeatures and statistical descriptors of all variates,
# ensuring both comprehensive evaluation and transparent interpretability across diverse forecasting scenarios
print("โœ… TIME Leaderboard initialization complete!")