File size: 5,423 Bytes
0b97f6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
import pandas as pd

# Import HuggingFace Hub configuration
from src.hf_config import get_results_root, get_config_root, get_features_root, initialize_data

from src.utils import (
    get_all_datasets_results, get_all_domains_and_freq, get_all_variates_results,
    get_dataset_choices, get_dataset_display_map, compute_ranks,
    load_features, load_all_features, binarize_features
)


# =============================================================================
# Initialize data from HuggingFace Hub (or local for development)
# =============================================================================
print("๐Ÿš€ Starting TIME Leaderboard initialization...")

# Download/cache results and config from HuggingFace Hub.
# NOTE(review): presumably falls back to local copies during development —
# confirm against src.hf_config.initialize_data.
RESULTS_ROOT, CONFIG_ROOT = initialize_data()

# Root directory holding per-variate tsfeatures (local or HF-cached).
FEATURES_ROOT = get_features_root()

# One sub-directory per evaluated model under RESULTS_ROOT.
# sorted() makes the model list (and the printed log line) deterministic:
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order.
ALL_MODELS = []
if RESULTS_ROOT.exists():
    ALL_MODELS = sorted(p.name for p in RESULTS_ROOT.iterdir() if p.is_dir())
    print(f"๐Ÿ“Š Found {len(ALL_MODELS)} models: {ALL_MODELS}")

# ---------------------------------------------------
# Get dataset choices from TIME results (with smart display names)
DATASET_CHOICES, DATASET_DISPLAY_TO_ID, DATASET_ID_TO_DISPLAY = get_dataset_choices(str(RESULTS_ROOT))
print(f"๐Ÿ“ Found {len(DATASET_CHOICES)} dataset configurations")

# === Load data once at startup ===
# Dataset-level results: one row per (model, dataset_id, horizon) experiment.
DATASETS_DF = get_all_datasets_results(root_dir=str(RESULTS_ROOT))
if not DATASETS_DF.empty:
    # Use dataset_id (dataset/freq) for ranking to correctly handle multi-freq datasets
    DATASETS_DF = compute_ranks(DATASETS_DF, groupby_cols=['dataset_id', "horizon"])  # Rows: each row is one independent experiment — num_model x num_dataset_id x num_horizons
    print(f"โœ… Loaded {len(DATASETS_DF)} dataset results")

# === Load variate-level results for pattern-based leaderboard ===
# Finer granularity than DATASETS_DF: results per individual variate,
# needed to aggregate across datasets by temporal pattern.
print("๐Ÿ“Š Loading variate-level results...")
VARIATES_DF = get_all_variates_results(root_dir=str(RESULTS_ROOT))
if not VARIATES_DF.empty:
    # Compute ranks per (dataset_id, series_name, variate_name, horizon)
    VARIATES_DF = compute_ranks(VARIATES_DF, groupby_cols=['dataset_id', 'series_name', 'variate_name', 'horizon'])
    print(f"โœ… Loaded {len(VARIATES_DF)} variate-level results")
else:
    print("โš ๏ธ No variate-level results found")

# === Load features for pattern-based filtering ===
# Per-variate tsfeatures on the test split; used below for binarization
# and for the PATTERN_MAP-driven grouping.
print("๐Ÿ“Š Loading features...")
FEATURES_DF = load_all_features(features_root=str(FEATURES_ROOT), split="test")
if not FEATURES_DF.empty:
    print(f"โœ… Loaded {len(FEATURES_DF)} variate features")
else:
    print("โš ๏ธ No features found")

# Feature columns that must not be median-binarized: identifiers, raw
# summary statistics, and features that are already boolean.
BINARIZE_EXCLUDE = [
    # Identifier columns
    'dataset_id', 'series_name', 'variate_name', 'unique_id',
    # Raw summary statistics
    'mean', 'std', 'length',
    # Detected seasonal periods and their strengths
    'period1', 'period2', 'period3',
    'p_strength1', 'p_strength2', 'p_strength3',
    'missing_rate',
    # Meta features are already 0/1, handle separately
    'is_random_walk', 'has_spike_presence',
]

# Median-binarize the numeric feature columns; keep an empty frame when no
# features were loaded so downstream code can test .empty uniformly.
if FEATURES_DF.empty:
    FEATURES_BOOL_DF = pd.DataFrame()
else:
    FEATURES_BOOL_DF = binarize_features(FEATURES_DF, exclude=BINARIZE_EXCLUDE)
    print(f"โœ… Binarized features for {len(FEATURES_BOOL_DF)} variates")


# Columns shown in the overall leaderboard table. Rank columns are only
# meaningful once dataset-level results have been loaded and ranked above.
OVERALL_TABLE_COLUMNS = ["model", "MASE", "CRPS"]
if not DATASETS_DF.empty:
    OVERALL_TABLE_COLUMNS = OVERALL_TABLE_COLUMNS + ["MASE_rank", "CRPS_rank"]


# Forecast horizon buckets used throughout the UI.
ALL_HORIZONS = ['short', 'medium', 'long']

# Pattern mapping: UI pattern name -> feature column name
# Keys are the pattern labels exposed in the UI; values are column names in
# FEATURES_DF / FEATURES_BOOL_DF produced by the tsfeatures pipeline.
PATTERN_MAP = {
    # Trend patterns
    "T_strength": "trend_strength",
    "T_linearity": "linearity",
    "T_curvature": "curvature",
    # Seasonal patterns
    "S_strength": "seasonal_strength",
    "S_complexity": "seasonal_entropy",
    "S_corr": "seasonal_corr",
    # Residual patterns (autocorrelation of residuals / differenced residuals)
    "R_diff1_ACF1": "e_diff1_acf1",
    "R_ACF1": "e_acf1",
    # Meta patterns
    "stationarity": "is_random_walk",  # Note: stationarity = NOT is_random_walk
    "outlier_presence": "has_spike_presence",
    "complexity": "x_entropy",  # High entropy = low predictability/high noise
}
# ---------------------------------------------------


# Your leaderboard name
# HTML heading rendered at the top of the page (presumably consumed by the
# UI layer elsewhere — confirm against the app entry point).
TITLE = """<h1 align="center" id="space-title"> It's TIME</h1>"""

# What does your leaderboard evaluate?
# Markdown blurb shown under the title; describes the benchmark's
# multi-granularity and pattern-driven evaluation.
INTRODUCTION_TEXT = """
TIME introduces a unified benchmark for time series probabilistic forecasting that supports evaluation at **multiple granularities**, ranging from overall performance across datasets to dataset-level, variate-level, and even individual test windows (with visualization). Beyond conventional analysis, the benchmark enables **pattern-driven, cross-dataset benchmarking** by grouping variates with similar temporal features, where patterns are defined based on groups of tsfeatures that capture properties such as trend, seasonality, and stationarity, offering a more systematic understanding of model behavior. For data and results, please refer to ๐Ÿค— [dataset](https://huggingface.co/datasets/TIME-benchmark/TIME-1.0/tree/main).
"""
# An integrated archive further enriches the platform by providing structural tsfeatures and statistical descriptors of all variates,
# ensuring both comprehensive evaluation and transparent interpretability across diverse forecasting scenarios
print("โœ… TIME Leaderboard initialization complete!")