zqiao11 committed on
Commit
0b97f6a
·
0 Parent(s):

Initial release

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .idea
8
+ .vscode/
9
+
10
+ eval-queue/
11
+ eval-results/
12
+ eval-queue-bk/
13
+ eval-results-bk/
14
+ logs/
15
+ utils.py
16
+ css_html_js.py
17
+ formatting.py
18
+ run_local.sh
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
Dockerfile ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install git for pip install from GitHub
6
+ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
7
+
8
+ # Copy requirements first for better caching
9
+ COPY requirements.txt .
10
+
11
+ # Install Python dependencies
12
+ RUN pip install --no-cache-dir --upgrade pip && \
13
+ pip install --no-cache-dir -r requirements.txt
14
+
15
+ # Copy application code
16
+ COPY . .
17
+
18
+ # Create startup script that installs timebench at runtime (when secrets are available)
19
+ RUN echo '#!/bin/bash\n\
20
+ if [ -n "$GITHUB_TOKEN" ]; then\n\
21
+ echo "Installing timebench from private GitHub repo..."\n\
22
+ pip install --no-cache-dir git+https://oauth2:${GITHUB_TOKEN}@github.com/zqiao11/TIME.git\n\
23
+ else\n\
24
+ echo "Installing timebench from public GitHub repo..."\n\
25
+ pip install --no-cache-dir git+https://github.com/zqiao11/TIME.git\n\
26
+ fi\n\
27
+ exec python app.py\n' > /app/start.sh && chmod +x /app/start.sh
28
+
29
+ # Expose Gradio default port
30
+ EXPOSE 7860
31
+
32
+ # Set environment variables
33
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
34
+ ENV GRADIO_SERVER_PORT="7860"
35
+
36
+ # Run the startup script
37
+ CMD ["/app/start.sh"]
Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: style format
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
README.md ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: TIME Benchmark Leaderboard
3
+ emoji: 🥇
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: docker
7
+ pinned: true
8
+ license: apache-2.0
9
+ short_description: 'TIME: A Benchmark for Time Series Forecasting'
10
+ ---
11
+
12
+ # TIME Benchmark Leaderboard
13
+
14
+ A unified benchmark for time series probabilistic forecasting with multiple granularity evaluation.
15
+
16
+ ## Features
17
+
18
+ - **Overall Performance**: Aggregated metrics across all datasets and horizons
19
+ - **Dataset-level Analysis**: Performance breakdown by individual datasets
20
+ - **Window-level Visualization**: Detailed test window analysis with prediction visualization
21
+
22
+ ## Configuration
23
+
24
+ ### Environment Variables
25
+
26
+ The app reads data from HuggingFace Hub. Configure the following environment variables:
27
+
28
+ | Variable | Description | Default |
29
+ |----------|-------------|---------|
30
+ | `HF_TOKEN` | HuggingFace API token (required for private datasets) | None |
31
+ | `HF_REPO_ID` | Dataset repository ID | `TIME-benchmark/TIME-1.0` |
32
+ | `USE_HF_HUB` | Use HF Hub (`true`) or local files (`false`) | `true` |
33
+ | `HF_CACHE_DIR` | Custom cache directory for downloads | `~/.cache/huggingface` |
34
+
35
+ ### For HuggingFace Space Deployment
36
+
37
+ #### 快速部署(推荐)
38
+
39
+ ```bash
40
+ # 1. 复制 timebench 模块到 leaderboard_app
41
+ cd /home/eee/qzz/TIME
42
+ cp -r src/timebench leaderboard_app/
43
+
44
+ # 2. 进入 leaderboard_app 目录
45
+ cd leaderboard_app
46
+
47
+ # 3. 运行部署脚本
48
+ chmod +x deploy.sh
49
+ ./deploy.sh YOUR_USERNAME YOUR_SPACE_NAME
50
+ ```
51
+
52
+ #### 手动部署
53
+
54
+ 详细步骤请参考 [DEPLOY.md](DEPLOY.md)
55
+
56
+ **重要**: 部署前需要:
57
+ 1. 创建 HuggingFace Space: https://huggingface.co/new-space
58
+ 2. 在 Space Settings → Repository secrets 中添加 `HF_TOKEN`
59
+ 3. 确保数据已上传到 `TIME-benchmark/TIME-1.0` Dataset
60
+
61
+ ### For Local Development
62
+
63
+ Set `USE_HF_HUB=false` to use local data:
64
+
65
+ ```bash
66
+ export USE_HF_HUB=false
67
+ python app.py
68
+ ```
69
+
70
+ ## Installation
71
+
72
+ ```bash
73
+ pip install -r requirements.txt
74
+ python app.py
75
+ ```
76
+
77
+ ## Data Structure
78
+
79
+ The app expects the following data structure in the HuggingFace Dataset:
80
+
81
+ ```
82
+ HF_REPO/
83
+ ├── data/
84
+ │ └── hf_dataset/ # Time series datasets
85
+ │ ├── ECDC_COVID/
86
+ │ ├── Australia_Solar/
87
+ │ └── ...
88
+ ├── output/
89
+ │ └── results/ # Model evaluation results
90
+ │ ├── moirai_small/
91
+ │ ├── chronos_base/
92
+ │ └── ...
93
+ └── config/
94
+ └── datasets.yaml # Dataset configurations
95
+ ```
96
+
97
+ ## Ethical Considerations
98
+
99
+ This release is for research purposes only in support of an academic paper. Our models, datasets, and code are not specifically designed or evaluated for all downstream purposes. We strongly recommend that users evaluate and address potential concerns related to accuracy, safety, and fairness before deploying these models.
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ # Add project root and src directory to Python path to enable imports from timebench
5
+ # Get the directory containing this file (leaderboard_app/)
6
+ current_dir = os.path.dirname(os.path.abspath(__file__))
7
+
8
+ # Try multiple paths for timebench import:
9
+ # 1. Current directory (if timebench was copied to leaderboard_app/)
10
+ # 2. Parent directory's src (for local development: TIME/src/)
11
+ # 3. Parent's parent's src (if running from leaderboard_app/)
12
+
13
+ # Add current directory first (for Space deployment)
14
+ if current_dir not in sys.path:
15
+ sys.path.insert(0, current_dir)
16
+
17
+ # Add parent directory's src (for local development)
18
+ project_root = os.path.dirname(current_dir)
19
+ if project_root not in sys.path:
20
+ sys.path.insert(0, project_root)
21
+
22
+ src_dir = os.path.join(project_root, "src")
23
+ if src_dir not in sys.path and os.path.exists(src_dir):
24
+ sys.path.insert(0, src_dir)
25
+
26
+ import gradio as gr
27
+ from src.display.css_html_js import custom_css
28
+ from src.about import TITLE, INTRODUCTION_TEXT
29
+ from src.tab import init_overall_tab, init_per_window_tab, init_per_dataset_tab, init_per_pattern_tab
30
+
31
+ # Custom head content for responsive design
32
+ custom_head = """
33
+ <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=5.0, user-scalable=yes">
34
+ <style>
35
+ /* 响应式设计:让页面自动适配不同屏幕尺寸 */
36
+ html {
37
+ width: 100%;
38
+ max-width: 100%;
39
+ }
40
+ body {
41
+ width: 100%;
42
+ max-width: 100%;
43
+ }
44
+ </style>
45
+ """
46
+
47
+ with gr.Blocks(css=custom_css, head=custom_head) as demo:
48
+ gr.HTML(TITLE)
49
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
50
+
51
+ with gr.Tabs(elem_id="custom-tabs") as tabs:
52
+ with gr.Tab("🏅 Overall Performance", id=0):
53
+ init_overall_tab()
54
+
55
+ with gr.Tab("🏅 Per Dataset", id=1):
56
+ init_per_dataset_tab(demo)
57
+
58
+ with gr.Tab("🏅 Per Test Window", id=3):
59
+ init_per_window_tab(demo)
60
+
61
+ with gr.Tab("🏅 Per Pattern", id=4):
62
+ init_per_pattern_tab(demo)
63
+
64
+ # with gr.Tab("📂 Archive", id=5):
65
+ # init_archive_tab(demo)
66
+
67
+
68
+ if __name__ == "__main__":
69
+ demo.launch()
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # TIME Leaderboard Dependencies
3
+ # =============================================================================
4
+
5
+ # Install timebench from public GitHub repo (always fetch latest from main)
6
+ timebench @ git+https://github.com/zqiao11/TIME.git@main
7
+
8
+ # Core dependencies - pinned to match local working environment
9
+ gradio==5.50.0
10
+ gradio_leaderboard==0.0.14
11
+ gradio_client==1.14.0
12
+ huggingface-hub==0.36.0
13
+ datasets==2.17.1
14
+ APScheduler
15
+ matplotlib
16
+ numpy==1.26.4
17
+ plotly==6.5.0
18
+ pandas==2.3.3
19
+ python-dateutil
20
+ python-dotenv
21
+ tqdm
22
+ pyarrow
23
+ pyyaml
24
+ scipy==1.11.4
25
+
26
+ # Note: gluonts and other timebench dependencies are automatically installed
27
+ # via the timebench package
src/about.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ from pathlib import Path
4
+ import pandas as pd
5
+
6
+ # Import HuggingFace Hub configuration
7
+ from src.hf_config import get_results_root, get_config_root, get_features_root, initialize_data
8
+
9
+ from src.utils import (
10
+ get_all_datasets_results, get_all_domains_and_freq, get_all_variates_results,
11
+ get_dataset_choices, get_dataset_display_map, compute_ranks,
12
+ load_features, load_all_features, binarize_features
13
+ )
14
+
15
+
16
+ # =============================================================================
17
+ # Initialize data from HuggingFace Hub (or local for development)
18
+ # =============================================================================
19
+ print("🚀 Starting TIME Leaderboard initialization...")
20
+
21
+ # Download/cache results and config from HuggingFace Hub
22
+ RESULTS_ROOT, CONFIG_ROOT = initialize_data()
23
+
24
+ # Get features root (local or HF)
25
+ FEATURES_ROOT = get_features_root()
26
+
27
+ # Get list of all models from results directory
28
+ ALL_MODELS = []
29
+ if RESULTS_ROOT.exists():
30
+ ALL_MODELS = [p.name for p in RESULTS_ROOT.iterdir() if p.is_dir()]
31
+ print(f"📊 Found {len(ALL_MODELS)} models: {ALL_MODELS}")
32
+
33
+ # ---------------------------------------------------
34
+ # Get dataset choices from TIME results (with smart display names)
35
+ DATASET_CHOICES, DATASET_DISPLAY_TO_ID, DATASET_ID_TO_DISPLAY = get_dataset_choices(str(RESULTS_ROOT))
36
+ print(f"📁 Found {len(DATASET_CHOICES)} dataset configurations")
37
+
38
+ # === Load data once at startup ===
39
+ DATASETS_DF = get_all_datasets_results(root_dir=str(RESULTS_ROOT))
40
+ if not DATASETS_DF.empty:
41
+ # Use dataset_id (dataset/freq) for ranking to correctly handle multi-freq datasets
42
+ DATASETS_DF = compute_ranks(DATASETS_DF, groupby_cols=['dataset_id', "horizon"]) # Rows: 每一行是1个独立的实验 num_model x num_dataset_id x num_horizons
43
+ print(f"✅ Loaded {len(DATASETS_DF)} dataset results")
44
+
45
+ # === Load variate-level results for pattern-based leaderboard ===
46
+ print("📊 Loading variate-level results...")
47
+ VARIATES_DF = get_all_variates_results(root_dir=str(RESULTS_ROOT))
48
+ if not VARIATES_DF.empty:
49
+ # Compute ranks per (dataset_id, series_name, variate_name, horizon)
50
+ VARIATES_DF = compute_ranks(VARIATES_DF, groupby_cols=['dataset_id', 'series_name', 'variate_name', 'horizon'])
51
+ print(f"✅ Loaded {len(VARIATES_DF)} variate-level results")
52
+ else:
53
+ print("⚠️ No variate-level results found")
54
+
55
+ # === Load features for pattern-based filtering ===
56
+ print("📊 Loading features...")
57
+ FEATURES_DF = load_all_features(features_root=str(FEATURES_ROOT), split="test")
58
+ if not FEATURES_DF.empty:
59
+ print(f"✅ Loaded {len(FEATURES_DF)} variate features")
60
+ else:
61
+ print("⚠️ No features found")
62
+
63
+ # Columns to exclude from binarization
64
+ BINARIZE_EXCLUDE = [
65
+ 'dataset_id', 'series_name', 'variate_name', 'unique_id',
66
+ 'mean', 'std', 'length',
67
+ 'period1', 'period2', 'period3',
68
+ 'p_strength1', 'p_strength2', 'p_strength3',
69
+ 'missing_rate',
70
+ # Meta features are already 0/1, handle separately
71
+ 'is_random_walk', 'has_spike_presence',
72
+ ]
73
+
74
+ # Binarize numeric features by median
75
+ FEATURES_BOOL_DF = pd.DataFrame()
76
+ if not FEATURES_DF.empty:
77
+ FEATURES_BOOL_DF = binarize_features(FEATURES_DF, exclude=BINARIZE_EXCLUDE)
78
+ print(f"✅ Binarized features for {len(FEATURES_BOOL_DF)} variates")
79
+
80
+
81
+ if not DATASETS_DF.empty:
82
+ OVERALL_TABLE_COLUMNS = ["model", "MASE", "CRPS", "MASE_rank", "CRPS_rank"]
83
+ else:
84
+ OVERALL_TABLE_COLUMNS = ["model", "MASE", "CRPS"]
85
+
86
+
87
+ ALL_HORIZONS = ['short', 'medium', 'long']
88
+
89
+ # Pattern mapping: UI pattern name -> feature column name
90
+ PATTERN_MAP = {
91
+ # Trend patterns
92
+ "T_strength": "trend_strength",
93
+ "T_linearity": "linearity",
94
+ "T_curvature": "curvature",
95
+ # Seasonal patterns
96
+ "S_strength": "seasonal_strength",
97
+ "S_complexity": "seasonal_entropy",
98
+ "S_corr": "seasonal_corr",
99
+ # Residual patterns
100
+ "R_diff1_ACF1": "e_diff1_acf1",
101
+ "R_ACF1": "e_acf1",
102
+ # Meta patterns
103
+ "stationarity": "is_random_walk", # Note: stationarity = NOT is_random_walk
104
+ "outlier_presence": "has_spike_presence",
105
+ "complexity": "x_entropy", # High entropy = low predictability/high noise
106
+ }
107
+ # ---------------------------------------------------
108
+
109
+
110
+ # Your leaderboard name
111
+ TITLE = """<h1 align="center" id="space-title"> It's TIME</h1>"""
112
+
113
+ # What does your leaderboard evaluate?
114
+ INTRODUCTION_TEXT = """
115
+ TIME introduces a unified benchmark for time series probabilistic forecasting that supports evaluation at **multiple granularities**, ranging from overall performance across datasets to dataset-level, variate-level, and even individual test windows (with visualization). Beyond conventional analysis, the benchmark enables **pattern-driven, cross-dataset benchmarking** by grouping variates with similar temporal features, where patterns are defined based on groups of tsfeatures that capture properties such as trend, seasonality, and stationarity, offering a more systematic understanding of model behavior. For data and results, please refer to 🤗 [dataset](https://huggingface.co/datasets/TIME-benchmark/TIME-1.0/tree/main).
116
+ """
117
+ # An integrated archive further enriches the platform by providing structural tsfeatures and statistical descriptors of all variates,
118
+ # ensuring both comprehensive evaluation and transparent interpretability across diverse forecasting scenarios
119
+ print("✅ TIME Leaderboard initialization complete!")
src/display.egg-info/PKG-INFO ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: display
3
+ Version: 0.0.0
src/display.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ pyproject.toml
3
+ src/about.py
4
+ src/hf_config.py
5
+ src/leaderboard.py
6
+ src/tab.py
7
+ src/utils.py
8
+ src/display/css_html_js.py
9
+ src/display/formatting.py
10
+ src/display/utils.py
11
+ src/display.egg-info/PKG-INFO
12
+ src/display.egg-info/SOURCES.txt
13
+ src/display.egg-info/dependency_links.txt
14
+ src/display.egg-info/top_level.txt
src/display.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
src/display.egg-info/top_level.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ about
2
+ display
3
+ hf_config
4
+ leaderboard
5
+ tab
6
+ utils
src/display/css_html_js.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ custom_css = """
2
+
3
+ /* ========== 响应式布局 ========== */
4
+ /* 移除固定宽度,让Gradio自动适配不同屏幕尺寸 */
5
+ .gradio-container {
6
+ width: 100% !important;
7
+ max-width: 100% !important;
8
+ }
9
+
10
+ /* 主体内容区域自适应 */
11
+ .main, .contain {
12
+ width: 100% !important;
13
+ max-width: 100% !important;
14
+ }
15
+
16
+ /* Tab 内容区域自适应 */
17
+ .tabitem {
18
+ width: 100% !important;
19
+ max-width: 100% !important;
20
+ }
21
+
22
+ /* Plot 组件自适应,但保持最小可读宽度 */
23
+ .js-plotly-plot, .plotly {
24
+ width: 100% !important;
25
+ max-width: 100% !important;
26
+ min-width: 300px !important; /* 保持最小可读宽度 */
27
+ }
28
+
29
+ /* ========== 原有样式 ========== */
30
+ .markdown-text {
31
+ font-size: 20px !important;
32
+ }
33
+
34
+ /* 只影响 Tabs 按钮 */
35
+ #custom-tabs [role="tab"] {
36
+ font-size: 20px;
37
+ }
38
+
39
+ /* ✅ 只影响表格 */
40
+ .custom-table table thead th {
41
+ font-size: 16px;
42
+ font-weight: 600; /* 想要普通就改成 400 */
43
+ text-align: center;
44
+ }
45
+
46
+ .custom-table table tbody td {
47
+ font-size: 14px;
48
+ }
49
+
50
+ /* 响应式表格布局 */
51
+ .custom-table table {
52
+ table-layout: auto; /* 使用自动布局,让表格自适应 */
53
+ width: 100%; /* 占满容器 */
54
+ min-width: 100%; /* 确保至少占满容器 */
55
+ }
56
+
57
+ /* 表格容器允许横向滚动(当内容过宽时) */
58
+ .custom-table {
59
+ overflow-x: auto; /* 当表格内容过宽时,允许横向滚动 */
60
+ width: 100%;
61
+ }
62
+
63
+ /* 为不同列设置合适的宽度(使用相对单位,更灵活) */
64
+ .custom-table table th:nth-child(1),
65
+ .custom-table table td:nth-child(1) {
66
+ min-width: 150px; /* model 列最小宽度 */
67
+ max-width: 250px; /* 最大宽度限制 */
68
+ }
69
+
70
+ /* 指标列(MASE, CRPS, MAE, MSE) */
71
+ .custom-table table th:nth-child(2),
72
+ .custom-table table td:nth-child(2),
73
+ .custom-table table th:nth-child(3),
74
+ .custom-table table td:nth-child(3),
75
+ .custom-table table th:nth-child(4),
76
+ .custom-table table td:nth-child(4),
77
+ .custom-table table th:nth-child(5),
78
+ .custom-table table td:nth-child(5) {
79
+ min-width: 80px; /* 原始指标列最小宽度 */
80
+ max-width: 120px;
81
+ }
82
+
83
+ /* 归一化指标列(MASE_norm, CRPS_norm, MAE_norm, MSE_norm) */
84
+ .custom-table table th:nth-child(6),
85
+ .custom-table table td:nth-child(6),
86
+ .custom-table table th:nth-child(7),
87
+ .custom-table table td:nth-child(7),
88
+ .custom-table table th:nth-child(8),
89
+ .custom-table table td:nth-child(8),
90
+ .custom-table table th:nth-child(9),
91
+ .custom-table table td:nth-child(9) {
92
+ min-width: 100px; /* 归一化指标列最小宽度 */
93
+ max-width: 150px;
94
+ }
95
+
96
+ /* 排名列(MASE_rank, CRPS_rank) */
97
+ .custom-table table th:nth-child(10),
98
+ .custom-table table td:nth-child(10),
99
+ .custom-table table th:nth-child(11),
100
+ .custom-table table td:nth-child(11) {
101
+ min-width: 80px; /* 排名列最小宽度 */
102
+ max-width: 120px;
103
+ }
104
+
105
+
106
+ #archive-table table thead th { font-size: 14px; font-weight: 400}
107
+ #archive-table table {
108
+ table-layout: fixed; /* 强制固定布局 */
109
+ width: 100%; /* 占满容器 */
110
+ }
111
+
112
+ #archive-table table th:nth-child(1),
113
+ #archive-table table td:nth-child(1) {
114
+ width: 160px !important; /* dataset */
115
+ }
116
+
117
+ #archive-table table th:nth-child(2),
118
+ #archive-table table td:nth-child(2) {
119
+ width: 100px !important; /* variate_name */
120
+ }
121
+
122
+ #archive-table table th:nth-child(3),
123
+ #archive-table table td:nth-child(3) {
124
+ width: 60px !important; /* freq */
125
+ }
126
+
127
+ #archive-table table th:nth-child(4),
128
+ #archive-table table td:nth-child(4) {
129
+ width: 100px !important; /* domain */
130
+ }
131
+
132
+ /* 后面的特征列 */
133
+ #archive-table table th:nth-child(n+5),
134
+ #archive-table table td:nth-child(n+5) {
135
+ width: 120px !important;
136
+ }
137
+
138
+
139
+
140
+
141
+ #citation-button span {
142
+ font-size: 14px !important;
143
+ }
144
+
145
+ #citation-button textarea {
146
+ font-size: 16px !important;
147
+ }
148
+
149
+ #citation-button > label > button {
150
+ margin: 6px;
151
+ transform: scale(1.3);
152
+ }
153
+
154
+ #search-bar-table-box > div:first-child {
155
+ background: none;
156
+ border: none;
157
+ }
158
+
159
+ #search-bar {
160
+ padding: 0px;
161
+ }
162
+
163
+ """
164
+
165
+ # ToDO: markdown-text不好使...
166
+ # archive-table table thead th { font-size: 14px; font-weight: 400}
167
+
168
+ # /* 让表格遵守列宽、并能横向滚动 */
169
+ # #
src/display/formatting.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def model_hyperlink(model_link, code_link, model_name):
2
+ if model_link == "":
3
+ return model_name
4
+ # return f'<a target="_blank">{model_name}</a>'
5
+ # return f'<a target="_blank" href="{link}" rel="noopener noreferrer">{model_name}</a>'
6
+ else:
7
+ model_url = f'<a target="_blank" href="{model_link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
8
+ if code_link == "":
9
+ return model_url
10
+ else:
11
+ code_url = f'<a target="_blank" href="{code_link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">code</a>'
12
+ return f"{model_url} ({code_url})"
13
+ # return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a> | ' \
14
+ # f'<a target="_blank" href="https://www.google.com" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}_link2</a>'
15
+
16
+
17
+ def make_clickable_model(model_name):
18
+ link = f"https://huggingface.co/{model_name}"
19
+ return model_hyperlink(link, model_name)
20
+
21
+
22
+ def styled_error(error):
23
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
24
+
25
+
26
+ def styled_warning(warn):
27
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
28
+
29
+
30
+ def styled_message(message):
31
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
32
+
33
+
34
+ def has_no_nan_values(df, columns):
35
+ return df[columns].notna().all(axis=1)
36
+
37
+
38
+ def has_nan_values(df, columns):
39
+ return df[columns].isna().any(axis=1)
src/display/utils.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
+ from src.about import Tasks
7
+
8
+ def fields(raw_class):
9
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
+
11
+
12
+ # These classes are for user facing column names,
13
+ # to avoid having to change them all around the code
14
+ # when a modif is needed
15
+ @dataclass
16
+ class ColumnContent:
17
+ name: str
18
+ type: str
19
+ displayed_by_default: bool
20
+ hidden: bool = False
21
+ never_hidden: bool = False
22
+
23
+ ## Leaderboard columns
24
+ archive_info_dict = []
25
+
26
+ archive_info_dict.append(["dataset", ColumnContent, ColumnContent("dataset", "markdown", True, never_hidden=True)])
27
+ archive_info_dict.append(["unique_id", ColumnContent, ColumnContent("unique_id", "str", True, never_hidden=True)])
28
+ archive_info_dict.append(["freq", ColumnContent, ColumnContent("freq", "str", True, never_hidden=True)])
29
+ archive_info_dict.append(["domain", ColumnContent, ColumnContent("domain", "str", True, never_hidden=True)])
30
+ # Raw features
31
+ archive_info_dict.append(["x_acf1", ColumnContent, ColumnContent("x_acf1", "number", False, False)])
32
+ archive_info_dict.append(["x_acf10", ColumnContent, ColumnContent("x_acf10", "number", False, False)])
33
+ archive_info_dict.append(["lumpiness", ColumnContent, ColumnContent("lumpiness", "number", False, False)])
34
+ archive_info_dict.append(["stability", ColumnContent, ColumnContent("stability", "number", False, False)])
35
+ archive_info_dict.append(["hurst", ColumnContent, ColumnContent("hurst", "number", False, False)])
36
+ archive_info_dict.append(["entropy", ColumnContent, ColumnContent("entropy", "number", False, False)])
37
+ # Trend features
38
+ archive_info_dict.append(["trend", ColumnContent, ColumnContent("trend_strength", "number", False, False)])
39
+ archive_info_dict.append(["trend_crossing_point_ratio", ColumnContent, ColumnContent("trend_xpoint_ratio", "number", False, False)])
40
+ archive_info_dict.append(["trend_stability", ColumnContent, ColumnContent("trend_stability", "number", False, False)])
41
+ archive_info_dict.append(["trend_lumpiness", ColumnContent, ColumnContent("trend_lumpiness", "number", False, False)])
42
+ archive_info_dict.append(["trend_hurst", ColumnContent, ColumnContent("trend_hurst", "number", False, False)])
43
+ archive_info_dict.append(["trend_entropy", ColumnContent, ColumnContent("trend_entropy", "number", False, False)])
44
+ # Seasonal features
45
+ archive_info_dict.append(["e_acf1", ColumnContent, ColumnContent("e_acf1", "number", False, False)])
46
+ archive_info_dict.append(["e_acf10", ColumnContent, ColumnContent("e_acf10", "number", False, False)])
47
+ archive_info_dict.append(["e_entropy", ColumnContent, ColumnContent("e_entropy", "number", False, False)])
48
+ archive_info_dict.append(["e_hurst", ColumnContent, ColumnContent("e_hurst", "number", False, False)])
49
+ archive_info_dict.append(["e_lumpiness", ColumnContent, ColumnContent("e_lumpiness", "number", False, False)])
50
+ archive_info_dict.append(["e_outlier_ratio", ColumnContent, ColumnContent("e_outlier_ratio", "number", False, False)])
51
+ # Remainder features
52
+ archive_info_dict.append(["seasonal_strength", ColumnContent, ColumnContent("seasonal_strength", "number", False, False)])
53
+ archive_info_dict.append(["seasonality_corr", ColumnContent, ColumnContent("seasonality_corr", "number", False, False)])
54
+ archive_info_dict.append(["seasonal_stability", ColumnContent, ColumnContent("seasonal_stability", "number", False, False)])
55
+ archive_info_dict.append(["seasonal_lumpiness", ColumnContent, ColumnContent("seasonal_lumpiness", "number", False, False)])
56
+ archive_info_dict.append(["seasonal_hurst", ColumnContent, ColumnContent("seasonal_hurst", "number", False, False)])
57
+ archive_info_dict.append(["seasonal_entropy", ColumnContent, ColumnContent("seasonal_entropy", "number", False, False)])
58
+ # Statistics
59
+ archive_info_dict.append(["mean", ColumnContent, ColumnContent("mean", "number", False, False)])
60
+ archive_info_dict.append(["std", ColumnContent, ColumnContent("std", "number", False, False)])
61
+ archive_info_dict.append(["missing_rate", ColumnContent, ColumnContent("missing_rate", "number", False, False)])
62
+ archive_info_dict.append(["length", ColumnContent, ColumnContent("length", "number", False, False)])
63
+ archive_info_dict.append(["period1", ColumnContent, ColumnContent("period1", "number", False, False)])
64
+ archive_info_dict.append(["period2", ColumnContent, ColumnContent("period2", "number", False, False)])
65
+ archive_info_dict.append(["period3", ColumnContent, ColumnContent("period3", "number", False, False)])
66
+ archive_info_dict.append(["p_strength1", ColumnContent, ColumnContent("p_strength1", "number", False, False)])
67
+ archive_info_dict.append(["p_strength2", ColumnContent, ColumnContent("p_strength2", "number", False, False)])
68
+ archive_info_dict.append(["p_strength3", ColumnContent, ColumnContent("p_strength3", "number", False, False)])
69
+
70
+ ArchiveInfoColumn = make_dataclass("ArchiveInfoColumn", archive_info_dict, frozen=True)
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
# Column spec for the model-properties table: [field_name, type, ColumnContent default].
model_info_dict = [
    # Always-visible identification columns.
    ["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
    ["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)],
    # Model information (hidden by default).
    ["model_type", ColumnContent, ColumnContent("Type", "str", False, True)],
    ["precision", ColumnContent, ColumnContent("Precision", "str", False, True)],
    ["license", ColumnContent, ColumnContent("Hub License", "str", False, True)],
    ["params", ColumnContent, ColumnContent("#Params (B)", "number", False, True)],
    ["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False, True)],
    ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)],
    ["org", ColumnContent, ColumnContent("Organization", "str", True, hidden=False)],
    ["testdata_leakage", ColumnContent, ColumnContent("TestData Leakage", "str", True, hidden=False)],
]

# We use make_dataclass to dynamically fill the scores from Tasks.
ModelInfoColumn = make_dataclass("ModelInfoColumn", model_info_dict, frozen=True)
96
+
97
+ ## For the queue columns in the submission tab
98
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    # NOTE: attributes are intentionally left unannotated, so @dataclass does
    # NOT treat them as dataclass fields — they remain plain class attributes.
    # Presumably they are read via the project's own `fields()` helper rather
    # than dataclasses.fields (which would return nothing here) — TODO confirm.
    # NOTE(review): weight_type passes the string "Original" where the other
    # columns pass a bool as the third ColumnContent argument — confirm intended.
    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "str", True)
    weight_type = ColumnContent("weight_type", "str", "Original")
    status = ColumnContent("status", "str", True)
106
+
107
+ ## All the model information that we might need
108
@dataclass
class ModelDetails:
    """Display metadata for one model category (used as Enum values below)."""

    name: str
    display_name: str = ""
    symbol: str = ""  # emoji
113
+
114
+
115
class ModelType(Enum):
    """Model categories shown on the leaderboard.

    Each member's value is a ModelDetails carrying the display name and the
    emoji symbol used in the UI.
    """

    PT = ModelDetails(name="🟢 pretrained", symbol="🟢")
    ZT = ModelDetails(name="🔴 zero-shot", symbol="🔴")
    FT = ModelDetails(name="🟣 fine-tuned", symbol="🟣")
    AG = ModelDetails(name="🟡 agentic", symbol="🟡")
    DL = ModelDetails(name="🔷 deep-learning", symbol="🔷")
    ST = ModelDetails(name="🔶 statistical", symbol="🔶")

    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        """Return "<symbol><separator><name>" for UI display."""
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        """Parse a free-form type string (keyword or emoji) into a ModelType.

        Returns ModelType.Unknown when nothing matches.

        Bug fix: the emoji fallbacks now match the symbols declared on the
        members above — previously FT matched "🔶" (ST's symbol), DL matched
        "🟦" (an emoji no member uses), and ST matched "🟣" (FT's symbol), so
        emoji-only inputs were misclassified.
        """
        if "fine-tuned" in type or "🟣" in type:
            return ModelType.FT
        if "pretrained" in type or "🟢" in type:
            return ModelType.PT
        if "zero-shot" in type or "🔴" in type:
            return ModelType.ZT
        if "agentic" in type or "🟡" in type:
            return ModelType.AG
        if "deep-learning" in type or "🔷" in type:
            return ModelType.DL
        if "statistical" in type or "🔶" in type:
            return ModelType.ST
        return ModelType.Unknown
144
+
145
class WeightType(Enum):
    """How a submitted checkpoint's weights relate to the base model."""

    Adapter = ModelDetails("Adapter")
    Original = ModelDetails("Original")
    Delta = ModelDetails("Delta")
149
+
150
class Precision(Enum):
    """Numeric precision of model weights."""

    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(precision):
        """Map a precision string (optionally "torch."-prefixed) to a member.

        Marked @staticmethod for consistency with ModelType.from_str, and so
        the helper also behaves correctly when accessed through an instance
        (previously a plain function, it would have received the member as
        its argument when called on one).
        """
        if precision in ["torch.float16", "float16"]:
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
        return Precision.Unknown
161
+
162
# Column selection
# Model-info columns shown by default (hidden ones excluded).
# NOTE(review): `fields` here is presumably the project's own helper that
# yields the ColumnContent defaults — dataclasses.fields would not expose
# `.hidden` and would see no fields on the unannotated EvalQueueColumn.
MODEL_INFO_COLS = [c.name for c in fields(ModelInfoColumn) if not c.hidden]

# Column names and gradio column types for the evaluation-queue table.
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

# One benchmark column per task.
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
169
+
src/hf_config.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Hub Configuration and Helper Functions
3
+
4
+ This module provides configuration and utilities for loading data from HuggingFace Hub.
5
+ The data is cached locally after first download, so subsequent accesses are fast.
6
+ """
7
+
8
+ import os
9
+ from functools import lru_cache
10
+ from pathlib import Path
11
+
12
+ from huggingface_hub import snapshot_download
13
+
14
+ # =============================================================================
15
+ # Configuration
16
+ # =============================================================================
17
+
18
+ # HuggingFace Dataset repository ID
19
+ HF_REPO_ID = os.environ.get("HF_REPO_ID", "TIME-benchmark/TIME-1.0")
20
+
21
+ # HuggingFace token (set via environment variable for security)
22
+ # In HuggingFace Space, set this in Settings -> Repository secrets
23
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
24
+
25
+ # Whether to use HuggingFace Hub (True) or local files (False)
26
+ # Set to False for local development with local data
27
+ USE_HF_HUB = os.environ.get("USE_HF_HUB", "true").lower() == "true"
28
+
29
+ # Local cache directory for HF Hub downloads
30
+ HF_CACHE_DIR = os.environ.get("HF_CACHE_DIR", None) # None uses default ~/.cache/huggingface
31
+
32
+ # Local data paths (used when USE_HF_HUB=false)
33
+ # Set these environment variables to specify custom local paths
34
+ LOCAL_RESULTS_PATH = os.environ.get("LOCAL_RESULTS_PATH", None) # Path to output/results
35
+ LOCAL_FEATURES_PATH = os.environ.get("LOCAL_FEATURES_PATH", None) # Path to output/features
36
+ LOCAL_CONFIG_PATH = os.environ.get("LOCAL_CONFIG_PATH", None) # Path to config directory
37
+ LOCAL_DATASETS_PATH = os.environ.get("LOCAL_DATASETS_PATH", None) # Path to data/hf_dataset
38
+
39
+ # =============================================================================
40
+ # Helper Functions
41
+ # =============================================================================
42
+
43
@lru_cache(maxsize=1)
def download_results_snapshot() -> Path:
    """Return the local path of the results directory.

    With USE_HF_HUB enabled, the `output/results` subtree of HF_REPO_ID is
    downloaded once (lru_cache plus the hub's disk cache) and the cached
    path is returned. Otherwise a local development path is resolved.

    Returns:
        Path: Local path to the results directory.
    """
    if not USE_HF_HUB:
        # Local development resolution order: explicit env override first,
        # then the relative path, then a hard-coded machine-specific fallback.
        if LOCAL_RESULTS_PATH:
            results_dir = Path(LOCAL_RESULTS_PATH)
        else:
            results_dir = Path("../output/results")
            if not results_dir.exists():
                results_dir = Path("/home/eee/qzz/TIME/output/results")
        if not results_dir.exists():
            print(f"⚠️ Warning: Local results path does not exist: {results_dir}")
        return results_dir

    print(f"📥 Downloading results from HuggingFace Hub: {HF_REPO_ID}")

    snapshot_root = snapshot_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        allow_patterns=["output/results/**"],
        cache_dir=HF_CACHE_DIR,
    )

    results_path = Path(snapshot_root) / "output" / "results"
    print(f"✅ Results cached at: {results_path}")
    return results_path
78
+
79
+
80
@lru_cache(maxsize=1)
def download_datasets_snapshot() -> Path:
    """Return the local path of the hf_dataset directory.

    With USE_HF_HUB enabled, the `data/hf_dataset` subtree of HF_REPO_ID is
    downloaded once (lru_cache plus the hub's disk cache) and the cached
    path is returned. Otherwise a local development path is resolved.

    Returns:
        Path: Local path to the hf_dataset directory.
    """
    if not USE_HF_HUB:
        # Local development resolution order: explicit env override first,
        # then the relative path, then a hard-coded machine-specific fallback.
        if LOCAL_DATASETS_PATH:
            datasets_dir = Path(LOCAL_DATASETS_PATH)
        else:
            datasets_dir = Path("../data/hf_dataset")
            if not datasets_dir.exists():
                datasets_dir = Path("/home/eee/qzz/TIME/data/hf_dataset")
        if not datasets_dir.exists():
            print(f"⚠️ Warning: Local datasets path does not exist: {datasets_dir}")
        return datasets_dir

    print(f"📥 Downloading datasets from HuggingFace Hub: {HF_REPO_ID}")

    snapshot_root = snapshot_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        allow_patterns=["data/hf_dataset/**"],
        cache_dir=HF_CACHE_DIR,
    )

    datasets_path = Path(snapshot_root) / "data" / "hf_dataset"
    print(f"✅ Datasets cached at: {datasets_path}")
    return datasets_path
115
+
116
+
117
def download_config_snapshot() -> Path:
    """
    Get the config directory containing datasets.yaml.

    The config is bundled with the timebench package, so no download is
    needed when the package is installed. Resolution order:
      1. Config shipped inside the installed `timebench` package.
      2. LOCAL_CONFIG_PATH environment variable.
      3. ../config, then the hard-coded development fallback.

    Returns:
        Path: Local path to the config directory.

    Raises:
        FileNotFoundError: If no config directory can be located.
    """
    # Preferred source: config bundled with the installed timebench package.
    try:
        from timebench.evaluation.data import DEFAULT_CONFIG_PATH
        config_path = DEFAULT_CONFIG_PATH.parent  # Get the config directory
        if config_path.exists():
            return config_path
    except ImportError as err:
        # Bug fix: previously this printed the ImportError *class* object
        # (f"{ImportError}") instead of the caught exception instance, so the
        # log never showed why the import failed.
        print(f"❌ ImportError: {err}, using local config")

    # Fallback: local development paths.
    # Priority: 1) LOCAL_CONFIG_PATH env var, 2) ../config, 3) hard-coded path
    if LOCAL_CONFIG_PATH:
        local_path = Path(LOCAL_CONFIG_PATH)
    else:
        local_path = Path("../config")
        if not local_path.exists():
            local_path = Path("/home/eee/qzz/TIME/config")

    if local_path.exists():
        print(f"📁 Using local config: {local_path}")
        return local_path

    raise FileNotFoundError(
        "Config directory not found. Please ensure timebench is installed, "
        "set USE_HF_HUB=false for local development, "
        "or set LOCAL_CONFIG_PATH environment variable to point to your config directory."
    )
156
+
157
+
158
def get_results_root() -> Path:
    """Get the root path for results (handles both HF Hub and local)."""
    # Thin public alias; the underlying call is memoized via lru_cache.
    return download_results_snapshot()
161
+
162
+
163
def get_datasets_root() -> Path:
    """Get the root path for hf_dataset (handles both HF Hub and local)."""
    # Thin public alias; the underlying call is memoized via lru_cache.
    return download_datasets_snapshot()
166
+
167
+
168
def get_config_root() -> Path:
    """Get the root path for config (handles both HF Hub and local)."""
    # Thin public alias; resolution happens in download_config_snapshot.
    return download_config_snapshot()
171
+
172
+
173
def get_features_root() -> Path:
    """
    Get the root path for features (handles both HF Hub and local).

    Features are stored at output/features/{dataset}/{freq}/test.csv, in the
    same repository as the results.

    NOTE(review): unlike the results/datasets snapshots this function is not
    wrapped in lru_cache, so the hub check runs on every call (the hub's own
    disk cache keeps repeats cheap) — confirm this is intended.

    Returns:
        Path: Local path to the features directory.
    """
    if not USE_HF_HUB:
        # Local development resolution order: explicit env override first,
        # then the relative path, then a hard-coded machine-specific fallback.
        if LOCAL_FEATURES_PATH:
            features_dir = Path(LOCAL_FEATURES_PATH)
        else:
            features_dir = Path("../output/features")
            if not features_dir.exists():
                features_dir = Path("/home/eee/qzz/TIME/output/features")
        if not features_dir.exists():
            print(f"⚠️ Warning: Local features path does not exist: {features_dir}")
        return features_dir

    # For HF Hub, features are in the same repo as results
    print(f"📥 Downloading features from HuggingFace Hub: {HF_REPO_ID}")

    snapshot_root = snapshot_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
        allow_patterns=["output/features/**"],
        cache_dir=HF_CACHE_DIR,
    )

    features_path = Path(snapshot_root) / "output" / "features"
    print(f"✅ Features cached at: {features_path}")
    return features_path
209
+
210
+
211
def clear_cache():
    """Clear the LRU cache to force re-download on next access."""
    # Only the results and datasets snapshots carry @lru_cache; config and
    # features resolution have no in-process cache to clear.
    download_results_snapshot.cache_clear()
    download_datasets_snapshot.cache_clear()
215
+
216
+
217
+ # =============================================================================
218
+ # Initialization - Download data at module import
219
+ # =============================================================================
220
+
221
def initialize_data():
    """
    Initialize by downloading all necessary data.
    Call this at app startup to pre-download data.

    Returns:
        tuple[Path, Path]: (results_root, config_root).
    """
    print("🚀 Initializing TIME Leaderboard data...")

    # Results are mandatory for every leaderboard table.
    results_root = get_results_root()
    print(f" Results: {results_root}")

    # Config is required to resolve per-dataset settings.
    config_root = get_config_root()
    print(f" Config: {config_root}")

    # Datasets themselves are fetched lazily when visualization needs them,
    # which keeps the initial load fast.

    print("✅ Initialization complete!")
    return results_root, config_root
241
+
src/leaderboard.py ADDED
@@ -0,0 +1,1085 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import pandas as pd
4
+ import numpy as np
5
+ import json
6
+ import gradio as gr
7
+ from typing import List, Tuple, Optional
8
+ from scipy import stats
9
+ from src.about import DATASETS_DF, OVERALL_TABLE_COLUMNS, ALL_MODELS, RESULTS_ROOT, DATASET_DISPLAY_TO_ID
10
+ from src.hf_config import get_datasets_root, get_config_root
11
+ from src.utils import normalize_by_seasonal_naive
12
+ # FEATURES_DF, FEATURES_BOOL_DF, VARIATES_DF VARIATE_COLUMNS
13
+ import ast
14
+ from timebench.evaluation.data import Dataset, get_dataset_settings, load_dataset_config
15
+ from pathlib import Path
16
+
17
def resolve_dataset_id(display_name: str) -> str:
    """
    Convert a display name to dataset_id.

    Args:
        display_name: Either a display_name from UI or a dataset_id directly

    Returns:
        dataset_id in format "dataset/freq"
    """
    # Known display names resolve through the mapping table; anything else is
    # assumed to already be a dataset_id and is passed through unchanged.
    return DATASET_DISPLAY_TO_ID.get(display_name, display_name)
32
+
33
+
34
def find_dataset_term_path(results_root, model_name, display_name):
    """
    Find the dataset_term path ("dataset/freq") for a model's results.

    Path structure: results/{model_name}/{dataset}/{freq}/{horizon}/

    Args:
        results_root: Root directory for results
        model_name: Model name
        display_name: UI display name ("Traffic" or "Traffic/15T") or a
            dataset_id directly

    Returns:
        "dataset/freq" string (e.g. "Traffic/15T"), or None when no matching
        results directory exists.
    """
    model_dir = os.path.join(results_root, model_name)
    if not os.path.exists(model_dir):
        return None

    # Resolve UI display name to a dataset_id.
    dataset_id = resolve_dataset_id(display_name)

    def _has_any_horizon(freq_path):
        # A freq directory counts as valid results when at least one horizon
        # subdirectory contains a config.json.
        return any(
            os.path.exists(os.path.join(freq_path, h, "config.json"))
            for h in ("short", "medium", "long")
        )

    if "/" in dataset_id:
        # Fully-qualified id ("dataset/freq"): check the directory directly.
        dataset_name, freq = dataset_id.split("/", 1)
        freq_path = os.path.join(model_dir, dataset_name, freq)
        if os.path.isdir(freq_path) and _has_any_horizon(freq_path):
            return dataset_id
        return None

    # Legacy fallback: bare dataset name — return the first freq with results.
    for entry in os.listdir(model_dir):
        dataset_dir = os.path.join(model_dir, entry)
        if not os.path.isdir(dataset_dir) or entry != dataset_id:
            continue
        for freq_dir in os.listdir(dataset_dir):
            freq_path = os.path.join(dataset_dir, freq_dir)
            if os.path.isdir(freq_path) and _has_any_horizon(freq_path):
                return f"{entry}/{freq_dir}"

    return None
86
+
87
def load_test_windows(display_name, horizon, model_name="moirai_small", series=None, variate=None, window_id=None, parse_series=False):
    """
    Load test window results from TIME NPZ files.

    Args:
        display_name: Dataset display name from UI (will be converted to dataset_id)
        horizon: Horizon name (short, medium, long)
        model_name: Model name
        series: Optional series name (string) or index (int/string) to filter
        variate: Optional variate name (string) or index (int/string) to filter
        window_id: Optional window_id to filter
        parse_series: If True, include label and quantile predictions as lists
            (currently a no-op — see TODO in the body)

    Returns:
        pd.DataFrame with columns: series_name, variate_name, window_id, MASE, CRPS, MAE, MSE,
        model, series_idx, variate_idx; or None when the dataset/model has no results
        or every row was filtered out.
    """
    results_root = str(RESULTS_ROOT)

    # Find the dataset_term directory (handles display_name -> dataset_id conversion)
    dataset_term = find_dataset_term_path(results_root, model_name, display_name)
    if dataset_term is None:
        return None

    horizon_dir = os.path.join(results_root, model_name, dataset_term, horizon)
    metrics_path = os.path.join(horizon_dir, "metrics.npz")

    # Load the per-window metric arrays.
    metrics = np.load(metrics_path)

    # All metric arrays share this shape; MASE is used to read it.
    mase_arr = metrics["MASE"]  # (num_series, num_windows, num_variates)
    num_series, num_windows, num_variates = mase_arr.shape

    # Load Dataset to get actual names; on any failure we fall back to
    # stringified indices below.
    series_names = None
    variate_names = None
    try:
        # Use HF config to get dataset root (handles both local and HF Hub)
        hf_dataset_root = str(get_datasets_root())

        if os.path.exists(hf_dataset_root):
            # Use HF config to get config root
            config_root = get_config_root()
            config_path = config_root / "datasets.yaml"

            config = load_dataset_config(config_path) if config_path.exists() else {}
            settings = get_dataset_settings(dataset_term, horizon, config)

            prediction_length = settings.get("prediction_length")
            test_length = settings.get("test_length")

            # Load dataset with storage_path parameter
            dataset_obj = Dataset(
                name=dataset_term,
                term=horizon,
                prediction_length=prediction_length,
                test_length=test_length,
                storage_path=hf_dataset_root,
            )

            # Get series names (item_id); fall back to per-row lookup when the
            # column is absent.
            if "item_id" in dataset_obj.hf_dataset.column_names:
                series_names = dataset_obj.hf_dataset["item_id"]
            else:
                series_names = [dataset_obj.hf_dataset[i].get("item_id", f"item_{i}")
                                for i in range(len(dataset_obj.hf_dataset))]

            # Get variate names
            variate_names = dataset_obj.get_variate_names()
            if variate_names is None:
                # Univariate mode: variate names are same as series names
                variate_names = series_names
    except Exception as e:
        print(f"Error loading Dataset for names: {e}")

    # Create name to index mappings
    series_name_to_idx = {}
    variate_name_to_idx = {}
    if series_names is not None:
        series_name_to_idx = {name: idx for idx, name in enumerate(series_names)}
    if variate_names is not None:
        variate_name_to_idx = {name: idx for idx, name in enumerate(variate_names)}

    # Convert series and variate filters to indices: names take precedence,
    # then a numeric string/int; anything else leaves the filter disabled.
    series_idx_filter = None
    if series is not None:
        if series in series_name_to_idx:
            series_idx_filter = series_name_to_idx[series]
        else:
            # Try as index
            try:
                series_idx_filter = int(series)
            except ValueError:
                pass

    variate_idx_filter = None
    if variate is not None:
        if variate in variate_name_to_idx:
            variate_idx_filter = variate_name_to_idx[variate]
        else:
            # Try as index
            try:
                variate_idx_filter = int(variate)
            except ValueError:
                pass

    # Build DataFrame row by row
    rows = []
    for series_idx in range(num_series):
        # Filter by series if specified
        if series_idx_filter is not None:
            if series_idx != series_idx_filter:
                continue

        series_name = series_names[series_idx] if series_names is not None else str(series_idx)

        for window_idx in range(num_windows):
            # Filter by window_id if specified
            if window_id is not None:
                if window_idx != int(window_id):
                    continue

            for variate_idx in range(num_variates):
                # Filter by variate if specified
                if variate_idx_filter is not None:
                    if variate_idx != variate_idx_filter:
                        continue

                variate_name = variate_names[variate_idx] if variate_names is not None else str(variate_idx)

                row = {
                    "series_name": series_name,
                    "variate_name": variate_name,
                    "window_id": window_idx,
                    "MASE": float(mase_arr[series_idx, window_idx, variate_idx]),
                    "CRPS": float(metrics["CRPS"][series_idx, window_idx, variate_idx]),
                    "MAE": float(metrics["MAE"][series_idx, window_idx, variate_idx]),
                    "MSE": float(metrics["MSE"][series_idx, window_idx, variate_idx]),
                    "model": model_name,
                    "series_idx": series_idx,  # Keep for reference
                    "variate_idx": variate_idx,  # Keep for reference
                }

                # Add label and quantiles if requested
                if parse_series:
                    # Get ground truth from predictions (we need to load it separately)
                    # TIME doesn't save ground_truth in predictions.npz, so we skip for now
                    # TODO: Need to handle ground truth loading
                    pass

                rows.append(row)

    if not rows:
        return None

    df = pd.DataFrame(rows)

    # If parse_series, we would need ground truth data
    # For now, return without series data
    return df
249
+
250
def get_overall_leaderboard(df_datasets: pd.DataFrame, metric: str = "MASE") -> pd.DataFrame:
    """
    Compute overall leaderboard across datasets by normalizing metrics by Seasonal Naive
    and aggregating with geometric mean.

    Args:
        df_datasets (pd.DataFrame): Dataset-level results, must include
            ["model", "dataset_id", "horizon", "MASE", "CRPS", "MASE_rank", "CRPS_rank"].
        metric (str): Metric to use for sorting. Defaults to "MASE".

    Returns:
        pd.DataFrame: Leaderboard with:
            - MASE (norm.), CRPS (norm.): Geometric mean of Seasonal Naive-normalized values
            - MASE_rank, CRPS_rank: Average rank across configurations (from original data)
            Sorted by the chosen metric (ascending — lower is better).
    """
    # Empty input or an unknown sort metric yields an empty leaderboard.
    if df_datasets.empty:
        return pd.DataFrame(columns=OVERALL_TABLE_COLUMNS)

    if metric not in df_datasets.columns:
        return pd.DataFrame(columns=OVERALL_TABLE_COLUMNS)

    # Step 1: Normalize MASE and CRPS by Seasonal Naive per (dataset_id, horizon)
    df_normalized = normalize_by_seasonal_naive(
        df_datasets,
        baseline_model="seasonal_naive",
        metrics=["MASE", "CRPS"],
        groupby_cols=["dataset_id", "horizon"],
    )

    if df_normalized.empty:
        # Fall back to original behavior if normalization fails: arithmetic
        # mean over raw metrics, still presented under the "(norm.)" names.
        print("[get_overall_leaderboard] Warning: normalization failed, using arithmetic mean")
        leaderboard = (
            df_datasets.groupby(["model"])
            .mean(numeric_only=True)
            .reset_index()
        )
        # Rename columns: MASE -> MASE (norm.), CRPS -> CRPS (norm.)
        if "MASE" in leaderboard.columns:
            leaderboard = leaderboard.rename(columns={"MASE": "MASE (norm.)"})
        if "CRPS" in leaderboard.columns:
            leaderboard = leaderboard.rename(columns={"CRPS": "CRPS (norm.)"})

        # Adjust metric name for sorting
        sort_metric = metric
        if metric == "MASE":
            sort_metric = "MASE (norm.)"
        elif metric == "CRPS":
            sort_metric = "CRPS (norm.)"

        if sort_metric in leaderboard.columns:
            leaderboard = leaderboard.sort_values(by=sort_metric, ascending=True).reset_index(drop=True)
        else:
            leaderboard = leaderboard.sort_values(by=metric, ascending=True).reset_index(drop=True)

        # Define column order
        col_order = ["model", "MASE (norm.)", "CRPS (norm.)", "MASE_rank", "CRPS_rank"]
        col_order = [col for col in col_order if col in leaderboard.columns]
        leaderboard = leaderboard[col_order]
        leaderboard = leaderboard.round(3)
        return leaderboard

    # Step 2: Aggregate normalized MASE and CRPS with geometric mean
    # Filter out NaN values for geometric mean computation
    def gmean_with_nan(x):
        """Compute geometric mean, ignoring NaN values."""
        valid = x.dropna()
        if len(valid) == 0:
            return np.nan
        return stats.gmean(valid)

    normalized_metrics = (
        df_normalized.groupby("model")[["MASE", "CRPS"]]
        .agg(gmean_with_nan)
        .reset_index()
    )

    # Rename columns: MASE -> MASE (norm.), CRPS -> CRPS (norm.)
    normalized_metrics = normalized_metrics.rename(columns={
        "MASE": "MASE (norm.)",
        "CRPS": "CRPS (norm.)"
    })

    # Step 3: Compute average ranks from original data (pre-normalized)
    # Ranks should be computed on original metrics, which is already done in about.py
    if "MASE_rank" in df_datasets.columns and "CRPS_rank" in df_datasets.columns:
        # Use the same configurations that were used in normalization
        # (only those with Seasonal Naive baseline) so the rank average covers
        # exactly the tasks that contributed to the normalized metrics.
        df_with_baseline = df_datasets[
            df_datasets.set_index(["dataset_id", "horizon"]).index.isin(
                df_normalized.set_index(["dataset_id", "horizon"]).index.unique()
            )
        ]
        avg_ranks = (
            df_with_baseline.groupby("model")[["MASE_rank", "CRPS_rank"]]
            .mean()
            .reset_index()
        )
        # Merge normalized metrics with average ranks
        leaderboard = normalized_metrics.merge(avg_ranks, on="model", how="left")
    else:
        leaderboard = normalized_metrics

    # Step 4: Sort by chosen metric (adjust metric name if needed)
    sort_metric = metric
    if metric == "MASE":
        sort_metric = "MASE (norm.)"
    elif metric == "CRPS":
        sort_metric = "CRPS (norm.)"

    if sort_metric in leaderboard.columns:
        leaderboard = leaderboard.sort_values(by=sort_metric, ascending=True).reset_index(drop=True)
    else:
        # Fallback: sort by first available metric
        leaderboard = leaderboard.sort_values(by=leaderboard.columns[1], ascending=True).reset_index(drop=True)

    # Step 5: Select and order columns
    col_order = ["model", "MASE (norm.)", "CRPS (norm.)", "MASE_rank", "CRPS_rank"]
    col_order = [col for col in col_order if col in leaderboard.columns]
    leaderboard = leaderboard[col_order]
    leaderboard = leaderboard.round(3)

    return leaderboard
374
+
375
+
376
+ def get_dataset_leaderboard(
377
+ display_name: str,
378
+ horizons: List[str],
379
+ metric: str = "MASE"
380
+ ) -> Tuple[str, pd.DataFrame]:
381
+ """
382
+ Return leaderboard for a specific dataset, averaged over the specified horizons.
383
+
384
+ Returns both original metrics and Seasonal Naive-normalized metrics in a single table.
385
+
386
+ Args:
387
+ display_name (str): The dataset display name selected by the user (from UI dropdown).
388
+ Will be converted to dataset_id for filtering.
389
+ horizons (List[str]): List of horizons to include (e.g., ["short", "medium"]).
390
+ If None, all horizons are used.
391
+ metric (str): The metric used for sorting. Defaults to "MASE".
392
+
393
+ Returns:
394
+ tuple:
395
+ str: A message string to display in the UI ("" if no error).
396
+ pd.DataFrame: Dataframe containing leaderboard with columns:
397
+ - model
398
+ - MASE, CRPS, MAE, MSE (original, arithmetic mean)
399
+ - MASE_norm, CRPS_norm, MAE_norm, MSE_norm (normalized, geometric mean)
400
+ - MASE_rank, CRPS_rank (average of per-task ranks)
401
+ """
402
+ if DATASETS_DF.empty:
403
+ return "No dataset results are available. Please check your results folder.", pd.DataFrame(columns=["model"])
404
+
405
+ # Convert display_name to dataset_id for filtering
406
+ dataset_id = resolve_dataset_id(display_name)
407
+
408
+ # Filter by dataset_id
409
+ df_filtered = DATASETS_DF[DATASETS_DF["dataset_id"] == dataset_id].copy()
410
+ if df_filtered.empty:
411
+ return f"No results found for dataset '{display_name}'.", pd.DataFrame(columns=["model"])
412
+
413
+ # Filter by horizon
414
+ if horizons is None or len(horizons) == 0:
415
+ horizons = df_filtered["horizon"].unique().tolist()
416
+ df_filtered = df_filtered[df_filtered["horizon"].isin(horizons)]
417
+ if df_filtered.empty:
418
+ return f"No results found for dataset '{display_name}' with horizons {horizons}.", pd.DataFrame(columns=["model"])
419
+
420
+ # Get dataset information (series count, variate count, freq)
421
+ dataset_info_msg = ""
422
+ try:
423
+ # Parse dataset_id to get freq
424
+ if "/" in dataset_id:
425
+ _, freq = dataset_id.split("/", 1)
426
+ else:
427
+ freq = "unknown"
428
+
429
+ # Load Dataset to get series and variate counts
430
+ hf_dataset_root = str(get_datasets_root())
431
+ config_root = get_config_root()
432
+ config_path = config_root / "datasets.yaml"
433
+
434
+ if os.path.exists(hf_dataset_root) and config_path.exists():
435
+ config = load_dataset_config(config_path)
436
+ settings = get_dataset_settings(dataset_id, horizons[0] if horizons else "short", config)
437
+
438
+ dataset_obj = Dataset(
439
+ name=dataset_id,
440
+ term=horizons[0] if horizons else "short",
441
+ prediction_length=settings.get("prediction_length"),
442
+ test_length=settings.get("test_length"),
443
+ storage_path=hf_dataset_root,
444
+ )
445
+
446
+ # Get series count
447
+ if "item_id" in dataset_obj.hf_dataset.column_names:
448
+ series_names = dataset_obj.hf_dataset["item_id"]
449
+ # Convert to list if it's an array/Series to avoid ambiguity in boolean check
450
+ if isinstance(series_names, (np.ndarray, pd.Series)):
451
+ series_names = list(series_names)
452
+ elif not isinstance(series_names, list):
453
+ series_names = list(series_names) if hasattr(series_names, '__iter__') else [series_names]
454
+ # Use len() check instead of boolean check to avoid ambiguity
455
+ if len(series_names) > 0:
456
+ num_series = len(set(series_names))
457
+ else:
458
+ num_series = len(dataset_obj.hf_dataset)
459
+ else:
460
+ num_series = len(dataset_obj.hf_dataset)
461
+
462
+ # Get variate count
463
+ variate_names = dataset_obj.get_variate_names()
464
+ if variate_names is not None:
465
+ num_variates = len(variate_names)
466
+ else:
467
+ # UTS: each series is one variate
468
+ num_variates = 1
469
+
470
+ dataset_info_msg = f"📊 Dataset Info: {num_series} series, {num_variates} variates, freq={freq}"
471
+ except Exception as e:
472
+ print(f"Error getting dataset info: {e}")
473
+ # If we can't get info, try to extract freq from dataset_id
474
+ if "/" in dataset_id:
475
+ _, freq = dataset_id.split("/", 1)
476
+ dataset_info_msg = f"📊 Dataset Info: freq={freq}"
477
+
478
+ metrics_list = ["MASE", "CRPS", "MAE", "MSE"]
479
+
480
+ # === Step 1: Compute original metrics (arithmetic mean) ===
481
+ original_df = (
482
+ df_filtered.groupby("model")[metrics_list]
483
+ .mean()
484
+ .reset_index()
485
+ )
486
+
487
+ # === Step 2: Compute normalized metrics (geometric mean of Seasonal Naive-normalized) ===
488
+ df_normalized = normalize_by_seasonal_naive(
489
+ df_filtered,
490
+ baseline_model="seasonal_naive",
491
+ metrics=metrics_list,
492
+ groupby_cols=["dataset_id", "horizon"],
493
+ )
494
+
495
+ # Helper function for geometric mean with NaN handling
496
+ def gmean_with_nan(x):
497
+ valid = x.dropna()
498
+ if len(valid) == 0:
499
+ return np.nan
500
+ return stats.gmean(valid)
501
+
502
+ if not df_normalized.empty:
503
+ normalized_df = (
504
+ df_normalized.groupby("model")[metrics_list]
505
+ .agg(gmean_with_nan)
506
+ .reset_index()
507
+ )
508
+ # Rename columns to * (norm.)
509
+ normalized_df = normalized_df.rename(columns={
510
+ "MASE": "MASE (norm.)",
511
+ "CRPS": "CRPS (norm.)",
512
+ "MAE": "MAE (norm.)",
513
+ "MSE": "MSE (norm.)",
514
+ })
515
+ else:
516
+ # If normalization fails, create empty normalized columns
517
+ normalized_df = original_df[["model"]].copy()
518
+ for col in ["MASE (norm.)", "CRPS (norm.)", "MAE (norm.)", "MSE (norm.)"]:
519
+ normalized_df[col] = np.nan
520
+
521
+ # Rename original columns to * (raw)
522
+ original_df = original_df.rename(columns={
523
+ "MASE": "MASE (raw)",
524
+ "CRPS": "CRPS (raw)",
525
+ "MAE": "MAE (raw)",
526
+ "MSE": "MSE (raw)",
527
+ })
528
+
529
+ # === Step 3: Compute average ranks from pre-computed per-task ranks ===
530
+ if "MASE_rank" in df_filtered.columns and "CRPS_rank" in df_filtered.columns:
531
+ # Use only configurations that have Seasonal Naive baseline (for consistency)
532
+ if not df_normalized.empty:
533
+ df_with_baseline = df_filtered[
534
+ df_filtered.set_index(["dataset_id", "horizon"]).index.isin(
535
+ df_normalized.set_index(["dataset_id", "horizon"]).index.unique()
536
+ )
537
+ ]
538
+ else:
539
+ df_with_baseline = df_filtered
540
+
541
+ ranks_df = (
542
+ df_with_baseline.groupby("model")[["MASE_rank", "CRPS_rank"]]
543
+ .mean()
544
+ .reset_index()
545
+ )
546
+ else:
547
+ ranks_df = original_df[["model"]].copy()
548
+ ranks_df["MASE_rank"] = np.nan
549
+ ranks_df["CRPS_rank"] = np.nan
550
+
551
+ # === Step 4: Combine all into one DataFrame ===
552
+ agg_df = original_df.merge(normalized_df, on="model", how="left")
553
+ agg_df = agg_df.merge(ranks_df, on="model", how="left")
554
+
555
+ # Sort by MASE (norm.) as requested
556
+ if "MASE (norm.)" in agg_df.columns:
557
+ agg_df = agg_df.sort_values(by="MASE (norm.)", ascending=True).reset_index(drop=True)
558
+ elif "MASE (raw)" in agg_df.columns:
559
+ # Fallback to MASE (raw) if normalized version not available
560
+ agg_df = agg_df.sort_values(by="MASE (raw)", ascending=True).reset_index(drop=True)
561
+ else:
562
+ # Final fallback: sort by first available metric column
563
+ if len(agg_df.columns) > 1:
564
+ agg_df = agg_df.sort_values(by=agg_df.columns[1], ascending=True).reset_index(drop=True)
565
+
566
+ # Define column order: model, * (norm.), * (raw), *_rank
567
+ cols_order = ["model",
568
+ "MASE (norm.)", "CRPS (norm.)", "MAE (norm.)", "MSE (norm.)",
569
+ "MASE (raw)", "CRPS (raw)", "MAE (raw)", "MSE (raw)",
570
+ "MASE_rank", "CRPS_rank"]
571
+ cols_to_return = [col for col in cols_order if col in agg_df.columns]
572
+
573
+ agg_df = agg_df[cols_to_return].round(3)
574
+
575
+ return dataset_info_msg, agg_df
576
+
577
+
578
+ def get_dataset_multilevel_leaderboard(display_name, series, variate, horizons, metric: str = "MASE"):
579
+ """
580
+ Get leaderboard based on dataset, series, and variate selections.
581
+
582
+ Logic:
583
+ 0. If only dataset selected (series="---", variate="---"): return dataset-level results
584
+ with both original and normalized metrics
585
+ 1. If series/variate selected: return only original metrics (MASE, CRPS, MAE, MSE)
586
+
587
+ Args:
588
+ display_name: Dataset display name from UI (will be converted to dataset_id)
589
+ series: Series name or "---" if not selected
590
+ variate: Variate name or "---" if not selected
591
+ horizons: List of horizons to include
592
+ metric: Metric for sorting
593
+
594
+ Returns:
595
+ tuple: (message, DataFrame)
596
+ """
597
+ # Case 0: Only dataset selected - return both original and normalized metrics
598
+ if (series is None or series == "---" or series == "") and (variate is None or variate == "---" or variate == ""):
599
+ return get_dataset_leaderboard(display_name, horizons, metric)
600
+
601
+ # Case 1: Series/Variate selected - return only original metrics
602
+ # Determine if dataset is UTS or MTS by checking if variate dropdown is enabled
603
+ results_root = str(RESULTS_ROOT)
604
+ model_name = ALL_MODELS[0]
605
+
606
+ dataset_term = find_dataset_term_path(results_root, model_name, display_name)
607
+ if dataset_term is None:
608
+ return f"Dataset '{display_name}' not found.", pd.DataFrame(columns=["model", "MASE", "CRPS", "MAE", "MSE"])
609
+
610
+ # Check if dataset is UTS or MTS
611
+ is_uts = False
612
+ try:
613
+ hf_dataset_root = str(get_datasets_root())
614
+
615
+ if os.path.exists(hf_dataset_root):
616
+ config_root = get_config_root()
617
+ config_path = config_root / "datasets.yaml"
618
+
619
+ config = load_dataset_config(config_path) if config_path.exists() else {}
620
+ settings = get_dataset_settings(dataset_term, horizons[0] if horizons else "short", config)
621
+
622
+ dataset_obj = Dataset(
623
+ name=dataset_term,
624
+ term=horizons[0] if horizons else "short",
625
+ prediction_length=settings.get("prediction_length"),
626
+ test_length=settings.get("test_length"),
627
+ storage_path=hf_dataset_root,
628
+ )
629
+
630
+ variate_names = dataset_obj.get_variate_names()
631
+ is_uts = (variate_names is None)
632
+ except Exception as e:
633
+ print(f"Error checking UTS/MTS: {e}")
634
+
635
+ # Collect data from all models and horizons
636
+ df_all = []
637
+ for model in ALL_MODELS:
638
+ for horizon in horizons:
639
+ series_filter = None if (series == "---" or series == "") else series
640
+ variate_filter = None if (variate == "---" or variate == "") else variate
641
+
642
+ if is_uts:
643
+ variate_filter = None
644
+
645
+ df_model = load_test_windows(
646
+ display_name, horizon, model,
647
+ series=series_filter,
648
+ variate=variate_filter,
649
+ window_id=None
650
+ )
651
+
652
+ if df_model is not None and not df_model.empty:
653
+ df_all.append(df_model)
654
+
655
+ if not df_all:
656
+ return f"⚠️ No results found for the selected filters.", pd.DataFrame(columns=["model", "MASE", "CRPS", "MAE", "MSE"])
657
+
658
+ # Combine all data
659
+ df_combined = pd.concat(df_all, ignore_index=True)
660
+
661
+ metrics_list = ["MASE", "CRPS", "MAE", "MSE"]
662
+
663
+ # Simple arithmetic mean across all windows (no normalization for series/variate level)
664
+ leaderboard = (
665
+ df_combined.groupby("model")[metrics_list]
666
+ .mean()
667
+ .reset_index()
668
+ )
669
+
670
+ leaderboard = leaderboard.round(3)
671
+
672
+ if metric not in leaderboard.columns:
673
+ return f"Metric '{metric}' not found.", pd.DataFrame(columns=["model"])
674
+
675
+ # Sort by metric
676
+ leaderboard = leaderboard.sort_values(by=metric, ascending=True).reset_index(drop=True)
677
+
678
+ return "", leaderboard
679
+
680
+
681
+
682
+ def get_window_leaderboard(display_name, series, variate, window_id, horizon, metric: str = "MASE"):
683
+ """
684
+ Get leaderboard for a specific test window.
685
+
686
+ Args:
687
+ display_name: Dataset display name from UI (will be converted to dataset_id)
688
+ series: Series name or index
689
+ variate: Variate name or index
690
+ window_id: Window index
691
+ horizon: Horizon name
692
+ metric: Metric for sorting
693
+ """
694
+ df_all = []
695
+ for model in ALL_MODELS:
696
+ df_model = load_test_windows(display_name, horizon, model, series=series, variate=variate, window_id=window_id)
697
+ if df_model is not None and not df_model.empty:
698
+ df_all.append(df_model)
699
+
700
+ if not df_all:
701
+ # Return empty DataFrame with expected columns if no data found
702
+ return pd.DataFrame(columns=["model", "MASE", "CRPS", "MAE", "MSE"])
703
+
704
+ df_all = pd.concat(df_all, ignore_index=True)
705
+
706
+ # metrics DataFrame
707
+ metrics_cols = ["model", "MASE", "CRPS", "MAE", "MSE"]
708
+ leaderboard = df_all[metrics_cols].reset_index(drop=True)
709
+
710
+ if metric not in leaderboard.columns:
711
+ return pd.DataFrame(columns=["model"])
712
+
713
+ # Round numeric columns to 3 decimal places
714
+ numeric_cols = leaderboard.select_dtypes(include=[np.number]).columns
715
+ for col in numeric_cols:
716
+ leaderboard[col] = leaderboard[col].round(3)
717
+
718
+ return leaderboard
719
+
720
+
721
+ def get_pattern_leaderboard(
722
+ pattern_filters: dict[str, int],
723
+ selected_horizons: list[str],
724
+ ) -> tuple[str, pd.DataFrame]:
725
+ """
726
+ Filter variates by selected patterns and compute average metrics per model.
727
+
728
+ Uses FEATURES_BOOL_DF (binarized features) to filter variates,
729
+ then joins with VARIATES_DF to get metrics.
730
+
731
+ Args:
732
+ pattern_filters: Dict mapping pattern names to required values.
733
+ - {feature_name: required_value} where required_value is 0 or 1.
734
+ - Features with "Any" selection are not included in the dict.
735
+ - Example: {"T_strength": 1, "S_strength": 0} means:
736
+ - T_strength must be 1 (has the feature)
737
+ - S_strength must be 0 (does not have the feature)
738
+ selected_horizons: List of horizons to include (e.g., ["short", "medium"])
739
+
740
+ Returns:
741
+ tuple: (message, leaderboard_df)
742
+ - message: Status message with matching count
743
+ - leaderboard_df: DataFrame with model metrics, sorted by MASE
744
+ """
745
+ from src.about import VARIATES_DF, FEATURES_DF, FEATURES_BOOL_DF, PATTERN_MAP
746
+
747
+ # Check if data is available
748
+ if VARIATES_DF.empty:
749
+ return "⚠️ No variate-level results available.", pd.DataFrame(columns=["model", "MASE", "CRPS"])
750
+
751
+ if FEATURES_DF.empty or FEATURES_BOOL_DF.empty:
752
+ return "⚠️ No features data available.", pd.DataFrame(columns=["model", "MASE", "CRPS"])
753
+
754
+ if not selected_horizons:
755
+ return "ℹ️ Please select at least one horizon.", pd.DataFrame(columns=["model", "MASE", "CRPS"])
756
+
757
+ # === Step 1. Apply pattern filters ===
758
+ # Start with all variates
759
+ mask = pd.Series(True, index=FEATURES_BOOL_DF.index)
760
+
761
+ # If no pattern filters (all "Any"), use all variates
762
+ if pattern_filters:
763
+ for pattern, required_value in pattern_filters.items():
764
+ # Map UI pattern name to feature column name
765
+ feature_col = PATTERN_MAP.get(pattern, pattern)
766
+
767
+ if feature_col not in FEATURES_BOOL_DF.columns:
768
+ return f"⚠️ Pattern '{pattern}' (column '{feature_col}') not found in features.", pd.DataFrame(columns=["model", "MASE", "CRPS"])
769
+
770
+ # Special handling for "stationarity" pattern
771
+ # stationarity = NOT is_random_walk
772
+ # When user selects "Has stationarity" (required_value=1), we want is_random_walk == 0
773
+ # When user selects "Not stationarity" (required_value=0), we want is_random_walk == 1
774
+ if pattern == "stationarity":
775
+ mask &= (FEATURES_BOOL_DF[feature_col] == (1 - required_value))
776
+ else:
777
+ mask &= (FEATURES_BOOL_DF[feature_col] == required_value)
778
+
779
+ # Get matching variates
780
+ matched_features = FEATURES_DF[mask].copy()
781
+
782
+ if matched_features.empty:
783
+ # Build debug info for empty results
784
+ debug_info = []
785
+ for pattern, required_value in pattern_filters.items():
786
+ feature_col = PATTERN_MAP.get(pattern, pattern)
787
+ if feature_col in FEATURES_BOOL_DF.columns:
788
+ value_counts = FEATURES_BOOL_DF[feature_col].value_counts().to_dict()
789
+ debug_info.append(f"{pattern} ({feature_col}): {value_counts}")
790
+ debug_msg = "; ".join(debug_info) if debug_info else "No debug info available"
791
+ return f"⚠️ No variates match the selected patterns.\n📊 Feature distribution: {debug_msg}", pd.DataFrame(columns=["model", "MASE", "CRPS"])
792
+
793
+ # === Step 2. Join with VARIATES_DF to get metrics ===
794
+ # Join strategy:
795
+ # - For multivariate (is_uts=False): use full join on (dataset_id, series_name, variate_name)
796
+ # - For univariate (is_uts=True): determine which FEATURES_DF field matches VARIATES_DF series_name
797
+
798
+ # Check if join columns exist
799
+ base_join_cols = ["dataset_id", "series_name", "variate_name"]
800
+ for col in base_join_cols:
801
+ if col not in matched_features.columns:
802
+ return f"⚠️ Column '{col}' not found in features.", pd.DataFrame(columns=["model", "MASE", "CRPS"])
803
+ if col not in VARIATES_DF.columns:
804
+ return f"⚠️ Column '{col}' not found in variates results.", pd.DataFrame(columns=["model", "MASE", "CRPS"])
805
+
806
+ # Select only join columns from features (to avoid column conflicts)
807
+ features_keys = matched_features[base_join_cols].drop_duplicates()
808
+
809
+ # Group by dataset_id and is_uts, then perform appropriate join
810
+ merged_list = []
811
+
812
+ for dataset_id in features_keys["dataset_id"].unique():
813
+ # Get dataset-specific data
814
+ dataset_features = features_keys[features_keys["dataset_id"] == dataset_id]
815
+ dataset_variates = VARIATES_DF[VARIATES_DF["dataset_id"] == dataset_id]
816
+
817
+ if dataset_variates.empty or dataset_features.empty:
818
+ continue
819
+
820
+ # Check is_uts for this dataset (should be consistent across all rows)
821
+ is_uts_values = dataset_variates["is_uts"].unique()
822
+ if len(is_uts_values) > 1:
823
+ print(f"⚠️ Warning: {dataset_id} has inconsistent is_uts values: {is_uts_values}")
824
+ is_uts = is_uts_values[0] if len(is_uts_values) > 0 else False
825
+
826
+ # Initialize dataset_merged to ensure it's always defined
827
+ dataset_merged = pd.DataFrame()
828
+
829
+ if not is_uts:
830
+ # Multivariate: use full join on (dataset_id, series_name, variate_name)
831
+ join_cols = ["dataset_id", "series_name", "variate_name"]
832
+ dataset_features_keys = dataset_features[join_cols].drop_duplicates()
833
+ dataset_merged = dataset_variates.merge(
834
+ dataset_features_keys,
835
+ on=join_cols,
836
+ how="inner"
837
+ )
838
+ else:
839
+ # Univariate: determine which FEATURES_DF field matches VARIATES_DF series_name
840
+ # Get unique series_name values from VARIATES_DF for this dataset
841
+ variates_series_names = set(dataset_variates["series_name"].unique())
842
+
843
+ # Check which FEATURES_DF field matches VARIATES_DF series_name
844
+ features_series_names = set(dataset_features["series_name"].unique())
845
+ features_variate_names = set(dataset_features["variate_name"].unique())
846
+
847
+ features_series_match = len(variates_series_names & features_series_names)
848
+ features_variate_match = len(variates_series_names & features_variate_names)
849
+
850
+ if features_series_match > features_variate_match:
851
+ # FEATURES_DF series_name matches VARIATES_DF series_name
852
+ join_cols = ["dataset_id", "series_name"]
853
+ dataset_features_keys = dataset_features[join_cols].drop_duplicates()
854
+ dataset_merged = dataset_variates.merge(
855
+ dataset_features_keys,
856
+ on=join_cols,
857
+ how="inner"
858
+ )
859
+ elif features_variate_match > features_series_match:
860
+ # FEATURES_DF variate_name matches VARIATES_DF series_name
861
+ # Create mapping: use FEATURES_DF variate_name to match VARIATES_DF series_name
862
+ dataset_features_keys = dataset_features[["dataset_id", "variate_name"]].drop_duplicates()
863
+ # Rename variate_name to series_name for join
864
+ dataset_features_keys = dataset_features_keys.rename(columns={"variate_name": "series_name"})
865
+ dataset_merged = dataset_variates.merge(
866
+ dataset_features_keys,
867
+ on=["dataset_id", "series_name"],
868
+ how="inner"
869
+ )
870
+ else:
871
+ # Both match equally or neither matches - try series_name first
872
+ if features_series_match > 0:
873
+ join_cols = ["dataset_id", "series_name"]
874
+ dataset_features_keys = dataset_features[join_cols].drop_duplicates()
875
+ dataset_merged = dataset_variates.merge(
876
+ dataset_features_keys,
877
+ on=join_cols,
878
+ how="inner"
879
+ )
880
+ else:
881
+ # No match found, skip this dataset
882
+ print(f"⚠️ Warning: {dataset_id} (UTS) - no matching field found between FEATURES_DF and VARIATES_DF series_name")
883
+ print(f" VARIATES_DF series_names: {sorted(list(variates_series_names))[:5]}")
884
+ print(f" FEATURES_DF series_names: {sorted(list(features_series_names))[:5]}")
885
+ print(f" FEATURES_DF variate_names: {sorted(list(features_variate_names))[:5]}")
886
+ continue
887
+
888
+ if not dataset_merged.empty:
889
+ merged_list.append(dataset_merged)
890
+
891
+ # Combine all merged results
892
+ if merged_list:
893
+ merged = pd.concat(merged_list, ignore_index=True)
894
+ else:
895
+ merged = pd.DataFrame(columns=VARIATES_DF.columns)
896
+
897
+ # === Step 3. Apply horizon filter ===
898
+ merged = merged[merged["horizon"].isin(selected_horizons)]
899
+
900
+ if merged.empty:
901
+ return f"⚠️ No results for selected horizons: {selected_horizons}", pd.DataFrame(columns=["model", "MASE", "CRPS"])
902
+
903
+ # === Step 4. Aggregate by model ===
904
+ metric_cols = ["MASE", "CRPS"]
905
+ available_metrics = [col for col in metric_cols if col in merged.columns]
906
+
907
+ # 4a. Original metrics: arithmetic mean across all matching variates and horizons
908
+ original_df = (
909
+ merged.groupby("model")[available_metrics]
910
+ .mean()
911
+ .reset_index()
912
+ )
913
+
914
+ # 4b. Normalized metrics: normalize by Seasonal Naive at (dataset_id, series_name, variate_name, horizon) level
915
+ # then aggregate with geometric mean
916
+ df_normalized = normalize_by_seasonal_naive(
917
+ merged,
918
+ baseline_model="seasonal_naive",
919
+ metrics=available_metrics,
920
+ groupby_cols=["dataset_id", "series_name", "variate_name", "horizon"],
921
+ )
922
+
923
+ # Helper function for geometric mean with NaN handling
924
+ def gmean_with_nan(x):
925
+ valid = x.dropna()
926
+ if len(valid) == 0:
927
+ return np.nan
928
+ return stats.gmean(valid)
929
+
930
+ if not df_normalized.empty:
931
+ normalized_df = (
932
+ df_normalized.groupby("model")[available_metrics]
933
+ .agg(gmean_with_nan)
934
+ .reset_index()
935
+ )
936
+ # Rename columns to *_norm
937
+ rename_map = {col: f"{col}_norm" for col in available_metrics}
938
+ normalized_df = normalized_df.rename(columns=rename_map)
939
+ else:
940
+ # If normalization fails, create empty normalized columns
941
+ normalized_df = original_df[["model"]].copy()
942
+ for col in available_metrics:
943
+ normalized_df[f"{col}_norm"] = np.nan
944
+
945
+ # Combine original and normalized metrics
946
+ leaderboard = original_df.merge(normalized_df, on="model", how="left")
947
+
948
+ # Rename columns for better clarity
949
+ rename_map = {}
950
+ if "MASE" in leaderboard.columns:
951
+ rename_map["MASE"] = "MASE (raw)"
952
+ if "CRPS" in leaderboard.columns:
953
+ rename_map["CRPS"] = "CRPS (raw)"
954
+ if "MASE_norm" in leaderboard.columns:
955
+ rename_map["MASE_norm"] = "MASE (norm.)"
956
+ if "CRPS_norm" in leaderboard.columns:
957
+ rename_map["CRPS_norm"] = "CRPS (norm.)"
958
+
959
+ if rename_map:
960
+ leaderboard = leaderboard.rename(columns=rename_map)
961
+
962
+ # Sort by MASE (norm.) if available, otherwise by MASE (raw)
963
+ if "MASE (norm.)" in leaderboard.columns:
964
+ leaderboard = leaderboard.sort_values(by="MASE (norm.)", ascending=True).reset_index(drop=True)
965
+ elif "MASE (raw)" in leaderboard.columns:
966
+ leaderboard = leaderboard.sort_values(by="MASE (raw)", ascending=True).reset_index(drop=True)
967
+
968
+ # Round numeric columns to 3 decimal places
969
+ numeric_cols = leaderboard.select_dtypes(include=[np.number]).columns
970
+ for col in numeric_cols:
971
+ # Round to 3 decimal places and ensure proper formatting
972
+ leaderboard[col] = leaderboard[col].round(3)
973
+ # Convert to float64 to ensure consistent display
974
+ leaderboard[col] = leaderboard[col].astype('float64')
975
+
976
+ # Reorder columns: model, MASE (norm.), CRPS (norm.), MASE (raw), CRPS (raw)
977
+ col_order = ["model", "MASE (norm.)", "CRPS (norm.)", "MASE (raw)", "CRPS (raw)"]
978
+ col_order = [col for col in col_order if col in leaderboard.columns]
979
+ leaderboard = leaderboard[col_order]
980
+
981
+ # === Step 5. Build message ===
982
+ num_variates = len(features_keys)
983
+ num_results = len(merged)
984
+ num_models = leaderboard["model"].nunique()
985
+
986
+ # Count by dataset
987
+ dataset_counts = features_keys["dataset_id"].value_counts().to_dict()
988
+ dataset_msg = ", ".join([f"{ds}: {cnt}" for ds, cnt in list(dataset_counts.items())[:5]])
989
+ if len(dataset_counts) > 5:
990
+ dataset_msg += f", ... ({len(dataset_counts)} datasets total)"
991
+
992
+ # Build pattern description
993
+ if pattern_filters:
994
+ pattern_desc = ", ".join([
995
+ f"{p}={v}" for p, v in pattern_filters.items()
996
+ ])
997
+ filter_msg = f"🔍 Filters: {pattern_desc}"
998
+ else:
999
+ filter_msg = "🔍 Filters: All N/A (no filtering, all variates included)"
1000
+
1001
+ msg = (
1002
+ f"✨ {num_variates} variates matched across {len(dataset_counts)} datasets.\n"
1003
+ f"📊 {num_results} results from {num_models} models.\n"
1004
+ f"{filter_msg}\n"
1005
+ f"📁 Datasets: {dataset_msg}"
1006
+ )
1007
+
1008
+ return msg, leaderboard
1009
+
1010
+
1011
+ # def get_archive_results(dataset_name: str, selected_patterns: list[str], variate_name: str):
1012
+ # """
1013
+ # Return variates filtered by dataset, patterns, and variate name.
1014
+ # """
1015
+ # if FEATURES_BOOL_DF.empty:
1016
+ # return pd.DataFrame(columns=["dataset", "variate_name"])
1017
+
1018
+ # df = FEATURES_DF.copy()
1019
+ # df_bool = FEATURES_BOOL_DF.copy()
1020
+
1021
+ # # Dataset filter
1022
+ # if dataset_name and dataset_name != "All":
1023
+ # df = df[df["dataset"] == dataset_name]
1024
+
1025
+ # # Pattern filter
1026
+ # if selected_patterns:
1027
+ # mask = pd.Series(True, index=df_bool.index)
1028
+ # for pattern in selected_patterns:
1029
+ # if pattern in df_bool.columns:
1030
+ # mask &= df_bool[pattern] == 1
1031
+ # df = df[mask]
1032
+
1033
+ # # Variate filter
1034
+ # if variate_name and variate_name != "All":
1035
+ # df = df[df["variate_name"] == variate_name]
1036
+
1037
+ # if df.empty:
1038
+ # return pd.DataFrame(columns=FEATURES_BOOL_DF.columns) # ToDO: columns换成完整的
1039
+
1040
+ # # --- Add freq & domain from YAML ---
1041
+ # freq_map, domain_map = {}, {}
1042
+ # for ds in df["dataset"].unique():
1043
+ # yaml_path = os.path.join("conf", "data", f"{ds}.yaml")
1044
+ # if os.path.exists(yaml_path):
1045
+ # with open(yaml_path, "r") as f:
1046
+ # meta = yaml.safe_load(f)
1047
+ # freq_map[ds] = meta.get("freq", None)
1048
+ # domain_map[ds] = meta.get("domain", None)
1049
+ # else:
1050
+ # freq_map[ds] = None
1051
+ # domain_map[ds] = None
1052
+
1053
+ # df["freq"] = df["dataset"].map(freq_map)
1054
+ # df["domain"] = df["dataset"].map(domain_map)
1055
+
1056
+ # # Select useful columns
1057
+ # base_cols = ["dataset", "variate_name", "freq", "domain"]
1058
+ # tsfeature_cols = [c for c in df.columns if c not in base_cols+['unitroot_pp', 'unitroot_kpss']]
1059
+
1060
+ # # === Rename feature columns ===
1061
+ # renamed_cols = {}
1062
+ # for col in tsfeature_cols:
1063
+ # if col == 'trend':
1064
+ # renamed_cols['trend'] = "T_strength"
1065
+ # elif col.startswith("trend"):
1066
+ # renamed_cols[col] = col.replace("trend", "T", 1)
1067
+ # elif col.startswith("seasonality"):
1068
+ # renamed_cols[col] = col.replace("seasonality", "S", 1)
1069
+ # elif col.startswith("seasonal"):
1070
+ # renamed_cols[col] = col.replace("seasonal", "S", 1)
1071
+ # df = df.rename(columns=renamed_cols)
1072
+
1073
+ # # === Apply new column names ===
1074
+ # tsfeature_cols = [renamed_cols.get(c, c) for c in tsfeature_cols]
1075
+
1076
+ # global_cols = ["x_acf1", "x_acf10", "lumpiness", "stability", "hurst", "entropy"]
1077
+ # t_cols = [c for c in tsfeature_cols if c.startswith("T_")]
1078
+ # s_cols = [c for c in tsfeature_cols if c.startswith("S_")]
1079
+ # e_cols = [c for c in tsfeature_cols if c.startswith("e_")]
1080
+ # stats_cols = [c for c in tsfeature_cols if c not in global_cols+ t_cols + s_cols + e_cols]
1081
+
1082
+ # ordered_cols = base_cols + global_cols + t_cols + s_cols + e_cols + stats_cols
1083
+
1084
+
1085
+ # return df[ordered_cols].round(3)
src/tab.py ADDED
@@ -0,0 +1,1370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import plotly.graph_objects as go
4
+
5
+
6
+
7
+ # Add project root and src directory to Python path to enable imports from timebench
8
+ # Get the directory containing this file (leaderboard_app/src/)
9
+ current_dir = os.path.dirname(os.path.abspath(__file__))
10
+ # Get leaderboard_app directory
11
+ leaderboard_app_dir = os.path.dirname(current_dir)
12
+
13
+ # Try multiple paths for timebench import:
14
+ # 1. Current leaderboard_app directory (if timebench was copied to leaderboard_app/)
15
+ # 2. Parent directory's src (for local development: TIME/src/)
16
+
17
+ # Add current leaderboard_app directory first (for Space deployment)
18
+ if leaderboard_app_dir not in sys.path:
19
+ sys.path.insert(0, leaderboard_app_dir)
20
+
21
+ # Get project root directory (TIME/) - for local development
22
+ project_root = os.path.dirname(leaderboard_app_dir)
23
+ if project_root not in sys.path:
24
+ sys.path.insert(0, project_root)
25
+
26
+ src_dir = os.path.join(project_root, "src")
27
+ if src_dir not in sys.path and os.path.exists(src_dir):
28
+ sys.path.insert(0, src_dir)
29
+
30
+ import json
31
+ import gradio as gr
32
+ from src.about import DATASET_CHOICES, ALL_MODELS, RESULTS_ROOT, FEATURES_DF, FEATURES_BOOL_DF, PATTERN_MAP
33
+ from src.leaderboard import (get_overall_leaderboard, get_dataset_multilevel_leaderboard,
34
+ get_window_leaderboard, get_pattern_leaderboard, resolve_dataset_id)
35
+ from src.about import DATASETS_DF, ALL_HORIZONS
36
+ from src.hf_config import get_datasets_root, get_config_root
37
+ import numpy as np
38
+ import pandas as pd
39
+ from pathlib import Path
40
+ import ast
41
+ import matplotlib
42
+ matplotlib.use('Agg') # Use non-interactive backend for Gradio
43
+ import yaml
44
+ import tempfile
45
+
46
+ from timebench.evaluation.data import Dataset, get_dataset_settings, load_dataset_config
47
+ from src.leaderboard import find_dataset_term_path
48
+
49
+
50
+ def export_dataframe_to_csv(df, filename_prefix="leaderboard"):
51
+ """Export a DataFrame to a temporary CSV file and return the path for download.
52
+
53
+ Args:
54
+ df: pandas DataFrame to export
55
+ filename_prefix: prefix for the temporary file name
56
+
57
+ Returns:
58
+ str: path to the temporary CSV file, or None if df is empty
59
+ """
60
+ if df is None or (hasattr(df, 'empty') and df.empty):
61
+ return None
62
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, prefix=f"{filename_prefix}_") as f:
63
+ df.to_csv(f, index=False)
64
+ return f.name
65
+
66
+
67
+ # def update_variate_choices(dataset_name: str, selected_patterns: list[str]):
68
+ # """
69
+ # Dynamically update the variate dropdown choices based on dataset + patterns.
70
+ # """
71
+ # if dataset_name == "All":
72
+ # return gr.Dropdown(choices=["All"], value="All", interactive=False)
73
+
74
+ # # Filter features by dataset
75
+ # df = FEATURES_BOOL_DF[FEATURES_BOOL_DF["dataset"] == dataset_name]
76
+
77
+ # # Apply pattern filters if provided
78
+ # if selected_patterns:
79
+ # mask = pd.Series(True, index=df.index)
80
+ # for pattern in selected_patterns:
81
+ # if pattern in df.columns:
82
+ # mask &= df[pattern] == 1
83
+ # df = df[mask]
84
+
85
+ # variates = sorted(df["variate_name"].unique().tolist())
86
+ # if not variates:
87
+ # return gr.Dropdown(choices=["All"], value="All", interactive=False)
88
+
89
+ # return gr.Dropdown(choices=["All"] + variates, value="All", interactive=True)
90
+
91
+
92
# # Update the Variate selection dropdown
93
+ # def update_variate_choices_groups(dataset_name, t, s, r, g):
94
+ # selected_patterns = (t or []) + (s or []) + (r or []) + (g or [])
95
+ # return update_variate_choices(dataset_name, selected_patterns)
96
+
97
+
98
########################## Dataset Tab ##########################
def update_series_and_variate(display_name):
    """Update the series and variate dropdown choices for the merged Dataset tab.

    Args:
        display_name: Dataset display name from UI dropdown (will be resolved to dataset_id)

    Returns:
        tuple: (series gr.Dropdown, variate gr.Dropdown) component updates.
    """
    # Use first available model to get data
    model_name = ALL_MODELS[0]
    # Find dataset_term (handles display_name -> dataset_id conversion)
    results_root = str(RESULTS_ROOT)
    dataset_term = find_dataset_term_path(results_root, model_name, display_name)

    if dataset_term is None:
        print(f"Error: dataset_term is None for display_name={display_name}, model_name={model_name}")
        return (
            gr.Dropdown(choices=["---"], value="---", label="Select Series", interactive=True),
            gr.Dropdown(choices=["---"], value="---", label="Select Variate", interactive=True),
        )

    # Load Dataset to get actual series and variate names
    # Use HF config to get dataset root (handles both local and HF Hub)
    hf_dataset_root = str(get_datasets_root())

    # Use HF config to get config root
    config_root = get_config_root()
    config_path = config_root / "datasets.yaml"

    # The horizon does not affect the series/variate names, so "short" is used directly.
    # FIX: guard against a missing datasets.yaml instead of crashing — the sibling
    # functions update_series_variate_and_window / plot_window_series already do this.
    config = load_dataset_config(config_path) if config_path.exists() else {}
    settings = get_dataset_settings(dataset_term, "short", config)

    prediction_length = settings.get("prediction_length")
    test_length = settings.get("test_length")

    dataset_obj = Dataset(
        name=dataset_term,
        term="short",
        prediction_length=prediction_length,
        test_length=test_length,
        storage_path=hf_dataset_root,  # Pass storage path directly
    )

    # Get series names
    if "item_id" in dataset_obj.hf_dataset.column_names:
        series_names = dataset_obj.hf_dataset["item_id"]
    else:
        series_names = [dataset_obj.hf_dataset[i].get("item_id", f"item_{i}")
                        for i in range(len(dataset_obj.hf_dataset))]

    series_list = ["---"] + [str(name) for name in series_names]

    # Get variate names
    variate_names = dataset_obj.get_variate_names()

    if variate_names is None:
        # UTS mode: variate dropdown should be disabled
        return (
            gr.Dropdown(choices=series_list, value="---", label="Select Series", interactive=True),
            gr.Dropdown(choices=["---"], value="---", label="Select Variate", interactive=False),
        )
    else:
        # MTS mode: both dropdowns are enabled
        variates_list = ["---"] + [str(name) for name in variate_names]
        return (
            gr.Dropdown(choices=series_list, value="---", label="Select Series", interactive=True),
            gr.Dropdown(choices=variates_list, value="---", label="Select Variate", interactive=True),
        )
168
+
169
+
170
########################## Window Tab ##########################
def get_available_horizons(display_name):
    """Return the horizons available for a dataset.

    Args:
        display_name: Dataset display name from the UI dropdown

    Returns:
        list: available horizons, e.g. ["short", "medium", "long"] or ["short"]
    """
    # Without metadata we cannot narrow anything down — offer everything.
    if DATASETS_DF.empty:
        return ALL_HORIZONS

    # Resolve display_name to dataset_id, then select that dataset's rows.
    dataset_id = resolve_dataset_id(display_name)
    rows = DATASETS_DF[DATASETS_DF["dataset_id"] == dataset_id]

    # Unknown dataset: fall back to every horizon.
    if rows.empty:
        return ALL_HORIZONS

    # Keep canonical ordering (short, medium, long) by filtering ALL_HORIZONS
    # against the horizons actually present for this dataset.
    present = set(rows["horizon"].unique().tolist())
    ordered = [h for h in ALL_HORIZONS if h in present]

    return ordered if ordered else ["short"]
201
+
202
+
203
def update_horizon_choices(display_name):
    """Refresh the horizon Radio component's choices/value for the chosen dataset.

    Args:
        display_name: Dataset display name from the UI dropdown

    Returns:
        gr.Radio: component update carrying the filtered choices and a valid value
    """
    available = get_available_horizons(display_name)

    # Prefer "short" when offered; otherwise pick the first available horizon
    # (defaulting to "short" if the list is somehow empty).
    if "short" in available:
        selected = "short"
    elif available:
        selected = available[0]
    else:
        selected = "short"

    # Preserve canonical ordering by filtering ALL_HORIZONS.
    filtered = [h for h in ALL_HORIZONS if h in available]

    return gr.Radio(choices=filtered, value=selected)
222
+
223
+
224
def update_series_variate_and_window(display_name, horizon):
    """Update the series, variate and testing-window dropdowns for a dataset/horizon.

    Loads the actual Dataset to obtain the real series and variate names.

    Args:
        display_name: Dataset display name from UI dropdown (will be resolved to dataset_id)
        horizon: Horizon name (short, medium, long)

    Returns:
        tuple: (series, variate, window) gr.Dropdown component updates.
    """
    # Use first available model to get data
    model_name = ALL_MODELS[0]

    # Find dataset_term (handles display_name -> dataset_id conversion)
    results_root = str(RESULTS_ROOT)
    dataset_term = find_dataset_term_path(results_root, model_name, display_name)

    if dataset_term is None:
        print(f"Error: dataset_term is None for display_name={display_name}, horizon={horizon}, model_name={model_name}")
        return (
            gr.Dropdown(choices=[], value=None, label="Select Series", interactive=False),
            gr.Dropdown(choices=[], value=None, label="Select Variate", interactive=False),
            gr.Dropdown(choices=[], value=None, label="Select Testing Window", interactive=False),
        )

    # FIX: removed dead code — the original unpacked dataset_term into
    # (dataset_name, freq) here, but neither value was used below.

    # Load Dataset to get actual series and variate names
    # Use HF config to get dataset root (handles both local and HF Hub)
    hf_dataset_root = str(get_datasets_root())

    # Use HF config to get config root
    config_root = get_config_root()
    config_path = config_root / "datasets.yaml"

    config = load_dataset_config(config_path) if config_path.exists() else {}
    settings = get_dataset_settings(dataset_term, horizon, config)

    prediction_length = settings.get("prediction_length")
    test_length = settings.get("test_length")

    # Load dataset
    dataset_obj = Dataset(
        name=dataset_term,
        term=horizon,
        prediction_length=prediction_length,
        test_length=test_length,
        storage_path=hf_dataset_root,  # Pass storage path directly
    )

    # Get series names (item_id) from hf_dataset
    if "item_id" in dataset_obj.hf_dataset.column_names:
        series_names = dataset_obj.hf_dataset["item_id"]
    else:
        # Fallback: get from iterating
        series_names = [dataset_obj.hf_dataset[i].get("item_id", f"item_{i}")
                        for i in range(len(dataset_obj.hf_dataset))]

    # Get variate names
    variate_names = dataset_obj.get_variate_names()

    # Get window count
    num_windows = dataset_obj.windows
    windows = [str(i) for i in range(num_windows)]

    # Convert to lists and maintain order (no sorting).
    # NOTE(review): assumes the dataset has at least one series and one window
    # (series_list[0] / windows[0] below) — confirm for all published datasets.
    series_list = [str(name) for name in series_names]

    # Handle UTS (Univariate Time Series) vs MTS (Multivariate Time Series)
    if variate_names is None:
        # UTS mode: each series is a single variate, so variate is always 0
        return (
            gr.Dropdown(choices=series_list, value=series_list[0], label="Select Series", interactive=True),
            gr.Dropdown(choices=["0"], value="0", label="Select Variate", interactive=False),
            gr.Dropdown(choices=windows, value=windows[0], label="Select Testing Window", interactive=True),
        )
    else:
        # MTS mode: multiple variates per series
        variates_list = [str(name) for name in variate_names]
        return (
            gr.Dropdown(choices=series_list, value=series_list[0], label="Select Series", interactive=True),
            gr.Dropdown(choices=variates_list, value=variates_list[0], label="Select Variate", interactive=True),
            gr.Dropdown(choices=windows, value=windows[0], label="Select Testing Window", interactive=True),
        )
308
+
309
+
310
def plot_window_series(display_name, series, variate, window_id, horizon, selected_quantiles, model):
    """
    Plot time series predictions for a specific window using Plotly for interactive visualization.
    Now includes full time series visualization with test window highlighted.
    Accepts series and variate names (strings) and converts them to indices.

    Reads <results_root>/<model>/<dataset_term>/<horizon>/predictions.npz, which must
    contain the keys "predictions_quantiles" and "quantile_levels" (new format only).

    Args:
        display_name: Dataset display name from UI dropdown (will be resolved to dataset_id)
        series: Series name
        variate: Variate name
        window_id: Window index
        horizon: Horizon name
        selected_quantiles: List of quantile strings to plot
        model: Model name

    Returns:
        tuple: (fig, info_message) where fig is Plotly figure and info_message contains prediction details
    """
    print(f"🔍 plot_window_series called: display_name={display_name}, series={series}, variate={variate}, window_id={window_id}, horizon={horizon}, model={model}")

    # Guard: all selector inputs must be set before anything can be plotted.
    if display_name is None or series is None or variate is None or window_id is None:
        print("❌ Missing parameters")
        fig = go.Figure()
        fig.update_layout(title="Please select all parameters")
        return fig, ""

    results_root = str(RESULTS_ROOT)
    print(f"📁 results_root: {results_root}")
    dataset_term = find_dataset_term_path(results_root, model, display_name)
    print(f"📁 dataset_term: {dataset_term}")
    if dataset_term is None:
        print("❌ Dataset not found")
        fig = go.Figure()
        fig.update_layout(title="Dataset not found")
        return fig, ""

    predictions_path = os.path.join(results_root, model, dataset_term, horizon, "predictions.npz")
    print(f"📁 predictions_path: {predictions_path}, exists: {os.path.exists(predictions_path)}")

    if not os.path.exists(predictions_path):
        print("❌ Predictions file not found")
        fig = go.Figure()
        fig.update_layout(title="Predictions file not found")
        return fig, ""


    predictions = np.load(predictions_path)
    # Load pre-computed quantiles (new format only)
    predictions_quantiles = predictions["predictions_quantiles"]  # (num_series, num_windows, 9, num_variates, prediction_length)
    quantile_levels = predictions["quantile_levels"]  # [0.1, 0.2, ..., 0.9]

    # Load prediction scale factor from config.json (for float16 overflow prevention)
    model_config_path = os.path.join(results_root, model, dataset_term, horizon, "config.json")
    prediction_scale_factor = 1.0
    if os.path.exists(model_config_path):
        with open(model_config_path, "r") as f:
            model_config = json.load(f)
        prediction_scale_factor = model_config.get("prediction_scale_factor", 1.0)
        if prediction_scale_factor != 1.0:
            print(f"📊 Applying inverse scale factor: {prediction_scale_factor}")
            # Undo the storage-time scaling; cast to float32 first to avoid float16 overflow.
            predictions_quantiles = predictions_quantiles.astype(np.float32) * prediction_scale_factor

    # Convert series and variate names to indices
    series_idx = None
    variate_idx = None
    dataset_obj = None

    # Load Dataset to get name-to-index mappings and full time series
    # Use HF config to get dataset root (handles both local and HF Hub)
    hf_dataset_root = str(get_datasets_root())
    print(f"📁 hf_dataset_root: {hf_dataset_root}, exists: {os.path.exists(hf_dataset_root)}")

    # Use HF config to get config root
    config_root = get_config_root()
    config_path_yaml = config_root / "datasets.yaml"
    print(f"📁 config_path_yaml: {config_path_yaml}, exists: {config_path_yaml.exists()}")

    config = load_dataset_config(config_path_yaml) if config_path_yaml.exists() else {}
    settings = get_dataset_settings(dataset_term, horizon, config)
    print(f"⚙️ settings: {settings}")

    prediction_length = settings.get("prediction_length")
    test_length = settings.get("test_length")

    print(f"📥 Loading Dataset: name={dataset_term}, term={horizon}, storage_path={hf_dataset_root}")
    dataset_obj = Dataset(
        name=dataset_term,
        term=horizon,
        prediction_length=prediction_length,
        test_length=test_length,
        storage_path=hf_dataset_root,  # Pass storage path directly
    )
    print(f"✅ Dataset loaded: {len(dataset_obj.hf_dataset)} series")

    # Get frequency from dataset
    dataset_freq = dataset_obj.freq
    print(f"📅 Dataset frequency: {dataset_freq}")

    # Get series names and create mapping
    if "item_id" in dataset_obj.hf_dataset.column_names:
        series_names = dataset_obj.hf_dataset["item_id"]
    else:
        series_names = [dataset_obj.hf_dataset[i].get("item_id", f"item_{i}")
                        for i in range(len(dataset_obj.hf_dataset))]

    print(f"📋 series_names: {list(series_names)}")
    series_name_to_idx = {name: idx for idx, name in enumerate(series_names)}
    if series in series_name_to_idx:
        series_idx = series_name_to_idx[series]
        print(f"✅ Found series '{series}' at index {series_idx}")
    else:
        # Fall back to interpreting the selection as a raw integer index.
        series_idx = int(series)
        print(f"⚠️ Series '{series}' not found in names, using int index {series_idx}")

    # Get variate names and create mapping
    variate_names = dataset_obj.get_variate_names()
    print(f"📋 variate_names: {variate_names}")
    if variate_names is not None:
        # MTS mode: multiple variates per series
        variate_name_to_idx = {name: idx for idx, name in enumerate(variate_names)}
        if variate in variate_name_to_idx:
            variate_idx = variate_name_to_idx[variate]
            print(f"✅ Found variate '{variate}' at index {variate_idx}")
        else:
            variate_idx = int(variate)
            print(f"⚠️ Variate '{variate}' not found in names, using int index {variate_idx}")
    else:
        # UTS mode: each series is a single variate, so variate_idx is always 0
        variate_idx = 0
        print(f"ℹ️ UTS mode, variate_idx=0")

    # Defensive fallbacks — both indices are normally assigned above already.
    if series_idx is None:
        series_idx = int(series)
    if variate_idx is None:
        # For UTS mode, variate_idx should be 0
        try:
            variate_idx = int(variate) if variate is not None else 0
        except (ValueError, TypeError):
            variate_idx = 0

    window_idx = int(window_id)

    # Get pre-computed quantiles for this specific series, window, and variate
    quantiles_data = predictions_quantiles[series_idx, window_idx, :, variate_idx, :]  # (9, prediction_length)
    # Prefer the actual stored length over the config value from here on.
    prediction_length = quantiles_data.shape[1]
    # Create mapping from quantile level string to index
    quantile_level_to_idx = {f"{q:.1f}": i for i, q in enumerate(quantile_levels)}

    # Load full time series data
    full_series = None
    train_end_idx = None
    test_window_start_idx = None
    test_window_end_idx = None

    # Get full target time series for this series
    print(f"📊 Getting target for series_idx={series_idx}, variate_idx={variate_idx}")
    full_target = dataset_obj.hf_dataset[series_idx]["target"]
    print(f"📊 full_target shape: {full_target.shape}, dtype: {full_target.dtype}")
    print(f"📊 full_target first 10 values (all variates): {full_target[:, :10] if full_target.ndim > 1 else full_target[:10]}")

    # Get start timestamp for this series and create timestamp array
    series_start = dataset_obj.hf_dataset[series_idx]["start"]
    print(f"📅 Series start timestamp: {series_start}, type: {type(series_start)}")

    # Handle numpy array containing datetime64 (common when reading from HF dataset)
    if isinstance(series_start, np.ndarray):
        # Extract scalar from array
        series_start = series_start.item() if series_start.ndim == 0 else series_start[0]
        print(f"📅 Extracted scalar: {series_start}, type: {type(series_start)}")

    # Convert numpy datetime64 to pandas Timestamp
    if isinstance(series_start, (np.datetime64, str)):
        series_start = pd.Timestamp(series_start)

    # Calculate series length for timestamp creation
    # NOTE(review): assumes MTS targets are laid out (num_variates, series_length) — confirm upstream.
    if full_target.ndim > 1:
        ts_length = full_target.shape[1]
    else:
        ts_length = len(full_target)

    # Create timestamp array for the entire series
    try:
        timestamps = pd.date_range(start=series_start, periods=ts_length, freq=dataset_freq)
        print(f"📅 Created timestamp array: {timestamps[0]} to {timestamps[-1]}")
    except Exception as e:
        # Any failure (bad freq/start) degrades gracefully to integer indices.
        print(f"⚠️ Failed to create timestamps: {e}, falling back to indices")
        timestamps = None

    # Handle multivariate case: extract specific variate
    if full_target.ndim > 1:
        full_series = full_target[variate_idx, :]  # Shape: (series_length,)
    else:
        full_series = full_target  # Shape: (series_length,)
    print(f"📊 full_series shape: {full_series.shape}, min: {full_series.min()}, max: {full_series.max()}, has_nan: {np.isnan(full_series).any()}")

    # Calculate train/test split point
    # Test data starts at: series_length - test_length
    series_length = len(full_series)
    train_end_idx = series_length - test_length

    # Calculate current test window position (windows tile the test region back-to-back).
    test_window_start_idx = train_end_idx + window_idx * prediction_length
    test_window_end_idx = test_window_start_idx + prediction_length

    # Create Plotly figure
    fig = go.Figure()

    # Quantile colors - from light to dark (symmetric pairs share a color)
    quantile_colors = {
        "0.1": "#c6dbef", "0.9": "#c6dbef",  # lightest
        "0.2": "#6baed6", "0.8": "#6baed6",  # light
        "0.3": "#4292c6", "0.7": "#4292c6",  # medium
        "0.4": "#2171b5", "0.6": "#2171b5",  # dark
        "0.5": "#08306b",  # darkest (median)
    }

    # Calculate prediction time steps (overlay on the test window)
    if test_window_start_idx is not None:
        pred_time_steps = np.arange(test_window_start_idx, test_window_end_idx)
    else:
        pred_time_steps = np.arange(prediction_length)

    # Plot full time series if available
    time_steps = np.arange(len(full_series))

    # Use timestamps for x-axis if available
    if timestamps is not None:
        x_full = timestamps
        x_pred = timestamps[pred_time_steps] if test_window_start_idx is not None else timestamps[:prediction_length]
        x_window = timestamps[test_window_start_idx:test_window_end_idx] if test_window_start_idx is not None else None
    else:
        x_full = time_steps
        x_pred = pred_time_steps
        x_window = np.arange(test_window_start_idx, test_window_end_idx) if test_window_start_idx is not None else None

    # Plot full series in light gray
    fig.add_trace(go.Scatter(
        x=x_full,
        y=full_series,
        mode='lines',
        name='Full Time Series',
        line=dict(color='gray', width=1),
        opacity=0.6,
        hovertemplate='Time: %{x}<br>Value: %{y:.4f}<extra></extra>'
    ))

    # Add shapes for regions (training, test, current window)
    if train_end_idx is not None:
        # Training region - use timestamps if available
        x0_train = timestamps[0] if timestamps is not None else 0
        x1_train = timestamps[train_end_idx] if timestamps is not None else train_end_idx
        fig.add_shape(
            type="rect",
            x0=x0_train, x1=x1_train,
            y0=0, y1=1, yref="paper",
            fillcolor="blue", opacity=0.1,
            layer="below", line_width=0,
        )
        # Test region
        test_region_end = len(full_series)
        x0_test = timestamps[train_end_idx] if timestamps is not None else train_end_idx
        x1_test = timestamps[test_region_end-1] if timestamps is not None else test_region_end-1
        fig.add_shape(
            type="rect",
            x0=x0_test, x1=x1_test,
            y0=0, y1=1, yref="paper",
            fillcolor="orange", opacity=0.15,
            layer="below", line_width=0,
        )

    # Highlight current test window
    if test_window_start_idx is not None and test_window_end_idx is not None:
        # Use timestamps for window highlight if available
        x0_window = timestamps[test_window_start_idx] if timestamps is not None else test_window_start_idx
        x1_window = timestamps[test_window_end_idx-1] if timestamps is not None else test_window_end_idx-1
        fig.add_shape(
            type="rect",
            x0=x0_window, x1=x1_window,
            y0=0, y1=1, yref="paper",
            fillcolor="red", opacity=0.2,
            layer="below", line_width=0,
        )

    # Plot the test window portion of full series
    window_series = full_series[test_window_start_idx:test_window_end_idx]
    fig.add_trace(go.Scatter(
        x=x_window,
        y=window_series,
        mode='lines',
        name='Ground Truth (Window)',
        line=dict(color='red', width=2),
        opacity=0.8,
        hovertemplate='Time: %{x}<br>Value: %{y:.4f}<extra></extra>'
    ))

    # Quantile pairs mapping: UI selection -> (low, high) quantile values
    quantile_pair_map = {
        "0.1-0.9": ("0.1", "0.9"),
        "0.2-0.8": ("0.2", "0.8"),
        "0.3-0.7": ("0.3", "0.7"),
        "0.4-0.6": ("0.4", "0.6"),
    }

    # Helper function to get pre-computed quantile values
    def get_quantile_values(q_str):
        return quantiles_data[quantile_level_to_idx[q_str], :]

    # Plot quantile pairs with fill (based on paired selection)
    for pair_str, (q_low_str, q_high_str) in quantile_pair_map.items():
        if pair_str in selected_quantiles:
            quantile_low = get_quantile_values(q_low_str)
            quantile_high = get_quantile_values(q_high_str)
            color = quantile_colors.get(q_low_str, "#2171b5")

            # Add filled area between quantiles
            # (x runs forward along the upper bound, then backward along the lower)
            fig.add_trace(go.Scatter(
                x=list(x_pred) + list(x_pred[::-1]),
                y=list(quantile_high) + list(quantile_low[::-1]),
                fill='toself',
                fillcolor=color,
                line=dict(color='rgba(255,255,255,0)'),
                hoverinfo="skip",
                showlegend=True,
                name=f'Q{q_low_str}-Q{q_high_str}',
                opacity=0.3
            ))

            # Add lower quantile line
            fig.add_trace(go.Scatter(
                x=x_pred,
                y=quantile_low,
                mode='lines',
                name=f'Q{q_low_str}',
                line=dict(color=color, width=1),
                opacity=0.7,
                showlegend=False,
                hovertemplate=f'Time: %{{x}}<br>Q{q_low_str}: %{{y:.4f}}<extra></extra>'
            ))

            # Add upper quantile line
            fig.add_trace(go.Scatter(
                x=x_pred,
                y=quantile_high,
                mode='lines',
                name=f'Q{q_high_str}',
                line=dict(color=color, width=1),
                opacity=0.7,
                showlegend=False,
                hovertemplate=f'Time: %{{x}}<br>Q{q_high_str}: %{{y:.4f}}<extra></extra>'
            ))

    # Plot median (0.5) if selected
    if "0.5" in selected_quantiles:
        quantile_values = get_quantile_values("0.5")
        color = quantile_colors.get("0.5", "#08306b")

        fig.add_trace(go.Scatter(
            x=x_pred,
            y=quantile_values,
            mode='lines+markers',
            name='Median (Q0.5)',
            line=dict(color=color, width=3),
            marker=dict(size=5, symbol='circle'),
            opacity=0.8,
            hovertemplate='Time: %{x}<br>Q0.5: %{y:.4f}<extra></extra>'
        ))

    # Update layout - use autosize for responsive width
    x_axis_title = "Timestamp" if timestamps is not None else "Time Step"
    fig.update_layout(
        title=None,
        xaxis_title=x_axis_title,
        yaxis_title="Value",
        hovermode='x unified',
        autosize=True,  # use automatic width so the chart adapts to its container
        height=400,
        margin=dict(l=60, r=40, t=60, b=60),  # sensible margins around the plot area
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(size=14)
        ),
        plot_bgcolor='white',
        xaxis=dict(showgrid=True, gridcolor='lightgray', gridwidth=1),
        yaxis=dict(showgrid=True, gridcolor='lightgray', gridwidth=1)
    )


    # Create info message for prediction window
    if timestamps is not None and test_window_start_idx is not None and test_window_end_idx is not None:
        pred_start_ts = timestamps[test_window_start_idx]
        pred_end_ts = timestamps[test_window_end_idx - 1]  # -1 because end index is exclusive
        # Format with weekday name
        start_str = f"{pred_start_ts.strftime('%Y-%m-%d %H:%M:%S')} ({pred_start_ts.day_name()})"
        end_str = f"{pred_end_ts.strftime('%Y-%m-%d %H:%M:%S')} ({pred_end_ts.day_name()})"
        base_info = (
            f"📊 Prediction Length: {prediction_length}\n"
            f"📅 Prediction Range: {start_str} → {end_str}\n"
            f"🔄 Dataset Frequency: {dataset_freq}"
        )
    else:
        # The dir() check is defensive only: dataset_freq is assigned
        # unconditionally above, so the 'N/A' branch should never trigger here.
        base_info = (
            f"📊 Prediction Length: {prediction_length}\n"
            f"📅 Prediction Range: index {test_window_start_idx} → {test_window_end_idx - 1}\n"
            f"🔄 Dataset Frequency: {dataset_freq if 'dataset_freq' in dir() else 'N/A'}"
        )

    # Get features information for the selected variate
    # Pattern names from init_per_pattern_tab
    pattern_names = [
        "T_strength", "T_linearity",
        "S_strength", "S_corr",
        "R_ACF1",
        "stationarity", "complexity"
    ]

    features_info = ""
    if not FEATURES_DF.empty and not FEATURES_BOOL_DF.empty:
        # Find matching row in features dataframes
        # Try to match by dataset_id, series_name, variate_name
        feature_row_orig = None
        feature_row_bool = None

        # Match by dataset_id first
        features_subset_orig = FEATURES_DF[FEATURES_DF["dataset_id"] == dataset_term]
        features_subset_bool = FEATURES_BOOL_DF[FEATURES_BOOL_DF["dataset_id"] == dataset_term]

        print(f"🔍 Features lookup: dataset_term={dataset_term}, series={series}, variate={variate}")
        print(f"🔍 Features subset size: orig={len(features_subset_orig)}, bool={len(features_subset_bool)}")

        # Try matching by series_name and variate_name (for MTS)
        if not features_subset_orig.empty:
            # Check if series_name matches
            if "series_name" in features_subset_orig.columns:
                series_match_orig = features_subset_orig["series_name"] == series
                if series_match_orig.any():
                    series_matched = features_subset_orig[series_match_orig]
                    print(f"🔍 Found {len(series_matched)} rows with series_name={series}")
                    # Check if variate_name matches
                    if "variate_name" in series_matched.columns:
                        # For UTS, variate might be "0" or 0, try both
                        variate_str = str(variate)
                        variate_match_orig = (series_matched["variate_name"] == variate_str) | (series_matched["variate_name"] == variate)
                        if variate_match_orig.any():
                            feature_row_orig = series_matched[variate_match_orig].iloc[0]
                            print(f"✅ Found feature row by series_name + variate_name")
                            # Find corresponding row in bool dataframe
                            if not features_subset_bool.empty and "series_name" in features_subset_bool.columns and "variate_name" in features_subset_bool.columns:
                                series_match_bool = features_subset_bool["series_name"] == series
                                variate_match_bool = (features_subset_bool["variate_name"] == variate_str) | (features_subset_bool["variate_name"] == variate)
                                bool_matched = features_subset_bool[series_match_bool & variate_match_bool]
                                if not bool_matched.empty:
                                    feature_row_bool = bool_matched.iloc[0]

        # If not found, try matching by series_name only (for UTS cases where variate_name might not match)
        if feature_row_orig is None and not features_subset_orig.empty:
            if "series_name" in features_subset_orig.columns:
                series_match_orig = features_subset_orig["series_name"] == series
                if series_match_orig.any():
                    # For UTS, there might be only one row per series
                    series_matched = features_subset_orig[series_match_orig]
                    if len(series_matched) == 1:
                        feature_row_orig = series_matched.iloc[0]
                        print(f"✅ Found feature row by series_name only (UTS)")
                        # Find corresponding row in bool dataframe
                        if not features_subset_bool.empty and "series_name" in features_subset_bool.columns:
                            series_match_bool = features_subset_bool["series_name"] == series
                            bool_matched = features_subset_bool[series_match_bool]
                            if len(bool_matched) == 1:
                                feature_row_bool = bool_matched.iloc[0]

        # If still not found, try matching by variate_name only (for UTS cases where variate_name == series)
        if feature_row_orig is None and not features_subset_orig.empty:
            if "variate_name" in features_subset_orig.columns:
                variate_match_orig = features_subset_orig["variate_name"] == series  # For UTS, series might be the variate_name
                if variate_match_orig.any():
                    feature_row_orig = features_subset_orig[variate_match_orig].iloc[0]
                    print(f"✅ Found feature row by variate_name (series as variate_name)")
                    # Find corresponding row in bool dataframe
                    if not features_subset_bool.empty and "variate_name" in features_subset_bool.columns:
                        variate_match_bool = features_subset_bool["variate_name"] == series
                        if variate_match_bool.any():
                            feature_row_bool = features_subset_bool[variate_match_bool].iloc[0]

        if feature_row_orig is None:
            print(f"⚠️ Could not find features for dataset_term={dataset_term}, series={series}, variate={variate}")
            if not features_subset_orig.empty:
                print(f" Available series_names: {features_subset_orig['series_name'].unique()[:10] if 'series_name' in features_subset_orig.columns else 'N/A'}")
                print(f" Available variate_names: {features_subset_orig['variate_name'].unique()[:10] if 'variate_name' in features_subset_orig.columns else 'N/A'}")

        if feature_row_orig is not None:
            # Build features display
            features_orig_items = []
            features_bool_items = []

            for pattern_name in pattern_names:
                # Map pattern name to feature column name
                feature_col = PATTERN_MAP.get(pattern_name, pattern_name)

                # Get original value (skip stationarity as it's derived from is_random_walk)
                if pattern_name != "stationarity":
                    if feature_col in feature_row_orig.index:
                        orig_value = feature_row_orig[feature_col]
                        if pd.notna(orig_value):
                            features_orig_items.append(f"{pattern_name}: {orig_value:.3f}")

                # Get binary value
                if feature_row_bool is not None and feature_col in feature_row_bool.index:
                    bool_value = feature_row_bool[feature_col]
                    if pd.notna(bool_value):
                        # Special handling for stationarity (it's inverted)
                        if pattern_name == "stationarity":
                            # stationarity = NOT is_random_walk, so display the inverted value
                            display_bool = 1 - int(bool_value)
                        else:
                            display_bool = int(bool_value)
                        features_bool_items.append(f"{pattern_name}: {display_bool}")

            if features_orig_items or features_bool_items:
                features_info = "\n\n 📝 Features of variate:\n"
                if features_orig_items:
                    features_info += "- Original Values: " + ", ".join(features_orig_items) + "\n"
                if features_bool_items:
                    features_info += "- Binary Values (0/1): " + ", ".join(features_bool_items)

    info_message = base_info + features_info

    print(f"📝 Info message: {info_message}")
    return fig, info_message
842
+
843
+
844
def init_overall_tab():
    """Build the "Overall" leaderboard tab.

    Renders an explanatory markdown header, a non-interactive table produced by
    ``get_overall_leaderboard`` over the module-level ``DATASETS_DF``, and a
    CSV-export button/file pair.

    NOTE: Gradio lays components out in creation order, so the statement order
    here is part of the UI behavior — do not reorder.
    """
    gr.Markdown(
        """
        This tab presents each model's overall performance aggregated across all tasks. A **task** is defined as a specific **(dataset, horizon)** pair. For each task, the result is obtained by averaging the metrics across all its variates.
        - **MASE (norm.), CRPS (norm.)**: task-level results are normalized by Seasonal Naive and aggregated by geometric mean.
        - **MASE_rank, CRPS_rank**: for each task, models are ranked by the metric; the average rank across all tasks is then reported.
        """,
        elem_classes="markdown-text"
    )

    # Static table: computed once at tab construction time (no refresh events).
    overall_table = gr.DataFrame(
        value=get_overall_leaderboard(DATASETS_DF, metric="MASE"),
        elem_classes="custom-table",
        interactive=False
    )

    # CSV Export
    def export_overall_csv():
        # Recompute on click so the export reflects current DATASETS_DF content.
        df = get_overall_leaderboard(DATASETS_DF, metric="MASE")
        return export_dataframe_to_csv(df, filename_prefix="overall_leaderboard")

    with gr.Row():
        export_btn = gr.Button("📥 Export CSV", size="sm")
        export_file = gr.File(label="Download CSV", visible=False)

    # First write the CSV into the (hidden) File component, then reveal it.
    export_btn.click(
        fn=export_overall_csv,
        inputs=[],
        outputs=[export_file]
    ).then(
        fn=lambda: gr.File(visible=True),
        inputs=[],
        outputs=[export_file]
    )
878
+
879
+
880
def init_per_dataset_tab(demo):
    """Build the "Per Dataset" tab: dataset/series/variate drill-down table.

    Args:
        demo: The enclosing ``gr.Blocks`` app; used to attach a ``load`` event
            that populates the table on startup.

    NOTE: Gradio lays components out in creation order; the nesting of
    Row/Column blocks below defines the page layout — do not reorder.
    """
    gr.Markdown(
        """
        This tab provides flexible analysis at dataset, series, and variate levels.

        - **Dataset only**: Shows both Seasonal Naive-normalized metrics (task-level) and original non-normalized metrics, plus average ranks
        - **Series/Variate selected**: Shows only original metrics.
        - **Horizons**: Select one or more horizons to aggregate results
        """,
        elem_classes="markdown-text"
    )

    with gr.Row():
        with gr.Column(scale=1):
            horizons = gr.CheckboxGroup(
                choices=ALL_HORIZONS,
                value=ALL_HORIZONS,
                label="Horizons"
            )

            dataset_dropdown = gr.Dropdown(
                choices=DATASET_CHOICES,
                value=DATASET_CHOICES[0],
                label="Dataset",
                interactive=True
            )

            # Initialize series and variate dropdowns
            # (helper returns pre-built gr.Dropdown components for the default dataset)
            initial_dataset = DATASET_CHOICES[0]
            series_dropdown, variate_dropdown = update_series_and_variate(
                initial_dataset
            )

    msg = gr.Textbox(label="Message", interactive=False)
    table = gr.DataFrame(elem_classes="custom-table", interactive=False)

    # Update series and variate dropdowns when dataset changes
    dataset_dropdown.change(
        fn=update_series_and_variate,
        inputs=[dataset_dropdown],
        outputs=[series_dropdown, variate_dropdown],
    )

    # Update leaderboard when any selection changes
    for comp in [dataset_dropdown, series_dropdown, variate_dropdown, horizons]:
        comp.change(
            fn=get_dataset_multilevel_leaderboard,
            inputs=[dataset_dropdown, series_dropdown, variate_dropdown, horizons],
            outputs=[msg, table]
        )

    # Load on startup
    demo.load(
        fn=get_dataset_multilevel_leaderboard,
        inputs=[dataset_dropdown, series_dropdown, variate_dropdown, horizons],
        outputs=[msg, table]
    )

    # CSV Export
    def export_dataset_csv(dataset, series, variate, horizons_val):
        # get_dataset_multilevel_leaderboard returns (message, dataframe);
        # only the dataframe is exported.
        _, df = get_dataset_multilevel_leaderboard(dataset, series, variate, horizons_val)
        # Sanitize dataset name for filename (replace / with _)
        safe_dataset_name = dataset.replace("/", "_") if dataset else "unknown"
        return export_dataframe_to_csv(df, filename_prefix=f"dataset_{safe_dataset_name}")

    with gr.Row():
        export_btn = gr.Button("📥 Export CSV", size="sm")
        export_file = gr.File(label="Download CSV", visible=False)

    # Write the CSV into the hidden File component, then reveal it.
    export_btn.click(
        fn=export_dataset_csv,
        inputs=[dataset_dropdown, series_dropdown, variate_dropdown, horizons],
        outputs=[export_file]
    ).then(
        fn=lambda: gr.File(visible=True),
        inputs=[],
        outputs=[export_file]
    )
958
+
959
+
960
def init_per_window_tab(demo):
    """Build the "Per Window" tab: window-level metrics plus forecast plots.

    Args:
        demo: The enclosing ``gr.Blocks`` app; used for the startup ``load``
            events that draw the initial plot and table.

    NOTE: component creation order defines the layout, and the ``.then()``
    chains encode a strict update order (horizon choices -> dependent
    dropdowns -> plot -> table). Do not reorder.
    """
    gr.Markdown(
        """
        This tab enables detailed analysis of model performance at the level of individual testing windows. By selecting a dataset, variate, horizon, and test window, users can examine window-level metrics (MASE, CRPS, MAE, MSE) at fine granularity and visualize the predicted quantiles of a model along with the ground-truth.
        - **Interactive Visualization**: Zoom, pan, autoscale and download the plot.
        - 🟦 Train Split 🟨 Test Split 🟥 Prediction Window
        """
    )

    # "lo-hi" quantile bands plus the median on its own.
    QUANTILE_PAIR_CHOICES = ["0.1-0.9", "0.2-0.8", "0.3-0.7", "0.4-0.6", "0.5"]
    initial_quantiles = ["0.5"]

    with gr.Row():
        with gr.Column(scale=1):
            # Initialize horizon choices based on first dataset
            initial_dataset = DATASET_CHOICES[0] if DATASET_CHOICES else None
            initial_horizons = get_available_horizons(initial_dataset) if initial_dataset else ALL_HORIZONS
            horizons = gr.Radio(
                choices=initial_horizons,
                # Prefer "short"; otherwise fall back to the first available horizon.
                value="short" if "short" in initial_horizons else (initial_horizons[0] if initial_horizons else "short"),
                label="Horizons"
            )

            # Dropdown for dataset selection
            dataset_dropdown = gr.Dropdown(
                choices=DATASET_CHOICES,
                value=DATASET_CHOICES[0] if DATASET_CHOICES else None,  # default to the first dataset
                label="Dataset",
                interactive=True
            )

            # Initialize series, variate, window dropdowns using function
            series_dropdown, variate_dropdown, window_dropdown = update_series_variate_and_window(
                dataset_dropdown.value, horizons.value
            )

        with gr.Column(scale=2):
            with gr.Row():
                with gr.Column(scale=2):
                    quantiles = gr.CheckboxGroup(
                        choices=QUANTILE_PAIR_CHOICES,
                        value=initial_quantiles,
                        label="Select Quantiles for Visualization"
                    )
                with gr.Column(scale=1):
                    model = gr.Dropdown(
                        choices=ALL_MODELS,
                        value=ALL_MODELS[0],
                        label="Select Model for Visualization",
                        interactive=True
                    )
            ts_visualization = gr.Plot()
            # Message box for prediction window info
            prediction_info = gr.Textbox(
                label="Info",
                interactive=False,
                lines=3
            )

    table_window = gr.DataFrame(elem_classes="custom-table", interactive=False)

    # When dataset changes: first update horizon choices, then update dropdowns
    dataset_dropdown.change(
        fn=update_horizon_choices,
        inputs=[dataset_dropdown],
        outputs=[horizons],
    ).then(
        fn=update_series_variate_and_window,
        inputs=[dataset_dropdown, horizons],
        outputs=[series_dropdown, variate_dropdown, window_dropdown],
    ).then(
        # After dropdowns are updated, refresh the visualization and table
        fn=plot_window_series,
        inputs=[dataset_dropdown, series_dropdown, variate_dropdown, window_dropdown, horizons, quantiles, model],
        outputs=[ts_visualization, prediction_info]
    ).then(
        fn=get_window_leaderboard,
        inputs=[dataset_dropdown, series_dropdown, variate_dropdown, window_dropdown, horizons],
        outputs=table_window
    )

    # When horizon changes: update dropdowns, then refresh visualization
    horizons.change(
        fn=update_series_variate_and_window,
        inputs=[dataset_dropdown, horizons],
        outputs=[series_dropdown, variate_dropdown, window_dropdown],
    ).then(
        fn=plot_window_series,
        inputs=[dataset_dropdown, series_dropdown, variate_dropdown, window_dropdown, horizons, quantiles, model],
        outputs=[ts_visualization, prediction_info]
    ).then(
        fn=get_window_leaderboard,
        inputs=[dataset_dropdown, series_dropdown, variate_dropdown, window_dropdown, horizons],
        outputs=table_window
    )

    # For series, variate, window changes - update visualization and table
    for comp in [series_dropdown, variate_dropdown, window_dropdown]:
        comp.change(
            fn=get_window_leaderboard,
            inputs=[dataset_dropdown, series_dropdown, variate_dropdown, window_dropdown, horizons],
            outputs=table_window
        )
        comp.change(
            fn=plot_window_series,
            inputs=[dataset_dropdown, series_dropdown, variate_dropdown, window_dropdown, horizons, quantiles, model],
            outputs=[ts_visualization, prediction_info]
        )

    # For quantiles and model changes - only update visualization (no table change needed)
    for comp in [quantiles, model]:
        comp.change(
            fn=plot_window_series,
            inputs=[dataset_dropdown, series_dropdown, variate_dropdown, window_dropdown, horizons, quantiles, model],
            outputs=[ts_visualization, prediction_info]
        )

    # Load initial visualization and table on page load
    demo.load(
        fn=plot_window_series,
        inputs=[dataset_dropdown, series_dropdown, variate_dropdown, window_dropdown, horizons, quantiles, model],
        outputs=[ts_visualization, prediction_info]
    )
    demo.load(
        fn=get_window_leaderboard,
        inputs=[dataset_dropdown, series_dropdown, variate_dropdown, window_dropdown, horizons],
        outputs=table_window
    )

    # CSV Export
    def export_window_csv(dataset, series, variate, window, horizon):
        df = get_window_leaderboard(dataset, series, variate, window, horizon)
        return export_dataframe_to_csv(df, filename_prefix="window_leaderboard")

    with gr.Row():
        export_btn = gr.Button("📥 Export CSV", size="sm")
        export_file = gr.File(label="Download CSV", visible=False)

    # Write the CSV into the hidden File component, then reveal it.
    export_btn.click(
        fn=export_window_csv,
        inputs=[dataset_dropdown, series_dropdown, variate_dropdown, window_dropdown, horizons],
        outputs=[export_file]
    ).then(
        fn=lambda: gr.File(visible=True),
        inputs=[],
        outputs=[export_file]
    )
1107
+
1108
+
1109
def init_per_pattern_tab(demo):
    """Build the "Per Pattern" tab: filter variates by boolean time-series
    features and show aggregated model performance over the matching set.

    Args:
        demo: The enclosing ``gr.Blocks`` app; used for the startup ``load``
            event that fills the table with the unfiltered view.

    NOTE: component creation order defines the layout — do not reorder the
    Row/Column blocks below.
    """
    gr.Markdown(
        """
        This tab allows you to explore model performance based on **selected patterns**.

        Select patterns to filter variates that exhibit those characteristics, then view aggregated model performance.
        Each pattern is a **boolean indicator** derived from time series features (binarized by **median** threshold for continuous features).

        - **Patterns are intersected**: A variate must exhibit ALL selected patterns to be included.
        - **MASE (norm.), CRPS (norm.)**: variate-level results are normalized by Seasonal Naive and aggregated by geometric mean across all matching variates.
        - **MASE (raw), CRPS (raw)**: arithmetic mean across all matching variates.
        """,
        elem_classes="markdown-text"
    )

    # Define pattern choices for Radio components.
    # "N/A" = no filter, "=1" = must exhibit the pattern, "=0" = must not.
    PATTERN_CHOICES = ["N/A", "=1", "=0"]

    with gr.Row():  # TSFeatures
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 📈 Trend Features")
                T_strength = gr.Radio(
                    choices=PATTERN_CHOICES, value="N/A", label="T_strength"
                )
                T_linearity = gr.Radio(
                    choices=PATTERN_CHOICES, value="N/A", label="T_linearity"
                )
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 🔄 Seasonal Features")
                S_strength = gr.Radio(
                    choices=PATTERN_CHOICES, value="N/A", label="S_strength"
                )
                S_corr = gr.Radio(
                    choices=PATTERN_CHOICES, value="N/A", label="S_corr"
                )
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 🎯 Residual Features")
                R_ACF1 = gr.Radio(
                    choices=PATTERN_CHOICES, value="N/A", label="R_ACF1"
                )
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### ⚙️ Global Features")
                stationarity = gr.Radio(
                    choices=PATTERN_CHOICES, value="N/A", label="stationarity"
                )
                complexity = gr.Radio(
                    choices=PATTERN_CHOICES, value="N/A", label="complexity"
                )

    # List of all pattern Radio components and their names.
    # pattern_radios and pattern_names MUST stay index-aligned: the callbacks
    # below zip them positionally.
    pattern_radios = [
        T_strength, T_linearity,
        S_strength, S_corr,
        R_ACF1,
        stationarity, complexity
    ]
    pattern_names = [
        "T_strength", "T_linearity",
        "S_strength", "S_corr",
        "R_ACF1",
        "stationarity", "complexity"
    ]

    with gr.Row():
        with gr.Column(scale=1):
            horizons = gr.CheckboxGroup(
                choices=ALL_HORIZONS,
                value=ALL_HORIZONS,
                label="Horizons"
            )
        with gr.Column(scale=2):
            msg_pattern = gr.Textbox(label="Status", interactive=False, lines=4)

    table_variates = gr.DataFrame(elem_classes="custom-table", interactive=False)

    def merge_patterns(*radio_values):
        """Convert Radio values to pattern filter dict.

        Args:
            *radio_values: Values from all Radio components in order of pattern_names

        Returns:
            dict: {feature_name: required_value} where required_value is 0 or 1.
                Features with "N/A" are not included in the dict.
        """
        result = {}
        for name, value in zip(pattern_names, radio_values):
            if value == "=1":
                result[name] = 1
            elif value == "=0":
                result[name] = 0
            # "N/A" -> don't include in dict (no filter on this feature)
        return result

    def update_leaderboard(*args):
        """Callback to update the pattern leaderboard.

        Args:
            *args: All Radio values followed by horizons (last argument)
        """
        # Last argument is horizons, rest are pattern radio values
        horizons_val = args[-1]
        radio_values = args[:-1]
        pattern_filters = merge_patterns(*radio_values)
        return get_pattern_leaderboard(pattern_filters, horizons_val)

    # Bind change events for all pattern radios and horizons
    all_inputs = pattern_radios + [horizons]
    for comp in all_inputs:
        comp.change(
            fn=update_leaderboard,
            inputs=all_inputs,
            outputs=[msg_pattern, table_variates]
        )

    # Load initial state
    demo.load(
        fn=update_leaderboard,
        inputs=all_inputs,
        outputs=[msg_pattern, table_variates]
    )

    # CSV Export
    def export_pattern_csv(*args):
        # Last argument is horizons, rest are pattern radio values
        horizons_val = args[-1]
        radio_values = args[:-1]
        pattern_filters = merge_patterns(*radio_values)
        # get_pattern_leaderboard returns (message, dataframe); export the df.
        _, df = get_pattern_leaderboard(pattern_filters, horizons_val)
        return export_dataframe_to_csv(df, filename_prefix="pattern_leaderboard")

    with gr.Row():
        export_btn = gr.Button("📥 Export CSV", size="sm")
        export_file = gr.File(label="Download CSV", visible=False)

    # Write the CSV into the hidden File component, then reveal it.
    export_btn.click(
        fn=export_pattern_csv,
        inputs=all_inputs,
        outputs=[export_file]
    ).then(
        fn=lambda: gr.File(visible=True),
        inputs=[],
        outputs=[export_file]
    )
1257
+
1258
+
1259
+ # # ToDO: Now the archive is using different features from the ones in per_pattern tab
1260
+ # def init_archive_tab(demo):
1261
+ # gr.Markdown(
1262
+ # """
1263
+ # This tab provides an interactive archive of the features of time series variates across datasets. You can explore the archive by specifying a dataset, domain, and frequency, and filter variates with the selected structural patterns. Each pattern is a **boolean indicator** showing whether a variate exhibits the pattern, with thresholds derived from the distribution of feature values across the entire dataset. Pattern filters are applied as an **intersection** (a variate must exhibit all selected patterns). Domain and frequency filters are applied as a **union** (a variate may belong to any selected category). The resulting table displays all variates that satisfy the chosen filters, together with their dataset, frequency, domain, and computed feature values. This view makes it possible to identify and group variates that share similar feature profiles.
1264
+ # """
1265
+ # )
1266
+
1267
+ # with gr.Row():
1268
+ # with gr.Column(scale=1):
1269
+ # dataset_dropdown = gr.Dropdown(
1270
+ # choices=["All"] + sorted(FEATURES_BOOL_DF["dataset"].unique().tolist()),
1271
+ # value="All",
1272
+ # label="Select Dataset"
1273
+ # )
1274
+
1275
+ # variate_dropdown = gr.Dropdown(
1276
+ # choices=["All"],
1277
+ # value="All",
1278
+ # label="Select Variate",
1279
+ # interactive=False
1280
+ # )
1281
+
1282
+ # domains = gr.CheckboxGroup(
1283
+ # choices=ALL_DOMAINS,
1284
+ # value=ALL_DOMAINS, # default all checked
1285
+ # label="Domains"
1286
+ # )
1287
+
1288
+ # freqs = gr.CheckboxGroup(
1289
+ # choices=ALL_FREQS,
1290
+ # value=ALL_FREQS, # 默认全选
1291
+ # label="Frequencies"
1292
+ # )
1293
+
1294
+ # with gr.Column(scale=2):
1295
+ # trend_group = gr.CheckboxGroup(
1296
+ # choices=["trend", "trend_stability", "trend_lumpiness", "trend_hurst", "trend_entropy"],
1297
+ # label="Trend Patterns"
1298
+ # )
1299
+
1300
+ # season_group = gr.CheckboxGroup(
1301
+ # choices=["seasonal_strength", "seasonality_corr", "seasonal_stability",
1302
+ # "seasonal_lumpiness", "seasonal_hurst", "seasonal_entropy"],
1303
+ # label="Seasonality Patterns"
1304
+ # )
1305
+
1306
+ # remainder_group = gr.CheckboxGroup(
1307
+ # choices=["e_acf1", "e_acf10",
1308
+ # "e_entropy", "e_hurst", "e_lumpiness", "e_outlier_ratio"],
1309
+ # label="Remainder Patterns"
1310
+ # )
1311
+
1312
+ # global_group = gr.CheckboxGroup(
1313
+ # choices=["x_acf1", "x_acf10", "lumpiness", "stability", "hurst", "entropy"],
1314
+ # label="Global Patterns"
1315
+ # )
1316
+
1317
+ # msg_box = gr.Textbox(
1318
+ # label="Message",
1319
+ # interactive=False
1320
+ # )
1321
+
1322
+ # archive_leaderboard = gr.DataFrame(
1323
+ # elem_classes="custom-table",
1324
+ # elem_id="archive-table",
1325
+ # max_height=600,
1326
+ # interactive=False
1327
+ # )
1328
+
1329
+ # # 绑定事件
1330
+ # domains.change(
1331
+ # fn=update_dataset_choices,
1332
+ # inputs=[domains, freqs],
1333
+ # outputs=dataset_dropdown
1334
+ # )
1335
+
1336
+ # freqs.change(
1337
+ # fn=update_dataset_choices,
1338
+ # inputs=[domains, freqs],
1339
+ # outputs=dataset_dropdown
1340
+ # )
1341
+
1342
+ # # Change DF
1343
+ # for comp in [dataset_dropdown, trend_group, season_group, remainder_group, global_group]:
1344
+ # comp.change(
1345
+ # fn=update_variate_choices_groups,
1346
+ # inputs=[dataset_dropdown, trend_group, season_group, remainder_group, global_group],
1347
+ # outputs=variate_dropdown
1348
+ # )
1349
+ # comp.change(
1350
+ # fn=collect_patterns,
1351
+ # inputs=[dataset_dropdown, trend_group, season_group, remainder_group, global_group,
1352
+ # variate_dropdown, domains, freqs],
1353
+ # outputs=[msg_box, archive_leaderboard]
1354
+ # )
1355
+
1356
+ # for comp in [variate_dropdown, domains, freqs]:
1357
+ # comp.change(
1358
+ # fn=collect_patterns,
1359
+ # inputs=[dataset_dropdown, trend_group, season_group, remainder_group, global_group,
1360
+ # variate_dropdown, domains, freqs],
1361
+ # outputs=[msg_box, archive_leaderboard]
1362
+ # )
1363
+
1364
+ # # Initial Load
1365
+ # demo.load(
1366
+ # fn=collect_patterns,
1367
+ # inputs=[dataset_dropdown, trend_group, season_group, remainder_group, global_group,
1368
+ # variate_dropdown, domains, freqs],
1369
+ # outputs=[msg_box, archive_leaderboard]
1370
+ # )
src/utils.py ADDED
@@ -0,0 +1,635 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ import json
5
+ from typing import List, Tuple, Optional
6
+ import yaml
7
+ from pathlib import Path
8
+ from scipy import stats
9
+
10
+ from timebench.evaluation.data import Dataset, get_dataset_settings, load_dataset_config
11
+ from src.hf_config import get_datasets_root, get_config_root
12
+
13
+
14
def load_time_results(root_dir, model_name, dataset_with_freq, horizon):
    """
    Load TIME results from NPZ files for a specific model, dataset, and horizon.

    Args:
        root_dir: Root directory containing TIME results (e.g., "output/results")
        model_name: Model name (e.g., "moirai_small")
        dataset_with_freq: Dataset and freq combined (e.g., "Water_Quality_Darwin/15T")
        horizon: Horizon name (e.g., "short", "medium", "long")

    Returns:
        tuple: (metrics_dict, predictions_dict, config_dict), or
            (None, None, None) if either NPZ file is missing. config_dict
            is {} when config.json is absent.
    """
    horizon_dir = os.path.join(root_dir, model_name, dataset_with_freq, horizon)
    metrics_path = os.path.join(horizon_dir, "metrics.npz")
    predictions_path = os.path.join(horizon_dir, "predictions.npz")
    config_path = os.path.join(horizon_dir, "config.json")

    if not os.path.exists(metrics_path) or not os.path.exists(predictions_path):
        return None, None, None

    # np.load on a .npz returns an NpzFile that keeps the underlying zip
    # handle open; use context managers so the files are closed promptly
    # (the original version leaked one handle per call).
    with np.load(metrics_path) as metrics:
        metrics_dict = {k: metrics[k] for k in metrics.files}
    with np.load(predictions_path) as predictions:
        predictions_dict = {k: predictions[k] for k in predictions.files}

    config_dict = {}
    if os.path.exists(config_path):
        with open(config_path, "r") as f:
            config_dict = json.load(f)

    return metrics_dict, predictions_dict, config_dict
47
+
48
+
49
def get_all_datasets_results(root_dir="output/results"):
    """
    Build the dataset-level leaderboard table from TIME NPZ result files.

    Walks the results tree laid out as model/dataset/freq/horizon/, loads the
    metric arrays for each leaf via ``load_time_results`` and reduces each
    metric with a NaN-aware mean.

    Args:
        root_dir (str): Path to the TIME results root directory
            (e.g., "output/results").

    Returns:
        pd.DataFrame: One row per (model, dataset/freq, horizon) with columns
            ["model", "dataset", "freq", "dataset_id", "horizon",
             "MASE", "CRPS", "MAE", "MSE"].
            - dataset: Original dataset name (e.g., "Traffic")
            - freq: Frequency string (e.g., "15T", "1H")
            - dataset_id: Unique identifier as "dataset/freq" (e.g., "Traffic/15T")
            Empty (with the same columns) when root_dir is missing or no
            results are found.
    """
    result_columns = ["model", "dataset", "freq", "dataset_id", "horizon", "MASE", "CRPS", "MAE", "MSE"]

    if not os.path.exists(root_dir):
        print(f"Error: root_dir={root_dir} does not exist")
        return pd.DataFrame(columns=result_columns)

    records = []
    for model_name in os.listdir(root_dir):
        model_path = os.path.join(root_dir, model_name)
        if not os.path.isdir(model_path):
            continue

        for ds_name in os.listdir(model_path):
            ds_path = os.path.join(model_path, ds_name)
            if not os.path.isdir(ds_path):
                continue

            # Nested layout: model/dataset/freq/horizon/
            for freq_name in os.listdir(ds_path):
                if not os.path.isdir(os.path.join(ds_path, freq_name)):
                    continue

                dataset_id = f"{ds_name}/{freq_name}"
                for horizon in ("short", "medium", "long"):
                    metrics_dict, _, _ = load_time_results(root_dir, model_name, dataset_id, horizon)
                    if metrics_dict is None:
                        continue

                    record = {
                        "model": model_name,
                        "dataset": ds_name,
                        "freq": freq_name,
                        "dataset_id": dataset_id,  # unique identifier: dataset/freq
                        "horizon": horizon,
                    }
                    # NaN-aware mean over all stored entries of each metric.
                    for metric in ("MASE", "CRPS", "MAE", "MSE"):
                        record[metric] = np.nanmean(metrics_dict.get(metric, np.array([])))
                    records.append(record)

    return pd.DataFrame(records) if records else pd.DataFrame(columns=result_columns)
114
+
115
+
116
def get_dataset_display_map(datasets_df: pd.DataFrame) -> Tuple[dict, dict]:
    """
    Build bidirectional display-name mappings for dataset identifiers.

    A dataset that appears with a single frequency is shown by its bare name
    (e.g., "Australia_Solar"); one that appears with several frequencies keeps
    the disambiguating "dataset/freq" form (e.g., "Traffic/15T").

    Args:
        datasets_df: DataFrame with 'dataset', 'freq', 'dataset_id' columns.

    Returns:
        Tuple of:
        - id_to_display: dict mapping dataset_id -> display_name
        - display_to_id: dict mapping display_name -> dataset_id
    """
    if datasets_df.empty:
        return {}, {}

    # How many distinct frequencies each dataset name is recorded with.
    freqs_per_dataset = datasets_df.groupby('dataset')['freq'].nunique()

    id_to_display = {}
    display_to_id = {}

    distinct_configs = datasets_df[['dataset', 'freq', 'dataset_id']].drop_duplicates()
    for _, cfg in distinct_configs.iterrows():
        ds_id = cfg['dataset_id']
        ds_name = cfg['dataset']
        # Ambiguous (multi-freq) datasets keep the "dataset/freq" id as label.
        display = ds_id if freqs_per_dataset[ds_name] > 1 else ds_name
        id_to_display[ds_id] = display
        display_to_id[display] = ds_id

    return id_to_display, display_to_id
158
+
159
+
160
def get_all_variates_results(root_dir: str = "output/results") -> pd.DataFrame:
    """
    Collect all variate-individual-level results from TIME NPZ files.

    Each (series, variate) combination is treated as an independent variate individual.
    Metrics are aggregated only across windows (not across series).
    Uses actual series_names and variate_names from Dataset objects.

    Args:
        root_dir (str): Path to the TIME results root directory (e.g., "output/results").

    Returns:
        pd.DataFrame: DataFrame with columns:
            ["dataset_id", "series_name", "variate_name", "is_uts", "model", "horizon", "MASE", "CRPS", "MAE", "MSE"]
        Number of Rows: num_models x num_datasets x num_horizons x num_series x num_variates
    """
    rows = []

    if not os.path.exists(root_dir):
        print(f"[get_all_variates_results] root_dir={root_dir} does not exist")
        return pd.DataFrame(columns=["dataset_id", "series_name", "variate_name", "is_uts", "model", "horizon", "MASE", "CRPS", "MAE", "MSE"])

    # Cache for dataset info (series_names, variate_names) to avoid repeated loading
    # (the same dataset_id is visited once per model).
    dataset_info_cache = {}

    for model in os.listdir(root_dir):
        model_dir = os.path.join(root_dir, model)
        if not os.path.isdir(model_dir):
            continue

        for dataset in os.listdir(model_dir):
            dataset_dir = os.path.join(model_dir, dataset)
            if not os.path.isdir(dataset_dir):
                continue

            # Nested structure: model/dataset/freq/horizon/
            for freq_dir in os.listdir(dataset_dir):
                freq_path = os.path.join(dataset_dir, freq_dir)
                if not os.path.isdir(freq_path):
                    continue

                dataset_id = f"{dataset}/{freq_dir}"

                # Get series_names and variate_names (use cache)
                if dataset_id not in dataset_info_cache:
                    series_names = None
                    variate_names = None
                    is_uts = False
                    try:
                        hf_dataset_root = str(get_datasets_root())
                        if os.path.exists(hf_dataset_root):
                            config_root = get_config_root()
                            config_path = config_root / "datasets.yaml"
                            config = load_dataset_config(config_path) if config_path.exists() else {}
                            # NOTE(review): names are read from the "short"-term view of the
                            # dataset; assumes they are identical across horizons — confirm.
                            settings = get_dataset_settings(dataset_id, "short", config)

                            dataset_obj = Dataset(
                                name=dataset_id,
                                term="short",
                                prediction_length=settings.get("prediction_length"),
                                test_length=settings.get("test_length"),
                                storage_path=hf_dataset_root,
                            )

                            # Get series names
                            if "item_id" in dataset_obj.hf_dataset.column_names:
                                series_names = list(dataset_obj.hf_dataset["item_id"])
                            else:
                                series_names = [f"item_{i}" for i in range(len(dataset_obj.hf_dataset))]

                            # Get variate names
                            variate_names = dataset_obj.get_variate_names()
                            if variate_names is None:
                                # UTS mode: variate_names = series_names, and is_uts = True
                                is_uts = True
                                variate_names = series_names
                            else:
                                variate_names = list(variate_names)
                    except Exception as e:
                        # Best-effort: on any loading failure fall back to the
                        # synthetic names below rather than aborting the scan.
                        print(f"[get_all_variates_results] Error loading Dataset info for {dataset_id}: {e}")

                    dataset_info_cache[dataset_id] = {
                        "series_names": series_names,
                        "variate_names": variate_names,
                        "is_uts": is_uts,
                    }

                info = dataset_info_cache[dataset_id]
                series_names = info["series_names"]
                variate_names = info["variate_names"]
                is_uts = info["is_uts"]

                for horizon in ["short", "medium", "long"]:
                    metrics_dict, _, _ = load_time_results(root_dir, model, dataset_id, horizon)
                    if metrics_dict is None:
                        continue

                    # Get metrics arrays: shape = (num_series, num_windows, num_variates)
                    mase_arr = metrics_dict.get("MASE", np.array([]))
                    crps_arr = metrics_dict.get("CRPS", np.array([]))
                    mae_arr = metrics_dict.get("MAE", np.array([]))
                    mse_arr = metrics_dict.get("MSE", np.array([]))

                    if mase_arr.size == 0:
                        continue

                    # NOTE(review): only MASE's presence/shape is validated; the
                    # 3-D indexing below assumes CRPS/MAE/MSE are always stored
                    # alongside MASE with the same shape — confirm against the
                    # writer of metrics.npz.
                    num_series, num_windows, num_variates = mase_arr.shape

                    # Iterate over each (series, variate) combination
                    for series_idx in range(num_series):
                        series_name = series_names[series_idx] if series_names and series_idx < len(series_names) else f"item_{series_idx}"

                        for variate_idx in range(num_variates):
                            # For UTS: variate_name = series_name (since each series is its own variate)
                            if is_uts:
                                variate_name = series_name
                            else:
                                variate_name = variate_names[variate_idx] if variate_names and variate_idx < len(variate_names) else str(variate_idx)

                            # Aggregate only across windows
                            mase = np.nanmean(mase_arr[series_idx, :, variate_idx])
                            crps = np.nanmean(crps_arr[series_idx, :, variate_idx])
                            mae = np.nanmean(mae_arr[series_idx, :, variate_idx])
                            mse = np.nanmean(mse_arr[series_idx, :, variate_idx])

                            # Skip if all values are NaN
                            if np.isnan(mase) and np.isnan(crps):
                                continue

                            rows.append({
                                "dataset_id": dataset_id,
                                "series_name": series_name,
                                "variate_name": variate_name,
                                "is_uts": is_uts,
                                "model": model,
                                "horizon": horizon,
                                "MASE": mase,
                                "CRPS": crps,
                                "MAE": mae,
                                "MSE": mse,
                            })

    if rows:
        return pd.DataFrame(rows)
    else:
        return pd.DataFrame(columns=["dataset_id", "series_name", "variate_name", "is_uts", "model", "horizon", "MASE", "CRPS", "MAE", "MSE"])
306
+
307
+
308
def get_all_domains_and_freq(conf_dir="conf/data", datasets=None):
    """
    Scan per-dataset YAML config files and collect all unique domains and freqs.

    Args:
        conf_dir (str): Directory containing "{dataset}.yaml" metadata files.
        datasets (list | None): Dataset names to look up. If None, nothing is
            scanned and two empty lists are returned.

    Returns:
        Tuple[List[str], List[str]]: Sorted unique domains and sorted unique
        frequencies found across the given datasets.
    """
    domains, freqs = set(), set()

    # Bug fix: the default `datasets=None` used to crash with TypeError when
    # iterated; treat it as "no datasets".
    if datasets is None:
        datasets = []

    for ds in datasets:
        yaml_path = os.path.join(conf_dir, f"{ds}.yaml")
        if os.path.exists(yaml_path):
            with open(yaml_path, "r") as f:
                # safe_load returns None for an empty file; fall back to {}
                # so the .get() calls below are safe.
                meta = yaml.safe_load(f) or {}
            domain = meta.get("domain")
            freq = meta.get("freq")
            if domain:
                domains.add(domain)
            if freq:
                freqs.add(freq)
    return sorted(domains), sorted(freqs)
326
+
327
+
328
def get_dataset_choices(results_root="output/results") -> Tuple[List[str], dict, dict]:
    """
    Discover available datasets in the TIME results tree and build display names.

    A dataset that appears under a single frequency is shown as just
    "dataset" (e.g., "Australia_Solar"); one that appears under several
    frequencies is shown as "dataset/freq" (e.g., "Traffic/15T") so the UI
    entries stay unambiguous.

    Args:
        results_root: Root directory of the TIME results.

    Returns:
        Tuple of:
            - display_names: sorted display names for the UI dropdown
            - display_to_id: dict mapping display_name -> dataset_id
            - id_to_display: dict mapping dataset_id -> display_name
    """
    if not os.path.exists(results_root):
        return [], {}, {}

    horizons = ("short", "medium", "long")
    # Every discovered (dataset, freq) pair; freq == "" for the legacy
    # flat layout where horizon dirs sit directly under the dataset dir.
    pairs = set()

    def _has_config(base):
        # True when any horizon subdir of `base` holds a config.json.
        return any(
            os.path.exists(os.path.join(base, h, "config.json")) for h in horizons
        )

    for model_name in os.listdir(results_root):
        model_path = os.path.join(results_root, model_name)
        if not os.path.isdir(model_path):
            continue

        for ds_name in os.listdir(model_path):
            ds_path = os.path.join(model_path, ds_name)
            if not os.path.isdir(ds_path):
                continue

            if any(os.path.isdir(os.path.join(ds_path, h)) for h in horizons):
                # Legacy structure: model/dataset/horizon/ — record with
                # an empty freq (shouldn't occur in the new layout).
                if _has_config(ds_path):
                    pairs.add((ds_name, ""))
            else:
                # Nested structure: model/dataset/freq/horizon/
                for freq_name in os.listdir(ds_path):
                    freq_path = os.path.join(ds_path, freq_name)
                    if os.path.isdir(freq_path) and _has_config(freq_path):
                        pairs.add((ds_name, freq_name))

    if not pairs:
        return [], {}, {}

    from collections import Counter
    freq_counts = Counter(name for name, _ in pairs)

    id_to_display = {}
    display_to_id = {}
    for ds_name, freq_name in pairs:
        dataset_id = f"{ds_name}/{freq_name}" if freq_name else ds_name
        # Datasets with several freqs keep the freq suffix in the display.
        display_name = dataset_id if freq_counts[ds_name] > 1 else ds_name
        id_to_display[dataset_id] = display_name
        display_to_id[display_name] = dataset_id

    return sorted(display_to_id), display_to_id, id_to_display
415
+
416
+
417
+ def compute_ranks(df: pd.DataFrame, groupby_cols: str | List[str]) -> pd.DataFrame:
418
+ """
419
+ Compute ranks for models across datasets based on MASE and CRPS.
420
+
421
+ Args:
422
+ df (pd.DataFrame): Dataset-level results with columns
423
+ ["model", "dataset", "MASE", "CRPS"].
424
+
425
+ Returns:
426
+ pd.DataFrame: Dataframe with ["model", "MASE_rank", "CRPS_rank"].
427
+ """
428
+ if isinstance(groupby_cols, str):
429
+ groupby_cols = [groupby_cols]
430
+
431
+ if df.empty:
432
+ return pd.DataFrame(columns=["model", "MASE_rank", "CRPS_rank"])
433
+
434
+ df = df.copy()
435
+
436
+ df["MASE_rank"] = df.groupby(groupby_cols)["MASE"].rank(method="first", ascending=True)
437
+ df["CRPS_rank"] = df.groupby(groupby_cols)["CRPS"].rank(method="first", ascending=True)
438
+
439
+ return df
440
+
441
+
442
def normalize_by_seasonal_naive(
    df: pd.DataFrame,
    baseline_model: str = "seasonal_naive",
    metrics: List[str] = None,
    groupby_cols: List[str] = None,
) -> pd.DataFrame:
    """
    Normalize metric columns by the Seasonal Naive baseline within each group.

    For every (dataset_id, horizon) group (configurable via `groupby_cols`),
    each model's metric values are divided by the baseline model's values,
    making the baseline itself equal to 1.0.

    Args:
        df: Results with columns ["model"] + groupby columns + metric columns.
        baseline_model: Name of the baseline model. Defaults to "seasonal_naive".
        metrics: Metric columns to normalize. Defaults to ["MASE", "CRPS"].
        groupby_cols: Grouping columns. Defaults to ["dataset_id", "horizon"].

    Returns:
        Normalized copy of `df`. Rows whose group has no baseline entry are
        dropped; divisions by zero/NaN baselines (and any resulting inf)
        become NaN. If the baseline model is absent entirely, a copy of `df`
        is returned unchanged after printing a warning.
    """
    metrics = ["MASE", "CRPS"] if metrics is None else metrics
    groupby_cols = ["dataset_id", "horizon"] if groupby_cols is None else groupby_cols

    if df.empty:
        return df.copy()

    if baseline_model not in df["model"].values:
        print(f"[normalize_by_seasonal_naive] Warning: baseline model '{baseline_model}' not found in data")
        return df.copy()

    result = df.copy()

    # group key -> {metric: baseline value}. If the baseline appears more
    # than once in a group, the last occurrence wins (unchanged behavior).
    baseline_values = {
        tuple(row[col] for col in groupby_cols): {m: row[m] for m in metrics}
        for _, row in df[df["model"] == baseline_model].iterrows()
    }

    kept = []
    for idx, row in result.iterrows():
        key = tuple(row[col] for col in groupby_cols)
        if key not in baseline_values:
            # No baseline for this configuration: drop the row.
            continue
        kept.append(idx)

        for metric in metrics:
            denom = baseline_values[key][metric]
            if denom is None or denom == 0 or np.isnan(denom):
                # Undefined ratio (zero or missing baseline).
                result.at[idx, metric] = np.nan
            else:
                result.at[idx, metric] = row[metric] / denom

    result = result.loc[kept].copy()

    # Any remaining infinities are treated as missing values.
    for metric in metrics:
        result[metric] = result[metric].replace([np.inf, -np.inf], np.nan)

    return result
520
+
521
+
522
def load_features(root_dir: str = "features", category: str = "public-benchmarks", split: str = "test") -> pd.DataFrame:
    """
    Load time series features for all datasets (legacy function).

    Args:
        root_dir (str): Path to features root directory.
        category (str): Dataset category (e.g., "public-benchmarks").
        split (str): Which split to load ("full" or "test").

    Returns:
        pd.DataFrame: Concatenated DataFrame with a leading "dataset" column
        ("unique_id" is renamed to "variate_name" when present), or an empty
        DataFrame when the directory is missing or contains no split CSVs.
    """
    base_dir = os.path.join(root_dir, category)

    # Robustness fix (consistent with load_all_features): a missing directory
    # yields an empty frame instead of os.listdir raising FileNotFoundError.
    if not os.path.isdir(base_dir):
        return pd.DataFrame()

    all_data = []
    for dataset in os.listdir(base_dir):
        dataset_dir = os.path.join(base_dir, dataset)
        csv_path = os.path.join(dataset_dir, f"{split}.csv")
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            df["dataset"] = dataset  # add dataset name
            # Move the dataset column to the front.
            cols = ["dataset"] + [c for c in df.columns if c != "dataset"]
            df = df[cols]
            all_data.append(df)

    if all_data:
        df = pd.concat(all_data, ignore_index=True)
        if "unique_id" in df.columns:
            df = df.rename(columns={"unique_id": "variate_name"})
        return df
    return pd.DataFrame()
554
+
555
+
556
def load_all_features(features_root: str = "output/features", split: str = "test") -> pd.DataFrame:
    """
    Load time series features for every dataset under `features_root`.

    Expected layout: features_root/{dataset}/{freq}/{split}.csv, where each
    CSV carries columns such as dataset_id, series_name, variate_name plus
    the feature values. When "{split}.csv" is absent, "full.csv" is tried
    as a fallback.

    Args:
        features_root: Root directory of the per-dataset feature CSVs
            (e.g., "output/features").
        split: Which split to load ("full" or "test").

    Returns:
        pd.DataFrame: One concatenated DataFrame over all found CSVs, or an
        empty DataFrame when the root is missing or nothing loadable exists.
    """
    if not os.path.exists(features_root):
        print(f"[load_all_features] features_root={features_root} does not exist")
        return pd.DataFrame()

    frames = []
    for ds_name in os.listdir(features_root):
        ds_dir = os.path.join(features_root, ds_name)
        if not os.path.isdir(ds_dir):
            continue

        for freq_name in os.listdir(ds_dir):
            freq_dir = os.path.join(ds_dir, freq_name)
            if not os.path.isdir(freq_dir):
                continue

            # Prefer the requested split; fall back to full.csv.
            csv_path = os.path.join(freq_dir, f"{split}.csv")
            if not os.path.exists(csv_path):
                csv_path = os.path.join(freq_dir, "full.csv")
            if not os.path.exists(csv_path):
                continue

            try:
                frames.append(pd.read_csv(csv_path))
            except Exception as e:
                print(f"[load_all_features] Error loading {csv_path}: {e}")

    if not frames:
        print(f"[load_all_features] No features found in {features_root}")
        return pd.DataFrame()

    features_df = pd.concat(frames, ignore_index=True)
    print(f"[load_all_features] Loaded {len(features_df)} variate features from {len(frames)} datasets")
    return features_df
606
+
607
+
608
+
609
def binarize_features(df: pd.DataFrame, exclude: list) -> pd.DataFrame:
    """
    Binarize feature columns against their per-column median.

    Every column not listed in `exclude` is replaced by 1 where the value is
    strictly greater than that column's median, else 0. Excluded columns are
    carried over untouched; the input frame is not modified.

    Args:
        df: Input dataframe with raw feature values.
        exclude: Column names to leave as-is.

    Returns:
        A new dataframe with the selected feature columns binarized (0/1).
    """
    feature_cols = [c for c in df.columns if c not in exclude]

    out = df.copy()
    medians = df[feature_cols].median()

    for col in feature_cols:
        # Strict comparison: values equal to the median map to 0.
        out[col] = (df[col] > medians[col]).astype(int)

    return out