from __future__ import annotations
import json
import math
import numbers
import os
from typing import Iterable
import pandas as pd
from src.leaderboard.schema import SCHEMA
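# SCHEMA is consumed through exactly three attributes in this module; the real
# definitions live in src.leaderboard.schema. The shapes below are hypothetical,
# for illustration only:
#   SCHEMA.identity_fields   e.g. ("model_name", "organization")
#   SCHEMA.required_metrics  e.g. ("MIMIC_T1_acc",)
#   SCHEMA.optional_metrics  e.g. ("MIMIC_T2_sMAPE",)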
class ResultsValidationError(ValueError):
pass
def _is_number(value) -> bool:
    # bool is a subclass of int, so reject it explicitly. numbers.Real also
    # covers the NumPy scalars (np.int64, np.float64) that pandas produces
    # when loading CSVs; a bare isinstance(value, (int, float)) check would
    # wrongly reject np.int64.
    if isinstance(value, bool) or not isinstance(value, numbers.Real):
        return False
    return math.isfinite(float(value))
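# For example: _is_number(0.5) and _is_number(3) are True, while
# _is_number(True), _is_number("0.5"), _is_number(float("nan")), and
# _is_number(float("inf")) are all False.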
def _is_missing(value) -> bool:
if value is None:
return True
if isinstance(value, float) and math.isnan(value):
return True
return False
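# e.g. _is_missing(None) and _is_missing(float("nan")) are True (pandas yields
# NaN for blank CSV cells), while _is_missing(0.0) is False.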
def _load_json_records(path: str) -> list[dict]:
with open(path, "r") as fp:
data = json.load(fp)
if isinstance(data, list):
return data
if isinstance(data, dict) and "records" in data and isinstance(data["records"], list):
return data["records"]
raise ResultsValidationError(
"JSON must be a list of records or an object with a 'records' list."
)
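# Both accepted JSON shapes, shown with hypothetical field names:
#   [{"model_name": "m1", "MIMIC_T1_acc": 0.62}, ...]
#   {"records": [{"model_name": "m1", "MIMIC_T1_acc": 0.62}, ...]}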
def _load_csv_records(path: str) -> list[dict]:
df = pd.read_csv(path)
return df.to_dict(orient="records")
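# Note: pandas parses numeric CSV columns into NumPy scalars (np.int64,
# np.float64) and blank cells into NaN; _is_number and _is_missing above are
# written to accept both.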
def load_records(path: str) -> list[dict]:
if not os.path.exists(path):
raise ResultsValidationError(f"Results file not found: {path}")
_, ext = os.path.splitext(path)
ext = ext.lower()
if ext == ".json":
return _load_json_records(path)
if ext == ".csv":
return _load_csv_records(path)
raise ResultsValidationError("Unsupported file type. Use .json or .csv")
def validate_records(records: Iterable[dict]) -> None:
records = list(records)
if not records:
raise ResultsValidationError("Results file is empty.")
for idx, record in enumerate(records):
if not isinstance(record, dict):
raise ResultsValidationError(f"Record {idx} is not an object.")
missing = [f for f in SCHEMA.identity_fields if f not in record]
if missing:
raise ResultsValidationError(f"Record {idx} is missing fields: {missing}")
for field in SCHEMA.identity_fields:
if not isinstance(record[field], str):
raise ResultsValidationError(
f"Record {idx} field '{field}' must be a string."
)
missing_metrics = [m for m in SCHEMA.required_metrics if m not in record]
if missing_metrics:
raise ResultsValidationError(
f"Record {idx} is missing required metrics: {missing_metrics}"
)
for key, value in record.items():
if key in SCHEMA.identity_fields:
continue
if _is_missing(value):
continue
if not _is_number(value):
raise ResultsValidationError(
f"Record {idx} metric '{key}' must be numeric."
)
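# A record passes when every identity field is a string and every other key is
# either missing (None/NaN) or a finite number. Example, assuming
# identity_fields == ("model_name",) and required_metrics == ("MIMIC_T1_acc",)
# (both hypothetical):
#   {"model_name": "m1", "MIMIC_T1_acc": 0.62}   passes
#   {"model_name": "m1", "MIMIC_T1_acc": "62%"}  raises ResultsValidationError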
def infer_metric_columns(records: Iterable[dict]) -> list[str]:
records = list(records)
if not records:
return []
all_keys = set()
for record in records:
all_keys.update(record.keys())
metric_keys = [k for k in all_keys if k not in SCHEMA.identity_fields]
ordered = []
for key in SCHEMA.required_metrics:
if key in metric_keys:
ordered.append(key)
for key in SCHEMA.optional_metrics:
if key in metric_keys:
ordered.append(key)
remaining = sorted([k for k in metric_keys if k not in ordered])
ordered.extend(remaining)
return ordered
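# Column order is deterministic: required metrics first (in schema order), then
# optional metrics, then any extra metric keys sorted alphabetically. e.g. with
# required=("acc_a",), optional=("lat_b",) and an extra key "zz_c" (all
# hypothetical names), the result is ["acc_a", "lat_b", "zz_c"].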
def build_dataframe(records: list[dict]) -> tuple[pd.DataFrame, list[str]]:
validate_records(records)
metric_cols = infer_metric_columns(records)
column_order = list(SCHEMA.identity_fields) + metric_cols
df = pd.DataFrame.from_records(records)
df = apply_overall_metrics(df)
# Include computed columns (e.g., overall_mcq_acc) in display order if present.
for col in df.columns:
if col in SCHEMA.identity_fields:
continue
if col not in column_order:
column_order.append(col)
df = df[column_order]
return df, column_order
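# End-to-end sketch (paths are hypothetical):
#   records = load_records("results/leaderboard.csv")
#   df, columns = build_dataframe(records)
#   df.to_csv("leaderboard_out.csv", index=False)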
# Question counts per dataset and task. These are used as weights so that the
# task-level and overall accuracies are micro-averages over questions rather
# than unweighted means of per-dataset scores.
MCQ_QUESTIONS = {
    "MIMIC": {"T1": 188, "T2": 141, "T3": 239, "T4": 141},
    "PSML": {"T1": 200, "T2": 150, "T3": 250, "T4": 150},
    "CausalChambers": {"T1": 150, "T2": 150, "T3": 250, "T4": 150},
    "FreshRetailNet": {"T1": 176, "T2": 132, "T3": 176, "T4": 132},
}
# Forecast series counts per dataset for tasks T2 and T4. MIMIC is listed for
# completeness but is skipped in the sMAPE/MAE aggregation below.
FORECAST_SERIES = {
    "MIMIC": {"T2": 282, "T4": 282},
    "PSML": {"T2": 50, "T4": 50},
    "CausalChambers": {"T2": 50, "T4": 50},
    "FreshRetailNet": {"T2": 44, "T4": 44},
}
def _weighted_avg(row: pd.Series, columns: list[str], weights: list[int]) -> float | None:
total = 0.0
total_w = 0.0
for col, w in zip(columns, weights):
val = row.get(col)
if _is_missing(val):
continue
total += float(val) * w
total_w += w
if total_w == 0:
return None
return round(total / total_w, 4)
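# Worked example: a row with {"a": 0.9, "b": 0.6} and weights [200, 100] yields
# (0.9*200 + 0.6*100) / 300 = 0.8. Missing values drop both the value and its
# weight; if every value is missing the result is None.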
def apply_overall_metrics(df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
for task in ["T1", "T2", "T3", "T4"]:
cols = []
weights = []
for dataset, task_weights in MCQ_QUESTIONS.items():
col = f"{dataset}_{task}_acc"
if col in df.columns:
cols.append(col)
weights.append(task_weights[task])
if cols:
df[f"{task}_acc"] = df.apply(lambda r: _weighted_avg(r, cols, weights), axis=1)
overall_cols = []
overall_weights = []
for dataset, task_weights in MCQ_QUESTIONS.items():
for task, weight in task_weights.items():
col = f"{dataset}_{task}_acc"
if col in df.columns:
overall_cols.append(col)
overall_weights.append(weight)
if overall_cols:
df["overall_mcq_acc"] = df.apply(
lambda r: _weighted_avg(r, overall_cols, overall_weights), axis=1
)
for task in ["T2", "T4"]:
# sMAPE/MAE are defined for non-MIMIC datasets
for metric in ["sMAPE", "MAE"]:
cols = []
weights = []
for dataset, task_weights in FORECAST_SERIES.items():
if dataset == "MIMIC":
continue
col = f"{dataset}_{task}_{metric}"
if col in df.columns:
cols.append(col)
weights.append(task_weights[task])
if cols:
df[f"{task}_{metric}"] = df.apply(lambda r: _weighted_avg(r, cols, weights), axis=1)
return df
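if __name__ == "__main__":
    # Minimal smoke test, not part of the pipeline: build one synthetic record
    # straight from SCHEMA so the demo stays valid whatever the real fields
    # are. The "demo" strings and 0.5 metric values are placeholders.
    demo_record = {field: "demo" for field in SCHEMA.identity_fields}
    demo_record.update({metric: 0.5 for metric in SCHEMA.required_metrics})
    df, columns = build_dataframe([demo_record])
    print(df[columns].to_string(index=False))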