Spaces:
Sleeping
Sleeping
| """ | |
| Snapshot data loader for the ConTSG-Bench HF Space. | |
| Reads the 5 snapshot files (parquet + json) from a local directory or | |
| HF Dataset and provides them as pandas DataFrames / dicts. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional | |
| import pandas as pd | |
| logger = logging.getLogger(__name__) | |
class SnapshotData:
    """Container for all loaded snapshot data.

    On construction, three parquet tables and two JSON documents are read
    from ``snapshot_dir``. A missing file is logged as a warning and replaced
    by an empty value (empty ``DataFrame``, ``[]`` or ``{}``), so building the
    container does not fail on an incomplete snapshot directory.

    Attributes:
        snapshot_dir: Directory the snapshot files are read from.
        leaderboard_long: Contents of ``leaderboard_long.parquet``.
        leaderboard_wide: Contents of ``leaderboard_wide.parquet``.
        model_cards: Contents of ``model_cards.parquet``.
        metric_catalog: Parsed ``metric_catalog.json`` (list of metric dicts).
        version_manifest: Parsed ``version_manifest.json``.
        metric_lookup: ``metric_catalog`` entries keyed by ``"metric_name"``.
    """

    def __init__(self, snapshot_dir: str | Path):
        """Remember ``snapshot_dir`` and immediately load every snapshot file."""
        self.snapshot_dir = Path(snapshot_dir)
        self._load()

    def _load(self) -> None:
        """Load all snapshot files and build the derived lookups."""
        d = self.snapshot_dir

        # Parquet files
        self.leaderboard_long: pd.DataFrame = self._read_parquet(
            d / "leaderboard_long.parquet"
        )
        self.leaderboard_wide: pd.DataFrame = self._read_parquet(
            d / "leaderboard_wide.parquet"
        )
        self.model_cards: pd.DataFrame = self._read_parquet(d / "model_cards.parquet")

        # JSON files. The catalog is a list, so its "missing" fallback must be
        # a list too; the manifest keeps the dict fallback.
        self.metric_catalog: List[Dict[str, Any]] = self._read_json(
            d / "metric_catalog.json", default=[]
        )
        self.version_manifest: Dict[str, Any] = self._read_json(
            d / "version_manifest.json"
        )

        # Derived lookups
        self.metric_lookup: Dict[str, Dict[str, Any]] = {
            m["metric_name"]: m for m in self.metric_catalog
        }

        logger.info(
            "Loaded snapshot: %d long rows, %d wide rows, %d models, %d metrics",
            len(self.leaderboard_long),
            len(self.leaderboard_wide),
            len(self.model_cards),
            len(self.metric_catalog),
        )

    def _read_parquet(self, path: Path) -> pd.DataFrame:
        """Read a parquet file, returning an empty DataFrame if it is missing."""
        if path.exists():
            return pd.read_parquet(path)
        logger.warning("Parquet file not found: %s", path)
        return pd.DataFrame()

    def _read_json(self, path: Path, default: Any = None) -> Any:
        """Read a JSON file, returning an empty structure if it is missing.

        Args:
            path: Location of the JSON document.
            default: Value returned when ``path`` does not exist. When left
                as ``None`` a fresh empty dict is returned.

        Returns:
            The decoded JSON content, or the fallback value.
        """
        if path.exists():
            # JSON is UTF-8 by specification; do not depend on the locale's
            # preferred encoding.
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
        logger.warning("JSON file not found: %s", path)
        return {} if default is None else default

    def _unique_sorted(self, column: str) -> List[Any]:
        """Return the sorted distinct non-null values of a ``leaderboard_long`` column.

        An empty frame or an absent column yields ``[]``. Null entries are
        dropped before sorting because ``None``/NaN cannot be ordered against
        strings.
        """
        frame = self.leaderboard_long
        if frame.empty or column not in frame.columns:
            return []
        return sorted(frame[column].dropna().unique().tolist())

    def version(self) -> str:
        """Current snapshot version string, or ``"unknown"`` if not recorded."""
        return self.version_manifest.get("current_version", "unknown")

    def models(self) -> List[str]:
        """List of unique model names in the leaderboard."""
        return self._unique_sorted("model")

    def datasets(self) -> List[str]:
        """List of unique dataset names in the leaderboard."""
        return self._unique_sorted("dataset")

    def metric_groups(self) -> List[str]:
        """List of unique metric groups found in the metric catalog."""
        return sorted({m["metric_group"] for m in self.metric_catalog})

    def condition_modalities(self) -> List[str]:
        """List of unique condition modalities in the data."""
        return self._unique_sorted("condition_modality")

    def semantic_levels(self) -> List[str]:
        """List of unique semantic levels in the data."""
        return self._unique_sorted("semantic_level")