Spaces:
Sleeping
Sleeping
File size: 3,647 Bytes
d4c6d58 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 | """
Snapshot data loader for the ConTSG-Bench HF Space.
Reads the 5 snapshot files (parquet + json) from a local directory or
HF Dataset and provides them as pandas DataFrames / dicts.
"""
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional
import pandas as pd
logger = logging.getLogger(__name__)
class SnapshotData:
"""Container for all loaded snapshot data."""
def __init__(self, snapshot_dir: str | Path):
self.snapshot_dir = Path(snapshot_dir)
self._load()
def _load(self) -> None:
"""Load all snapshot files."""
d = self.snapshot_dir
# Parquet files
self.leaderboard_long: pd.DataFrame = self._read_parquet(
d / "leaderboard_long.parquet"
)
self.leaderboard_wide: pd.DataFrame = self._read_parquet(
d / "leaderboard_wide.parquet"
)
self.model_cards: pd.DataFrame = self._read_parquet(d / "model_cards.parquet")
# JSON files
self.metric_catalog: List[Dict[str, Any]] = self._read_json(
d / "metric_catalog.json"
)
self.version_manifest: Dict[str, Any] = self._read_json(
d / "version_manifest.json"
)
# Derived lookups
self.metric_lookup: Dict[str, Dict[str, Any]] = {
m["metric_name"]: m for m in self.metric_catalog
}
logger.info(
"Loaded snapshot: %d long rows, %d wide rows, %d models, %d metrics",
len(self.leaderboard_long),
len(self.leaderboard_wide),
len(self.model_cards),
len(self.metric_catalog),
)
def _read_parquet(self, path: Path) -> pd.DataFrame:
"""Read a parquet file, returning empty DataFrame if missing."""
if path.exists():
return pd.read_parquet(path)
logger.warning("Parquet file not found: %s", path)
return pd.DataFrame()
def _read_json(self, path: Path) -> Any:
"""Read a JSON file, returning empty structure if missing."""
if path.exists():
with open(path, "r") as f:
return json.load(f)
logger.warning("JSON file not found: %s", path)
return {}
@property
def version(self) -> str:
"""Current snapshot version string."""
return self.version_manifest.get("current_version", "unknown")
@property
def models(self) -> List[str]:
"""List of unique model names in the leaderboard."""
if self.leaderboard_long.empty:
return []
return sorted(self.leaderboard_long["model"].unique().tolist())
@property
def datasets(self) -> List[str]:
"""List of unique dataset names in the leaderboard."""
if self.leaderboard_long.empty:
return []
return sorted(self.leaderboard_long["dataset"].unique().tolist())
@property
def metric_groups(self) -> List[str]:
"""List of unique metric groups."""
return sorted(set(m["metric_group"] for m in self.metric_catalog))
@property
def condition_modalities(self) -> List[str]:
"""List of unique condition modalities in the data."""
if self.leaderboard_long.empty:
return []
return sorted(self.leaderboard_long["condition_modality"].unique().tolist())
@property
def semantic_levels(self) -> List[str]:
"""List of unique semantic levels in the data."""
if self.leaderboard_long.empty:
return []
return sorted(self.leaderboard_long["semantic_level"].unique().tolist())
|