File size: 3,647 Bytes
d4c6d58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Snapshot data loader for the ConTSG-Bench HF Space.

Reads the 5 snapshot files (parquet + json) from a local directory or
HF Dataset and provides them as pandas DataFrames / dicts.
"""

from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd

# Module-level logger; SnapshotData uses it to report load statistics and
# to warn about snapshot files that are missing on disk.
logger = logging.getLogger(__name__)


class SnapshotData:
    """Container for all loaded snapshot data.

    On construction the snapshot directory is read eagerly and the following
    attributes are populated:

    * ``leaderboard_long``, ``leaderboard_wide``, ``model_cards`` -- pandas
      DataFrames read from the corresponding ``*.parquet`` files (an empty
      DataFrame when a file is missing).
    * ``metric_catalog`` -- list of metric description dicts read from
      ``metric_catalog.json`` (an empty list when the file is missing).
    * ``version_manifest`` -- dict read from ``version_manifest.json`` (an
      empty dict when the file is missing).
    * ``metric_lookup`` -- ``metric_catalog`` entries keyed by their
      ``"metric_name"`` value.

    Missing files are logged as warnings rather than raised, so the object
    is always constructible.
    """

    def __init__(self, snapshot_dir: str | Path):
        """Load the snapshot stored in *snapshot_dir*.

        Args:
            snapshot_dir: Directory containing the snapshot parquet/JSON files.
        """
        self.snapshot_dir = Path(snapshot_dir)
        self._load()

    def _load(self) -> None:
        """Load all snapshot files and build the derived lookups."""
        d = self.snapshot_dir

        # Parquet files
        self.leaderboard_long: pd.DataFrame = self._read_parquet(
            d / "leaderboard_long.parquet"
        )
        self.leaderboard_wide: pd.DataFrame = self._read_parquet(
            d / "leaderboard_wide.parquet"
        )
        self.model_cards: pd.DataFrame = self._read_parquet(d / "model_cards.parquet")

        # JSON files. The catalog is a list, so its "missing" fallback must be
        # a list too (the generic fallback of _read_json is a dict).
        self.metric_catalog: List[Dict[str, Any]] = self._read_json(
            d / "metric_catalog.json", default=[]
        )
        self.version_manifest: Dict[str, Any] = self._read_json(
            d / "version_manifest.json"
        )

        # Derived lookups
        self.metric_lookup: Dict[str, Dict[str, Any]] = {
            m["metric_name"]: m for m in self.metric_catalog
        }

        logger.info(
            "Loaded snapshot: %d long rows, %d wide rows, %d models, %d metrics",
            len(self.leaderboard_long),
            len(self.leaderboard_wide),
            len(self.model_cards),
            len(self.metric_catalog),
        )

    def _read_parquet(self, path: Path) -> pd.DataFrame:
        """Read a parquet file, returning an empty DataFrame if it is missing.

        Args:
            path: Location of the parquet file.

        Returns:
            The file contents, or an empty DataFrame (after logging a
            warning) when *path* does not exist.
        """
        if path.exists():
            return pd.read_parquet(path)
        logger.warning("Parquet file not found: %s", path)
        return pd.DataFrame()

    def _read_json(self, path: Path, default: Any = None) -> Any:
        """Read a JSON file, returning a fallback value if it is missing.

        Args:
            path: Location of the JSON file.
            default: Value returned when *path* does not exist. When left as
                ``None`` a new empty dict is returned.

        Returns:
            The decoded JSON document, or the fallback value (after logging
            a warning) when *path* does not exist.
        """
        if path.exists():
            # JSON is UTF-8 by specification; do not depend on the locale's
            # preferred encoding.
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
        logger.warning("JSON file not found: %s", path)
        return {} if default is None else default

    def _unique_sorted(self, column: str) -> List[str]:
        """Return the sorted distinct non-null values of a leaderboard column.

        Args:
            column: Column name in ``leaderboard_long``.

        Returns:
            Sorted unique values; an empty list when the frame is empty or
            does not contain *column*.
        """
        frame = self.leaderboard_long
        if frame.empty or column not in frame.columns:
            return []
        # Null entries are dropped: they are not valid labels and sorting a
        # mix of NaN and strings raises TypeError.
        return sorted(frame[column].dropna().unique().tolist())

    @property
    def version(self) -> str:
        """Current snapshot version string (``"unknown"`` if not recorded)."""
        return self.version_manifest.get("current_version", "unknown")

    @property
    def models(self) -> List[str]:
        """List of unique model names in the leaderboard."""
        return self._unique_sorted("model")

    @property
    def datasets(self) -> List[str]:
        """List of unique dataset names in the leaderboard."""
        return self._unique_sorted("dataset")

    @property
    def metric_groups(self) -> List[str]:
        """List of unique metric groups declared in the metric catalog."""
        return sorted({m["metric_group"] for m in self.metric_catalog})

    @property
    def condition_modalities(self) -> List[str]:
        """List of unique condition modalities in the data."""
        return self._unique_sorted("condition_modality")

    @property
    def semantic_levels(self) -> List[str]:
        """List of unique semantic levels in the data."""
        return self._unique_sorted("semantic_level")