"""Canonical metadata for real-world benchmark datasets.""" from __future__ import annotations from pydantic import BaseModel, Field class HeaderMismatch(BaseModel): """Pair of dirty/clean header names that align by column position.""" dirty_name: str = Field(min_length=1) clean_name: str = Field(min_length=1) model_config = {"frozen": True} class DatasetMetadata(BaseModel): """Metadata describing a canonical benchmark dataset.""" name: str = Field(min_length=1) domain: str = Field(min_length=1) n_rows: int = Field(ge=0) n_columns: int = Field(ge=1) error_types: tuple[str, ...] = Field(default_factory=tuple) source_urls: tuple[str, str] citation: str = Field(min_length=1) header_mismatches: tuple[HeaderMismatch, ...] = Field(default_factory=tuple) model_config = {"frozen": True} _BASE_URL = "https://raw.githubusercontent.com/BigDaMa/raha/refs/heads/master/datasets" DATASET_REGISTRY: dict[str, DatasetMetadata] = { "hospital": DatasetMetadata( name="hospital", domain="healthcare", n_rows=1000, n_columns=20, error_types=("typo", "missing_value", "formatting"), source_urls=( f"{_BASE_URL}/hospital/dirty.csv", f"{_BASE_URL}/hospital/clean.csv", ), citation=( "Mahdavi et al. Raha benchmark dataset (Hospital) via the BigDaMa/raha repository." ), ), "flights": DatasetMetadata( name="flights", domain="aviation", n_rows=2376, n_columns=7, error_types=("missing_value", "formatting", "datetime"), source_urls=( f"{_BASE_URL}/flights/dirty.csv", f"{_BASE_URL}/flights/clean.csv", ), citation=( "Mahdavi et al. Raha benchmark dataset (Flights) via the BigDaMa/raha repository." ), ), "beers": DatasetMetadata( name="beers", domain="consumer", n_rows=2410, n_columns=11, error_types=("formatting", "missing_value", "normalization"), source_urls=( f"{_BASE_URL}/beers/dirty.csv", f"{_BASE_URL}/beers/clean.csv", ), citation=("Mahdavi et al. Raha benchmark dataset (Beers) via the BigDaMa/raha repository."), ), } def get_dataset_metadata(name: str) -> DatasetMetadata: """Return canonical metadata for a named benchmark dataset. Args: name: Canonical dataset name. Returns: The immutable metadata entry for the dataset. Raises: KeyError: If the dataset is not registered. """ return DATASET_REGISTRY[name]