| """Canonical metadata for real-world benchmark datasets.""" |
|
|
| from __future__ import annotations |
|
|
| from pydantic import BaseModel, Field |
|
|
|
|
class HeaderMismatch(BaseModel):
    """One column whose header is spelled differently in the dirty and clean files.

    The two names are meant to refer to the same column, matched by position
    rather than by label.
    """

    # Frozen: attributes cannot be reassigned once the model is built.
    model_config = {"frozen": True}

    # Header as written in the dirty file; must be non-empty.
    dirty_name: str = Field(min_length=1)
    # Header as written in the clean file; must be non-empty.
    clean_name: str = Field(min_length=1)
|
|
|
|
class DatasetMetadata(BaseModel):
    """Immutable descriptor of one canonical benchmark dataset.

    Validation is declared per field: ``name``, ``domain`` and ``citation``
    must be non-empty strings, ``n_rows`` must be >= 0 and ``n_columns`` must
    be >= 1. ``error_types`` and ``header_mismatches`` default to an empty
    tuple; every other field is required.
    """

    # Frozen: attributes cannot be reassigned once the model is built.
    model_config = {"frozen": True}

    # Identity and size of the dataset.
    name: str = Field(min_length=1)
    domain: str = Field(min_length=1)
    n_rows: int = Field(ge=0)
    n_columns: int = Field(ge=1)

    # Free-form labels for the kinds of errors present in the data.
    error_types: tuple[str, ...] = Field(default_factory=tuple)

    # Exactly two URLs. The registry entries in this module list the dirty
    # CSV first and the clean CSV second.
    source_urls: tuple[str, str]
    citation: str = Field(min_length=1)

    # Columns whose header text differs between the dirty and clean files.
    header_mismatches: tuple[HeaderMismatch, ...] = Field(default_factory=tuple)
|
|
|
|
# Root of the per-dataset folders in the BigDaMa/raha GitHub repository.
_BASE_URL = "https://raw.githubusercontent.com/BigDaMa/raha/refs/heads/master/datasets"


def _raha_source_urls(dataset: str) -> tuple[str, str]:
    """Return the ``dirty.csv`` and ``clean.csv`` URLs, in that order, for one dataset folder."""
    folder = f"{_BASE_URL}/{dataset}"
    return (f"{folder}/dirty.csv", f"{folder}/clean.csv")


def _raha_citation(label: str) -> str:
    """Return the shared citation sentence with ``label`` as the dataset title."""
    return f"Mahdavi et al. Raha benchmark dataset ({label}) via the BigDaMa/raha repository."


# Registry keyed by each entry's own ``name``; insertion order is
# hospital, flights, beers.
DATASET_REGISTRY: dict[str, DatasetMetadata] = {
    entry.name: entry
    for entry in (
        DatasetMetadata(
            name="hospital",
            domain="healthcare",
            n_rows=1000,
            n_columns=20,
            error_types=("typo", "missing_value", "formatting"),
            source_urls=_raha_source_urls("hospital"),
            citation=_raha_citation("Hospital"),
        ),
        DatasetMetadata(
            name="flights",
            domain="aviation",
            n_rows=2376,
            n_columns=7,
            error_types=("missing_value", "formatting", "datetime"),
            source_urls=_raha_source_urls("flights"),
            citation=_raha_citation("Flights"),
        ),
        DatasetMetadata(
            name="beers",
            domain="consumer",
            n_rows=2410,
            n_columns=11,
            error_types=("formatting", "missing_value", "normalization"),
            source_urls=_raha_source_urls("beers"),
            citation=_raha_citation("Beers"),
        ),
    )
}
|
|
|
|
def get_dataset_metadata(name: str) -> DatasetMetadata:
    """Look up the registry entry for a benchmark dataset.

    Args:
        name: Key of the dataset in ``DATASET_REGISTRY``.

    Returns:
        The ``DatasetMetadata`` stored under ``name``.

    Raises:
        KeyError: If ``name`` is not a key of ``DATASET_REGISTRY``.
    """
    # Plain subscript on purpose: an unknown name surfaces as KeyError.
    metadata = DATASET_REGISTRY[name]
    return metadata
|
|