File size: 3,559 Bytes
5143557 791c076 5143557 791c076 5143557 791c076 5143557 791c076 5143557 791c076 5143557 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | """Canonical metadata for real-world benchmark datasets."""
from __future__ import annotations
from pydantic import BaseModel, Field
class HeaderMismatch(BaseModel):
    """A dirty/clean header-name pair for a single column.

    The two names are matched by column position rather than by text, so
    they are allowed to differ from each other.
    """

    # Frozen: attribute assignment after construction is rejected.
    model_config = {"frozen": True}

    # Header as written in the dirty file; required and non-empty.
    dirty_name: str = Field(..., min_length=1)
    # Header as written in the clean file; required and non-empty.
    clean_name: str = Field(..., min_length=1)
class DatasetMetadata(BaseModel):
    """Metadata describing a canonical benchmark dataset.

    Attributes:
        name: Non-empty dataset identifier.
        domain: Non-empty label for the dataset's subject area.
        n_rows: Row count; must be >= 0.
        n_columns: Column count; must be >= 1.
        error_types: Labels of the error categories present in the data;
            defaults to an empty tuple.
        source_urls: Exactly two URLs. The registry in this module stores
            them as ``(dirty, clean)``.
        source_revision: Revision identifier of the upstream source, at
            least 7 characters long.
        dirty_sha256: SHA-256 hex digest (64 hexadecimal characters) for
            the dirty file.
        clean_sha256: SHA-256 hex digest (64 hexadecimal characters) for
            the clean file.
        citation: Non-empty citation text.
        header_mismatches: Dirty/clean header pairs that differ by name;
            defaults to an empty tuple.
    """

    name: str = Field(min_length=1)
    domain: str = Field(min_length=1)
    n_rows: int = Field(ge=0)
    n_columns: int = Field(ge=1)
    error_types: tuple[str, ...] = Field(default_factory=tuple)
    source_urls: tuple[str, str]
    source_revision: str = Field(min_length=7)
    # A length check alone would accept any 64-character string; the pattern
    # additionally requires hexadecimal digits so a malformed digest is
    # rejected at construction time instead of at checksum comparison.
    dirty_sha256: str = Field(
        min_length=64,
        max_length=64,
        pattern=r"^[0-9a-fA-F]{64}$",
    )
    clean_sha256: str = Field(
        min_length=64,
        max_length=64,
        pattern=r"^[0-9a-fA-F]{64}$",
    )
    citation: str = Field(min_length=1)
    header_mismatches: tuple[HeaderMismatch, ...] = Field(default_factory=tuple)

    # Frozen: attribute assignment after construction is rejected.
    model_config = {"frozen": True}
# Commit of the BigDaMa/raha repository that every registered URL points at.
RAHA_GIT_REVISION = "7be1334b8c7bbdac3f47ef514fb3e1e8c5fc181c"
_BASE_URL = (
    "https://raw.githubusercontent.com/BigDaMa/raha/"
    + RAHA_GIT_REVISION
    + "/datasets"
)


def _raha_entry(
    *,
    name: str,
    domain: str,
    n_rows: int,
    n_columns: int,
    error_types: tuple[str, ...],
    dirty_sha256: str,
    clean_sha256: str,
) -> DatasetMetadata:
    """Build the metadata entry for one dataset hosted in the Raha repository.

    The download URLs and the citation follow the same template for every
    dataset, so they are derived from ``name`` instead of being repeated.
    """
    dataset_root = f"{_BASE_URL}/{name}"
    label = name.capitalize()
    return DatasetMetadata(
        name=name,
        domain=domain,
        n_rows=n_rows,
        n_columns=n_columns,
        error_types=error_types,
        source_urls=(f"{dataset_root}/dirty.csv", f"{dataset_root}/clean.csv"),
        source_revision=RAHA_GIT_REVISION,
        dirty_sha256=dirty_sha256,
        clean_sha256=clean_sha256,
        citation=(
            f"Mahdavi et al. Raha benchmark dataset ({label}) "
            "via the BigDaMa/raha repository."
        ),
    )


# Keyed by dataset name; insertion order is hospital, flights, beers.
DATASET_REGISTRY: dict[str, DatasetMetadata] = {
    entry.name: entry
    for entry in (
        _raha_entry(
            name="hospital",
            domain="healthcare",
            n_rows=1000,
            n_columns=20,
            error_types=("typo", "missing_value", "formatting"),
            dirty_sha256="dbc5575b915fe8b5e0ac6dc6172f38ba91e611fdb76d09a8f4a81cb7ea9925ac",
            clean_sha256="ea3ee44998455c0b491750c348509de176c758a3bbf58e4530c0a136bb248b4b",
        ),
        _raha_entry(
            name="flights",
            domain="aviation",
            n_rows=2376,
            n_columns=7,
            error_types=("missing_value", "formatting", "datetime"),
            dirty_sha256="1b5c1afa10aa0e7c20fd7e14d05c56772715b2771aa0f5fa67ed1709e1eecd46",
            clean_sha256="0acfcfd8985b06fdd363965c9e8d9522c43e7589a93d79ae7dc311e1c37fdf3b",
        ),
        _raha_entry(
            name="beers",
            domain="consumer",
            n_rows=2410,
            n_columns=11,
            error_types=("formatting", "missing_value", "normalization"),
            dirty_sha256="7110bf4931a9445a1675e544d6c996817c739136239f8a2b02e088c7ec0a1f68",
            clean_sha256="373227df59ad197e154dd5149125789e415019535c7223355e9486ee1b3b93de",
        ),
    )
}
def get_dataset_metadata(name: str) -> DatasetMetadata:
    """Return canonical metadata for a named benchmark dataset.

    Args:
        name: Canonical dataset name, i.e. a key of ``DATASET_REGISTRY``.

    Returns:
        The metadata entry registered under ``name``.

    Raises:
        KeyError: If the dataset is not registered. The message names the
            requested dataset and lists the registered ones.
    """
    try:
        return DATASET_REGISTRY[name]
    except KeyError:
        # A bare KeyError only echoes the bad key; list the valid names so
        # a typo can be diagnosed from the error alone.
        registered = ", ".join(sorted(DATASET_REGISTRY))
        raise KeyError(
            f"Unknown dataset {name!r}; registered datasets: {registered}"
        ) from None
|