File size: 2,663 Bytes
66b1c50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""Canonical metadata for real-world benchmark datasets."""

from __future__ import annotations

from pydantic import BaseModel, Field


class HeaderMismatch(BaseModel):
    """Dirty and clean header names for columns that line up by position.

    Attributes:
        dirty_name: Non-empty header name on the dirty side of the pair.
        clean_name: Non-empty header name on the clean side of the pair.
    """

    # Frozen model: fields cannot be reassigned after construction.
    model_config = {"frozen": True}

    # Both names are required; empty strings are rejected by validation.
    dirty_name: str = Field(min_length=1)
    clean_name: str = Field(min_length=1)


class DatasetMetadata(BaseModel):
    """Descriptive record for one canonical benchmark dataset.

    Attributes:
        name: Non-empty dataset name.
        domain: Non-empty domain label for the dataset.
        n_rows: Number of rows; must be zero or greater.
        n_columns: Number of columns; must be at least one.
        error_types: Error-type labels; defaults to an empty tuple.
        source_urls: Exactly two URL strings.
        citation: Non-empty citation text.
        header_mismatches: ``HeaderMismatch`` pairs; defaults to an empty tuple.
    """

    # Frozen model: fields cannot be reassigned after construction.
    model_config = {"frozen": True}

    # Identity of the dataset.
    name: str = Field(min_length=1)
    domain: str = Field(min_length=1)

    # Table dimensions.
    n_rows: int = Field(ge=0)
    n_columns: int = Field(ge=1)

    # Optional labels; an omitted value becomes an empty tuple.
    error_types: tuple[str, ...] = Field(default_factory=tuple)

    # Provenance.
    source_urls: tuple[str, str]
    citation: str = Field(min_length=1)

    # Optional header pairs; an omitted value becomes an empty tuple.
    header_mismatches: tuple[HeaderMismatch, ...] = Field(default_factory=tuple)


# Raw-content root of the ``datasets`` directory on the master branch of the
# BigDaMa/raha GitHub repository; every dataset below lives in a sub-directory
# named after it.
_BASE_URL = "https://raw.githubusercontent.com/BigDaMa/raha/refs/heads/master/datasets"

# Registry of the benchmark datasets known to this module, keyed by each
# entry's ``name`` and kept in declaration order. Every entry lists its
# ``dirty.csv`` URL first and its ``clean.csv`` URL second.
DATASET_REGISTRY: dict[str, DatasetMetadata] = {
    entry.name: entry
    for entry in (
        DatasetMetadata(
            name="hospital",
            domain="healthcare",
            n_rows=1000,
            n_columns=20,
            error_types=("typo", "missing_value", "formatting"),
            source_urls=(
                f"{_BASE_URL}/hospital/dirty.csv",
                f"{_BASE_URL}/hospital/clean.csv",
            ),
            citation=(
                "Mahdavi et al. Raha benchmark dataset (Hospital) via the BigDaMa/raha repository."
            ),
        ),
        DatasetMetadata(
            name="flights",
            domain="aviation",
            n_rows=2376,
            n_columns=7,
            error_types=("missing_value", "formatting", "datetime"),
            source_urls=(
                f"{_BASE_URL}/flights/dirty.csv",
                f"{_BASE_URL}/flights/clean.csv",
            ),
            citation=(
                "Mahdavi et al. Raha benchmark dataset (Flights) via the BigDaMa/raha repository."
            ),
        ),
        DatasetMetadata(
            name="beers",
            domain="consumer",
            n_rows=2410,
            n_columns=11,
            error_types=("formatting", "missing_value", "normalization"),
            source_urls=(
                f"{_BASE_URL}/beers/dirty.csv",
                f"{_BASE_URL}/beers/clean.csv",
            ),
            citation=(
                "Mahdavi et al. Raha benchmark dataset (Beers) via the BigDaMa/raha repository."
            ),
        ),
    )
}


def get_dataset_metadata(name: str) -> DatasetMetadata:
    """Look up the registry entry for a named benchmark dataset.

    Args:
        name: Key of the dataset in ``DATASET_REGISTRY``.

    Returns:
        The metadata object stored under ``name``.

    Raises:
        KeyError: If ``name`` is not a key of ``DATASET_REGISTRY``.
    """
    # Plain subscript lookup (not ``.get``) so an unknown name raises KeyError.
    metadata = DATASET_REGISTRY[name]
    return metadata