"""Seeded, self-verifying error injection — turns any CLEAN table into dirty/clean validation. This is the de-biasing core of the north-star: our 20+ harvested clean domains become per-cell-ground-truth validation across error types, far beyond any one published benchmark. Self-contained (no nlpaug/BART deps): we inject a KNOWN corruption into a clean cell, so the (dirty -> clean) ground truth is exact and the run is reproducible (fixed seed). Injects RECOVERABLE error types (the cleaner can restore the clean value): typo, ocr, case, whitespace — i.e. the canonicalization + format axes. Targets CATEGORICAL text columns (recurring values), where canonicalization is the task. """ from __future__ import annotations import random import string _OCR = {"O": "0", "o": "0", "l": "1", "I": "1", "S": "5", "s": "5", "B": "8", "Z": "2", "z": "2", "g": "9", "G": "6", "b": "6"} def _typo(s: str, rng: random.Random) -> str: if len(s) < 4: return s i = rng.randrange(1, len(s) - 1) if not s[i].isalpha(): return s m = rng.random() if m < 0.55: # substitute (the classic 'birminghxm') pool = string.ascii_uppercase if s[i].isupper() else string.ascii_lowercase return s[:i] + rng.choice([c for c in pool if c != s[i].lower()]) + s[i + 1:] if m < 0.78: # delete return s[:i] + s[i + 1:] return s[:i] + s[i + 1] + s[i] + s[i + 2:] # transpose def _ocr(s: str, rng: random.Random) -> str: idxs = [i for i, c in enumerate(s) if c in _OCR] if not idxs: return _typo(s, rng) i = rng.choice(idxs) return s[:i] + _OCR[s[i]] + s[i + 1:] def _case(s: str, rng: random.Random) -> str: return rng.choice([s.upper(), s.lower(), s.title()]) def _ws(s: str, rng: random.Random) -> str: return rng.choice([" " * rng.randint(1, 2) + s, s + " " * rng.randint(1, 2), s.replace(" ", " ", 1) if " " in s else " " + s]) INJECTORS = {"typo": _typo, "ocr": _ocr, "case": _case, "whitespace": _ws} def _categorical_text_cols(df, max_cols: int = 12) -> list[str]: """Text columns whose values RECUR (canonicalization is meaningful).""" out = [] for c in df.columns: vals = [str(v).strip() for v in df[c].tolist() if str(v).strip()] if len(vals) < 20: continue alpha = sum(1 for v in vals if any(ch.isalpha() for ch in v)) / len(vals) nonnum = 0 for v in vals: try: float(v.replace(",", "")) except ValueError: nonnum += 1 if alpha < 0.7 or nonnum / len(vals) < 0.7: continue if len(set(vals)) / len(vals) > 0.5: # must recur (categorical) continue out.append(c) if len(out) >= max_cols: break return out def inject(clean_df, error_type: str, seed: int, rate: float = 0.07): """Return a dirty copy of `clean_df` with `error_type` errors injected into a `rate` fraction of cells in its categorical-text columns, or None if no eligible column. The original `clean_df` is the exact ground truth.""" fn = INJECTORS[error_type] cols = _categorical_text_cols(clean_df) if not cols: return None rng = random.Random(seed) dirty = clean_df.copy() touched = 0 for c in cols: col = dirty[c].tolist() for i, v in enumerate(col): s = str(v) if s.strip() and rng.random() < rate: nv = fn(s, rng) if nv != s: col[i] = nv touched += 1 dirty[c] = col return dirty if touched else None