scrubdata / eval /inject.py
OpenAI Codex
deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build
16dc556
Raw
History Blame Contribute Delete
3.73 kB
"""Seeded, self-verifying error injection — turns any CLEAN table into dirty/clean
validation. This is the de-biasing core of the north-star: our 20+ harvested clean
domains become per-cell-ground-truth validation across error types, far beyond any one
published benchmark.
Self-contained (no nlpaug/BART deps): we inject a KNOWN corruption into a clean cell, so
the (dirty -> clean) ground truth is exact and the run is reproducible (fixed seed).
Injects RECOVERABLE error types (the cleaner can restore the clean value): typo, ocr,
case, whitespace — i.e. the canonicalization + format axes. Targets CATEGORICAL text
columns (recurring values), where canonicalization is the task.
"""
from __future__ import annotations
import random
import string
# Letter -> digit look-alikes used to simulate OCR misreads (one character is
# swapped for the digit it visually resembles).
_OCR = {
    "O": "0",
    "o": "0",
    "l": "1",
    "I": "1",
    "S": "5",
    "s": "5",
    "B": "8",
    "Z": "2",
    "z": "2",
    "g": "9",
    "G": "6",
    "b": "6",
}
def _typo(s: str, rng: random.Random) -> str:
    """Return `s` with one keyboard-style typo at a random interior position.

    Strings shorter than 4 characters are returned unchanged. A position is
    drawn from the interior (never the first or last index); if the character
    there is not alphabetic the string is returned unchanged. Otherwise one of
    three edits is applied:

    * substitute (55%): replace the letter with a different ASCII letter of
      the same case,
    * delete (23%): drop the letter,
    * transpose (22%): swap the letter with its right-hand neighbour (a no-op
      when both characters are equal).

    All randomness comes from `rng`, so the result is reproducible for a
    given seed.
    """
    if len(s) < 4:
        return s
    i = rng.randrange(1, len(s) - 1)
    if not s[i].isalpha():
        return s
    m = rng.random()
    if m < 0.55:  # substitute (the classic 'birminghxm')
        pool = string.ascii_uppercase if s[i].isupper() else string.ascii_lowercase
        # Exclude the original character itself. Comparing against
        # s[i].lower() never matched anything in the uppercase pool, so an
        # uppercase letter could be "replaced" by itself.
        return s[:i] + rng.choice([c for c in pool if c != s[i]]) + s[i + 1:]
    if m < 0.78:  # delete
        return s[:i] + s[i + 1:]
    # transpose: i <= len(s) - 2, so s[i + 1] always exists
    return s[:i] + s[i + 1] + s[i] + s[i + 2:]
def _ocr(s: str, rng: random.Random) -> str:
    """Return `s` with one character replaced by its OCR look-alike digit.

    One position is picked at random among the characters that have an entry
    in `_OCR`. When `s` contains no such character, a `_typo` edit is applied
    instead.
    """
    confusable = [pos for pos, ch in enumerate(s) if ch in _OCR]
    if confusable:
        pos = rng.choice(confusable)
        return s[:pos] + _OCR[s[pos]] + s[pos + 1:]
    # Nothing OCR-confusable in this value: fall back to a generic typo.
    return _typo(s, rng)
def _case(s: str, rng: random.Random) -> str:
    """Return `s` re-cased as UPPER, lower or Title, chosen at random.

    The result may equal `s` when it is already in the chosen casing.
    """
    variants = (s.upper(), s.lower(), s.title())
    return rng.choice(variants)
def _ws(s: str, rng: random.Random) -> str:
    """Return `s` with a whitespace error injected.

    One of three variants is chosen at random:

    * 1-2 leading spaces,
    * 1-2 trailing spaces,
    * the first internal space doubled (or, when `s` has no space, a single
      leading space).

    Every variant differs from `s`, and collapsing or stripping whitespace
    restores the original value.
    """
    # Draw both pad widths before choosing so RNG consumption stays fixed at
    # randint, randint, choice regardless of which variant wins.
    leading = " " * rng.randint(1, 2) + s
    trailing = s + " " * rng.randint(1, 2)
    # Replacing a space with a single space was a no-op; double it instead.
    inner = s.replace(" ", "  ", 1) if " " in s else " " + s
    return rng.choice([leading, trailing, inner])
# Registry of error-type name -> corruption function; each function takes
# (value, rng) and returns the (possibly unchanged) corrupted string.
INJECTORS = {
    "typo": _typo,
    "ocr": _ocr,
    "case": _case,
    "whitespace": _ws,
}
def _categorical_text_cols(df, max_cols: int = 12) -> list[str]:
    """Pick the columns of `df` that look like categorical text.

    Each column's cells are stringified and stripped; blank cells are
    ignored. A column qualifies when all of the following hold:

    * it has at least 20 non-blank cells,
    * at least 70% of them contain a letter,
    * at least 70% of them do not parse as a float once commas are removed,
    * at most half of them are distinct (i.e. the values recur).

    Columns are scanned in `df.columns` order and the scan stops as soon as
    `max_cols` columns have been collected.
    """

    def _parses_as_number(text):
        # Commas are dropped so thousands-separated figures count as numeric.
        try:
            float(text.replace(",", ""))
        except ValueError:
            return False
        return True

    selected = []
    for name in df.columns:
        cells = []
        for raw in df[name].tolist():
            text = str(raw).strip()
            if text:
                cells.append(text)
        total = len(cells)
        if total < 20:
            continue

        with_letters = sum(1 for t in cells if any(ch.isalpha() for ch in t))
        non_numeric = sum(1 for t in cells if not _parses_as_number(t))
        if with_letters / total < 0.7 or non_numeric / total < 0.7:
            continue

        # Mostly-unique columns (ids, free text) are not categorical.
        if len(set(cells)) / total > 0.5:
            continue

        selected.append(name)
        if len(selected) >= max_cols:
            break
    return selected
def inject(clean_df, error_type: str, seed: int, rate: float = 0.07):
    """Build a dirty copy of `clean_df` with seeded `error_type` corruptions.

    Only the columns selected by `_categorical_text_cols` are touched. Each
    non-blank cell in those columns is corrupted with probability `rate`
    using the injector registered under `error_type`; a corruption that
    leaves the value unchanged is not counted. `clean_df` itself is not
    modified and serves as the exact ground truth for the returned frame.

    Returns the dirty copy, or None when there is no eligible column or when
    no cell ended up changed. Raises KeyError for an unknown `error_type`.
    """
    corrupt = INJECTORS[error_type]
    targets = _categorical_text_cols(clean_df)
    if not targets:
        return None

    rng = random.Random(seed)
    dirty = clean_df.copy()
    changed = 0
    for name in targets:
        values = dirty[name].tolist()
        for idx, cell in enumerate(values):
            text = str(cell)
            if not text.strip():
                continue
            # One RNG draw per non-blank cell decides whether it is corrupted.
            if not rng.random() < rate:
                continue
            corrupted = corrupt(text, rng)
            if corrupted == text:
                continue
            values[idx] = corrupted
            changed += 1
        dirty[name] = values

    if not changed:
        return None
    return dirty