scrubdata / tests /test_engine.py
OpenAI Codex
deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build
16dc556
Raw
History Blame Contribute Delete
25 kB
"""Engine regression tests: profiler detection, every executor op, end-to-end."""
import math
import pandas as pd
import pytest
from scrubdata import apply_plan, mock_plan, profile_dataframe
from scrubdata import detect
from eval.metrics import is_valid
def _col_plan(name, ops):
return {"table_operations": [], "flags": [],
"columns": [{"name": name, "operations": ops}]}
def _apply(series_vals, ops, col="x"):
    """Run ``ops`` against a one-column frame and return that column as a list.

    ``series_vals`` become the values of column ``col``; the second element
    returned by ``apply_plan`` is discarded.
    """
    frame = pd.DataFrame({col: series_vals})
    cleaned, _unused = apply_plan(frame, _col_plan(col, ops))
    return cleaned[col].tolist()
# ---- value-level ops --------------------------------------------------------
def test_strip_whitespace():
    """strip_whitespace trims the ends of a value."""
    cleaned = _apply([" a b ", "c"], [{"op": "strip_whitespace"}])
    assert cleaned == ["a b", "c"]
def test_parse_currency_us_eu_accounting():
    """parse_currency handles US, European, parenthesised and bare amounts."""
    raw_amounts = ["$1,200.50", "1.200,50", "(500)", "950"]
    expected = [1200.50, 1200.50, -500.0, 950.0]
    assert _apply(raw_amounts, [{"op": "parse_currency"}]) == expected
def test_parse_percent():
    """parse_percent turns percent strings into fractions."""
    fractions = _apply(["12.5%", "100%"], [{"op": "parse_percent"}])
    assert fractions[0] == pytest.approx(0.125)
    assert fractions[1] == pytest.approx(1.0)
def test_parse_date_formats():
    """parse_date emits ISO dates for ISO, slash, month-name and serial inputs."""
    raw_dates = ["2023-01-05", "1/6/2023", "5 Jan 2023", "44931"]
    parsed = _apply(raw_dates, [{"op": "parse_date"}])
    expected = [
        "2023-01-05",
        "2023-01-06",
        "2023-01-05",
        "2023-01-05",  # excel serial
    ]
    for position, wanted in enumerate(expected):
        assert parsed[position] == wanted
def test_standardize_boolean():
    """standardize_boolean maps common yes/no spellings to True/False."""
    truthy = ["Yes", "Y", "TRUE", "1"]
    falsy = ["No", "N", "FALSE", "0"]
    result = _apply(truthy + falsy, [{"op": "standardize_boolean"}])
    assert result == [True] * 4 + [False] * 4
def test_standardize_phone_us():
    """standardize_phone renders a dotted 10-digit number as (AAA) BBB-CCCC."""
    formatted = _apply(["555.123.4567"], [{"op": "standardize_phone"}])
    assert formatted == ["(555) 123-4567"]
def test_normalize_email():
    """normalize_email trims and lower-cases an address."""
    normalized = _apply([" Bob@X.COM "], [{"op": "normalize_email"}])
    assert normalized == ["bob@x.com"]
def test_standardize_case():
    """standardize_case with case="title" title-cases the value."""
    operation = {"op": "standardize_case", "case": "title"}
    assert _apply(["hello WORLD"], [operation]) == ["Hello World"]
def test_normalize_disguised_nulls():
    """normalize_disguised_nulls blanks placeholder tokens and keeps real values.

    Every placeholder in the input ("N/A" and "-") is asserted, so a
    regression on either token is caught; the genuine value must survive.
    """
    out = _apply(["N/A", "-", "real"], [{"op": "normalize_disguised_nulls"}])
    # pd.isna is True for None, NaN and pd.NA alike, so no separate None check.
    for position in (0, 1):
        assert pd.isna(out[position])
    assert out[2] == "real"
def test_canonicalize_categories():
    """canonicalize_categories rewrites each raw value through its mapping."""
    mapping = {"usa": "United States", "U.S.A": "United States"}
    operation = {"op": "canonicalize_categories", "mapping": mapping}
    canonical = _apply(["usa", "U.S.A"], [operation])
    assert canonical == ["United States", "United States"]
# ---- table-level + end-to-end ----------------------------------------------
def test_drop_dupes_empty_rows_cols():
    """Table ops drop the empty column, the empty row and the duplicate row."""
    frame = pd.DataFrame({"a": ["1", "1", ""], "junk": ["", "", ""]})
    table_ops = [
        {"op": "drop_empty_columns", "columns": ["junk"]},
        {"op": "drop_empty_rows"},
        {"op": "drop_exact_duplicates"},
    ]
    plan = {"table_operations": table_ops, "columns": [], "flags": []}
    cleaned, _unused = apply_plan(frame, plan)
    assert list(cleaned.columns) == ["a"]
    assert cleaned["a"].tolist() == ["1"]
def test_sample_end_to_end():
    """Profile, plan and clean the bundled sample CSV end to end."""
    raw = pd.read_csv("samples/dirty_contacts.csv", dtype=str, keep_default_na=False)
    profile = profile_dataframe(raw)
    plan = mock_plan(raw, profile)
    cleaned, _log = apply_plan(raw, plan)
    # empty col dropped
    assert "notes2" not in cleaned.columns
    # 16 -> dedup/empty -> 13
    assert len(cleaned) == 13
    # plan conforms to schema
    assert is_valid(plan)
def test_phone_conservatism():
    """A consistently formatted phone column must not get standardize_phone."""
    # consistent format -> heuristic should NOT emit standardize_phone
    frame = pd.DataFrame({"phone": ["5551234567", "5559876543", "5550001111"]})
    plan = mock_plan(frame)
    emitted = {
        operation["op"]
        for column in plan["columns"]
        for operation in column["operations"]
    }
    assert "standardize_phone" not in emitted
def test_detect_types():
    """detect_semantic_type recognises email and boolean samples."""
    email_type = detect.detect_semantic_type("email", ["a@b.com", "c@d.com"])
    assert email_type == "email"
    boolean_type = detect.detect_semantic_type("x", ["Yes", "No", "Y"])
    assert boolean_type == "boolean"
def test_batched_planner():
    """The batched wrapper covers all columns and keeps table-level ops."""
    # agentic column-batching wrapper merges per-batch plans + deterministic table ops
    from scrubdata.model_planner import make_batched_planner
    from scrubdata.planner import mock_plan

    frame = pd.read_csv("samples/dirty_contacts.csv", dtype=str, keep_default_na=False)
    batched = make_batched_planner(mock_plan, batch_size=3)
    plan = batched(frame)
    planned_names = {column["name"] for column in plan["columns"]}
    # all columns covered
    assert {"country", "amount"}.issubset(planned_names)
    table_op_names = [operation["op"] for operation in plan["table_operations"]]
    assert "drop_empty_columns" in table_op_names
    assert is_valid(plan)
def test_reconcile_grounds_and_abstains():
    """The default index resolves known/near values and returns None otherwise."""
    from scrubdata.reconcile import default_index

    index = default_index()
    resolved_cases = [
        ("USA", "country", "United States"),
        ("Germny", "country", "Germany"),  # fuzzy
    ]
    for raw, kind, wanted in resolved_cases:
        assert index.reconcile(raw, kind)[0] == wanted
    # ABSTAIN
    assert index.reconcile("Xyzzylandia", "country") is None
    assert index.reconcile("Califrnia", "state")[0] == "California"
def test_grounded_planner_no_wrong_merge():
    """The heuristic plan fixes a real typo but never maps to 'Huntsville'."""
    # 'guntxrsvillx' (a town not in the reference) must NOT be merged into a similar real
    # city — the structural fix for guntxrsvillx->huntsville.
    locations = [
        "birminghxm", "Birmingham", "guntxrsvillx", "Chicago",
        "Chcago", "Birmingham", "Chicago", "Birmingham",
    ]
    plan = mock_plan(pd.DataFrame({"loc": locations}))
    mapping = {}
    for column in plan["columns"]:
        for operation in column["operations"]:
            if operation["op"] == "canonicalize_categories":
                mapping.update(operation["mapping"])
    assert mapping.get("birminghxm") == "Birmingham"
    assert mapping.get("guntxrsvillx", "") != "Huntsville"
def test_grounded_wrapper_overrides_model_overcorrection():
    """The grounded wrapper corrects an over-correcting model and flags the column."""
    from scrubdata.grounded import make_grounded_planner

    # a model that over-corrects (invents canonicals + wrong-merges)
    def fake_model(df, *_extra):
        bad_mapping = {"birminghxm": "Birmingham City USA",
                       "guntxrsvillx": "Huntsville"}
        city_column = {
            "name": "city",
            "detected_semantic_type": "categorical",
            "issues": [],
            "operations": [{"op": "canonicalize_categories", "mapping": bad_mapping}],
        }
        return {"table_operations": [], "flags": [], "columns": [city_column]}

    cities = [
        "birminghxm", "Birmingham", "guntxrsvillx", "Chicago",
        "Chcago", "Birmingham", "Chicago", "Birmingham",
    ]
    grounded = make_grounded_planner(fake_model)
    plan = grounded(pd.DataFrame({"city": cities}))
    merged = {}
    for column in plan["columns"]:
        for operation in column["operations"]:
            if operation["op"] == "canonicalize_categories":
                merged.update(operation["mapping"])
    # grounded, not "Birmingham City USA"
    assert merged.get("birminghxm") == "Birmingham"
    # wrong-merge blocked
    assert merged.get("guntxrsvillx", "") != "Huntsville"
    # abstained -> review flag
    flagged_columns = [flag["column"] for flag in plan["flags"]]
    assert "city" in flagged_columns
def test_pii_validators():
    """Checksum validators accept valid numbers and reject off-by-one variants."""
    from scrubdata.pii import luhn_ok, _is_credit_card, _is_iban

    valid_card, broken_card = "4532015112830366", "4532015112830367"
    assert luhn_ok(valid_card)
    assert not luhn_ok(broken_card)

    assert _is_credit_card("4532-0151-1283-0366")
    # fails Luhn
    assert not _is_credit_card("1234567890123456")

    assert _is_iban("DE89370400440532013000")
    # fails mod-97
    assert not _is_iban("DE89370400440532013001")
def test_pii_column_detection_and_negatives():
    """detect_column_pii finds cards and SSNs and returns None for plain columns."""
    from scrubdata.pii import detect_column_pii

    card_numbers = [
        "4532015112830366",
        "4716461583322103",
        "5425233430109903",
        "4024007103939509",
    ]
    card_hit = detect_column_pii("card", card_numbers)
    assert card_hit and card_hit["pii_type"] == "credit_card" and card_hit["checksum"]

    ssn_hit = detect_column_pii("ssn", ["123-45-6789", "987-65-4321", "111-22-3333"])
    assert ssn_hit and ssn_hit["pii_type"] == "ssn"

    non_pii_columns = [
        ("city", ["Boston", "Chicago", "Dallas", "Boston"]),
        ("qty", ["1", "2", "3", "4", "5"]),
    ]
    for column_name, values in non_pii_columns:
        assert detect_column_pii(column_name, values) is None
def test_pii_planner_masks_and_never_reformats_identifiers():
    """Card numbers are flagged and masked; emails are flagged but left intact."""
    card_values = [
        "4532015112830366", "4716461583322103", "5425233430109903",
        "4024007103939509", "370434978549371",  # last one fails Luhn (80% rate)
    ]
    email_values = [
        "ana@corp.io", "luis@mail.com", "sofia@test.org", "raul@corp.io",
        "mia@mail.com",
    ]
    city_values = ["Boston", "Chicago", "Boston", "Dallas", "Chicago"]
    frame = pd.DataFrame({"card": card_values, "email": email_values, "city": city_values})

    plan = mock_plan(frame)
    assert is_valid(plan)
    ops_by_column = {
        column["name"]: [operation["op"] for operation in column["operations"]]
        for column in plan["columns"]
    }
    # checksum-confirmed at 80% coverage -> still auto-masked, never parse_number'd
    assert ops_by_column["card"] == ["flag_pii", "mask_pii"]
    email_ops = ops_by_column["email"]
    assert "flag_pii" in email_ops
    assert "mask_pii" not in email_ops
    # a missing "city" entry yields an empty list, which trivially has no pii op
    assert not any("pii" in name for name in ops_by_column.get("city", []))

    cleaned, _unused = apply_plan(frame, plan)
    from scrubdata.pii import detect_column_pii

    # leak-free
    assert detect_column_pii("card", cleaned["card"].tolist()) is None
    first_card = cleaned["card"][0]
    assert first_card.endswith("0366") and first_card.startswith("*")
    # flagged, not destroyed
    assert cleaned["email"][0] == "ana@corp.io"
def test_pii_hash_and_pseudonymize_deterministic():
    """Hashing and pseudonymizing are stable per salt and differ across salts."""
    from scrubdata.pii import hash_value, pseudonymize_value

    card = "4532015112830366"
    digest_s1 = hash_value(card, "s1")
    assert digest_s1 == hash_value(card, "s1")
    assert hash_value(card, "s1") != hash_value(card, "s2")

    pseudonym = pseudonymize_value("ana@corp.io", "s1", "email")
    # join-stable
    assert pseudonym == pseudonymize_value("ana@corp.io", "s1", "email")
    assert pseudonym.startswith("EMAIL_")
    assert "ana" not in pseudonym
def test_active_planner_defaults_to_heuristic(monkeypatch):
    """With SCRUBDATA_MODEL unset, get_planner returns mock_plan itself."""
    # Clear the variable before importing, matching the original statement order.
    monkeypatch.delenv("SCRUBDATA_MODEL", raising=False)
    from scrubdata.active import get_planner
    from scrubdata.planner import mock_plan

    chosen = get_planner()
    assert chosen is mock_plan
def test_union_plans_model_wins_and_heuristic_extends():
    """union_plans keeps primary mappings, adds secondary ones, mutates nothing."""
    from scrubdata.verifier import union_plans

    def canon_op(mapping):
        return {"op": "canonicalize_categories", "mapping": mapping}

    primary = {
        "columns": [{"name": "city", "operations": [canon_op({"bostn": "Boston"})]}],
        "flags": [],
    }
    secondary = {
        "columns": [
            {"name": "city",
             "operations": [canon_op({"bostn": "BOSTON", "chcago": "Chicago"})]},
            {"name": "state", "operations": [canon_op({"texs": "Texas"})]},
        ],
    }

    merged = union_plans(primary, secondary)
    mapping_by_column = {}
    for column in merged["columns"]:
        mapping_by_column[column["name"]] = column["operations"][0]["mapping"]

    # primary wins on conflict
    assert mapping_by_column["city"]["bostn"] == "Boston"
    # secondary extends coverage
    assert mapping_by_column["city"]["chcago"] == "Chicago"
    # secondary-only column added
    assert mapping_by_column["state"] == {"texs": "Texas"}
    # no mutation
    assert primary["columns"][0]["operations"][0]["mapping"] == {"bostn": "Boston"}
def test_active_planner_is_verified_union(monkeypatch):
    """With a model configured but unreachable, the plan is labelled as fallback."""
    # Set the variable before importing, matching the original statement order.
    monkeypatch.setenv("SCRUBDATA_MODEL", "test-model")
    from scrubdata.active import get_planner

    planner = get_planner()
    # the model backend isn't reachable in tests -> every batch falls back to the
    # heuristic; get_planner must return the verified-union wrapper (only it emits this
    # honest label) and tag the plan as fallback rather than claiming the model ran.
    cities = ["Boston", "Boston", "Bostn", "Chicago", "Chicago"]
    plan = planner(pd.DataFrame({"city": cities}))
    expected_label = "deterministic (model unavailable, fell back)"
    assert plan["_generated_by"] == expected_label
    assert is_valid(plan)
def test_convention_gates_regression():
    """Regression checks for the date/percent convention gates and related guards.

    Covers the consistency detectors in ``detect``, the ``_parse_percent`` and
    ``_standardize_phone`` executor helpers, semantic-type guards for zip-like
    and id-like columns, and the planner's end-to-end choice to emit or
    withhold ``parse_date``.
    """
    from scrubdata import detect
    from scrubdata.executor import _parse_percent, _standardize_phone

    # date gate: uniform slash / uniform month-name = consistent; mixed = not
    assert detect.date_formats_consistent(["1/4/2016", "12/23/2015", "3/7/2014"])
    assert detect.date_formats_consistent(["28 July 2016", "4 May 2015"])
    assert not detect.date_formats_consistent(["1/4/2016", "2015-12-23", "3/7/2014",
                                               "2014-01-02"])
    # 90% boundary: 1 stray in 20 stays consistent
    assert detect.date_formats_consistent(["1/4/2016"] * 19 + ["2016-01-04"])

    # percent gate: uniform-% gated; one stray of 20 still gated (no cliff)
    assert detect.percent_formats_consistent(["10%", "20%", "30%"])
    assert detect.percent_formats_consistent(["10%"] * 19 + ["0.6"])
    assert not detect.percent_formats_consistent(["10%", "0.2", "0.3"])

    # parse_percent abstains on bare values instead of /100 corruption
    assert _parse_percent("0.6") == "0.6"
    # Computed float: compare with a tolerance, as test_parse_percent does, so the
    # check does not depend on how the division/multiplication is carried out.
    assert _parse_percent("45%") == pytest.approx(0.45)

    # zip guard + Excel-serial name gate + 7-digit phone
    assert detect.detect_semantic_type("zipcode(long)", ["40231", "40213"] * 10) == "text"
    assert detect.detect_semantic_type("zcta", ["48371", "48380"] * 10) == "text"
    assert detect.detect_semantic_type("record_id", ["40231", "40213"] * 10) == "number"
    assert _standardize_phone("454.1763") == "454-1763"

    # end-to-end: consistent date column -> NO parse_date op + minority flagged
    df = pd.DataFrame({"issue_date": ["1/4/2016"] * 18 + ["1/5/2016", "2016-01-04"]})
    plan = mock_plan(df)
    ops = [o["op"] for c in plan["columns"] for o in c["operations"]]
    assert "parse_date" not in ops
    assert any(f["issue"] == "off_convention_dates" for f in plan["flags"])

    # mixed date column -> op present
    df2 = pd.DataFrame({"start": ["1/4/2016", "2015-12-23", "Apr-2014", "04/16/23"] * 5})
    ops2 = [o["op"] for c in mock_plan(df2)["columns"] for o in c["operations"]]
    assert "parse_date" in ops2
def test_verifier_gates_model_format_ops():
    """verify_plan strips format ops on uniform columns and flags each column."""
    from scrubdata.verifier import verify_plan

    frame = pd.DataFrame({
        "d": ["1/4/2016", "2/5/2016", "3/6/2016"] * 4,
        "p": ["10%", "20%", "30%"] * 4,
    })
    model_plan = {
        "table_operations": [],
        "flags": [],
        "columns": [
            {"name": "d", "operations": [{"op": "parse_date", "rationale": "x"}]},
            {"name": "p", "operations": [{"op": "parse_percent", "rationale": "x"}]},
        ],
    }
    verified = verify_plan(frame, model_plan, tau=0.5)
    remaining = [
        operation["op"]
        for column in verified["columns"]
        for operation in column["operations"]
    ]
    assert "parse_date" not in remaining
    assert "parse_percent" not in remaining
    preserved = [flag for flag in verified["flags"]
                 if flag["issue"] == "convention_preserved"]
    assert len(preserved) == 2
def test_voting_guards_regression():
    """Guard rails around entity-group detection and resolve_by_majority."""
    from scrubdata.planner import detect_entity_groups
    from scrubdata.executor import apply_plan

    def vote_plan(**extra):
        # One resolve_by_majority op keyed on "k" voting over "v", plus any overrides.
        operation = {"op": "resolve_by_majority", "key_column": "k", "columns": ["v"]}
        operation.update(extra)
        return {"table_operations": [operation], "columns": [], "flags": []}

    # numeric votable column: detection excludes it; executor never crashes
    sku_rows = [
        {
            "sku": f"SKU-{f}",
            "src": f"s{s}",
            "label": ("okk" if (f == 2 and s == 1) else "ok") + str(f % 4),
            "qty": f * 10 + s,
        }
        for f in range(25)
        for s in range(5)
    ]
    sku_frame = pd.DataFrame(sku_rows)
    sku_frame["qty"] = sku_frame["qty"].astype("int64")
    groups = detect_entity_groups(sku_frame)
    if groups:
        assert "qty" not in groups[1]
    apply_plan(sku_frame, mock_plan(sku_frame))  # must not raise

    # missing-like keys never form an entity group
    missing_key_frame = pd.DataFrame({
        "k": ["N/A"] * 6 + ["X-1"] * 3,
        "v": ["a", "a", "a", "a", "b", "c", "z", "z", "y"],
    })
    cleaned_missing, _ = apply_plan(missing_key_frame, vote_plan())
    # untouched
    assert list(cleaned_missing["v"][:6]) == ["a", "a", "a", "a", "b", "c"]

    # plan params are clamped: model-emitted min_share=0 cannot force rewrites
    clamped_plan = vote_plan(min_share=0.0, min_group=1)
    split_frame = pd.DataFrame({"k": ["G1"] * 4, "v": ["a", "b", "b", "c"]})  # 50% max
    cleaned_split, _ = apply_plan(split_frame, clamped_plan)
    assert list(cleaned_split["v"]) == ["a", "b", "b", "c"]

    # false-consensus guard: fat minorities (1 of 4 = legitimate updates) decline;
    # thin minorities (1 of 10 = reporting errors) proceed
    shared_plan = vote_plan()  # deliberately reused for both frames below
    fat_frame = pd.DataFrame({
        "k": [f"G{i//4}" for i in range(40)],
        "v": ["m", "m", "m", "x"] * 10,
    })
    _, fat_log = apply_plan(fat_frame, shared_plan)
    fat_entry = next(e for e in fat_log if e["op"] == "resolve_by_majority")
    assert fat_entry["cells_changed"] == 0
    assert "declined" in fat_entry["detail"]

    thin_frame = pd.DataFrame({
        "k": [f"G{i//10}" for i in range(40)],
        "v": (["m"] * 9 + ["x"]) * 4,
    })
    _, thin_log = apply_plan(thin_frame, shared_plan)
    thin_entry = next(e for e in thin_log if e["op"] == "resolve_by_majority")
    # thin dissenters resolved
    assert thin_entry["cells_changed"] == 4

    # date-shaped keys are rejected
    dated_rows = [
        {
            "date": f"2024-01-{d+1:02d}",
            "site": f"site-{r % 3}",
            "crew": f"c{r % 4}",
            "reading": f"v{r}",
        }
        for d in range(25)
        for r in range(5)
    ]
    assert detect_entity_groups(pd.DataFrame(dated_rows)) is None
def test_union_inherits_vote_op_and_preserves_op_order():
    """union_plans carries over the secondary vote op and keeps column op order."""
    from scrubdata.verifier import union_plans

    vote_op = {"op": "resolve_by_majority", "key_column": "k",
               "columns": ["v"], "rationale": "vote"}
    text_column = {
        "name": "t",
        "operations": [
            {"op": "fix_encoding", "rationale": "enc"},
            {"op": "normalize_punctuation", "rationale": "punct"},
        ],
    }
    primary = {"table_operations": [], "columns": [], "flags": []}
    secondary = {"table_operations": [vote_op], "columns": [text_column], "flags": []}

    merged = union_plans(primary, secondary)
    merged_table_ops = [operation["op"] for operation in merged["table_operations"]]
    assert "resolve_by_majority" in merged_table_ops

    text_ops = []
    for column in merged["columns"]:
        if column["name"] == "t":
            text_ops.extend(operation["op"] for operation in column["operations"])
    assert text_ops.index("fix_encoding") < text_ops.index("normalize_punctuation")
def test_fix_encoding_op():
    """fix_encoding repairs UTF-8 text that was mis-decoded as cp1252/latin-1.

    First checks the ``_fix_encoding`` helper directly, then checks that the
    planner emits ``fix_encoding`` for a column containing such text and that
    applying the plan restores the readable value.
    """
    from scrubdata.executor import _fix_encoding

    assert _fix_encoding("café".encode("utf-8").decode("cp1252")) == "café"
    assert _fix_encoding("naïve résumé".encode("utf-8").decode("latin-1")) == "naïve résumé"
    assert _fix_encoding("plain text") == "plain text"  # untouched

    # Build the mis-decoded input at runtime, the same way the assertions above
    # do. A literal here is fragile: if it is (or becomes) correctly encoded
    # text, the column has nothing to repair and the test loses its meaning.
    garbled = "café latte".encode("utf-8").decode("cp1252")
    assert garbled != "café latte"
    df = pd.DataFrame({"title": [garbled, "normal row"] * 6})
    plan = mock_plan(df)
    ops = [o["op"] for c in plan["columns"] for o in c["operations"]]
    assert "fix_encoding" in ops
    cleaned, _ = apply_plan(df, plan)
    assert cleaned["title"][0] == "café latte"
def test_resolve_by_majority_voting():
    """The planner proposes a vote keyed on flight and the executor applies it."""
    reports = []
    for f in range(25):  # 25 flights x 5 source reports
        for s in range(5):
            # two corrupted reports, two groups
            if (f, s) in ((3, 4), (9, 1)):
                dep = "7:59 p.m."
            else:
                dep = f"{(f % 12) + 1}:58 p.m."
            if (f, s) in ((7, 2), (12, 0)):
                arr = "9:40 a.m."
            else:
                arr = f"{(f % 11) + 1}:10 a.m."
            reports.append({
                "flight": f"AA-{1000+f}",
                "src": f"src{s}",
                "dep": dep,
                "arr": arr,
                "gate": f"G{f}",
            })
    frame = pd.DataFrame(reports)
    plan = mock_plan(frame)
    vote_ops = [operation for operation in plan["table_operations"]
                if operation["op"] == "resolve_by_majority"]
    assert vote_ops and vote_ops[0]["key_column"] == "flight"

    cleaned, log = apply_plan(frame, plan)
    flight_1003 = cleaned[cleaned["flight"] == "AA-1003"]
    assert set(flight_1003["dep"]) == {"4:58 p.m."}
    vote_entry = next(e for e in log if e["op"] == "resolve_by_majority")
    # the minority report was resolved
    assert vote_entry["cells_changed"] >= 1

    # no key regime -> no vote op
    keyless = pd.DataFrame({"a": [str(i) for i in range(40)], "b": ["x"] * 40})
    keyless_table_ops = [operation["op"]
                         for operation in mock_plan(keyless)["table_operations"]]
    assert "resolve_by_majority" not in keyless_table_ops
def test_suspects_visibility_high_cardinality():
    """A near-duplicate in a high-cardinality text column is surfaced and repaired."""
    from scrubdata.profiler import profile_column

    # high-card "text" column: 57 one-off names, one name repeated twice, and
    # one near-dup of the repeated name (60 values in total)
    values = [f"unique business {i}" for i in range(57)]
    values = values + ["acme holdings", "acme holdings", "acme holdngs"]

    profile = profile_column(pd.Series(values, name="business"))
    assert profile["detected_semantic_type"] == "text"
    candidates_by_raw = {}
    for suspect in profile["suspect_values"]:
        candidates_by_raw[suspect["raw"]] = suspect["candidates"]
    assert "acme holdngs" in candidates_by_raw
    assert "acme holdings" in candidates_by_raw["acme holdngs"]
    # bounded
    assert len(profile["suspect_values"]) <= 25

    # heuristic now repairs it (verifier-gated), where before it emitted nothing
    frame = pd.DataFrame({"business": values})
    plan = mock_plan(frame)
    repairs = {}
    for column in plan["columns"]:
        for operation in column["operations"]:
            if operation["op"] == "canonicalize_categories":
                repairs.update(operation["mapping"])
    assert repairs.get("acme holdngs") == "acme holdings"
    cleaned, _unused = apply_plan(frame, plan)
    assert "acme holdngs" not in set(cleaned["business"])
    # garbage suspect-free value stays put; plan still schema-valid
    assert is_valid(plan)
def test_suspects_garbage_flagged_not_mapped():
    """A garbage value must not appear as a key in any canonicalize mapping.

    Builds a column of 40 distinct "item N" values plus a repeated garbage
    value, plans it with ``mock_plan`` and checks that the garbage value is
    not mapped anywhere and that the plan is still schema-valid.
    """
    # NOTE(review): the test name mentions "flagged", but no flag is asserted
    # here; add one once the expected flag shape for this case is confirmed.
    col = [f"item {i}" for i in range(40)] + ["it€m ’junk", "it€m ’junk"]
    df = pd.DataFrame({"thing": col})
    plan = mock_plan(df)
    maps = {r for c in plan["columns"] for o in c["operations"]
            if o["op"] == "canonicalize_categories" for r in o["mapping"]}
    assert "it€m ’junk" not in maps  # no invented target
    assert is_valid(plan)
def test_normalize_punctuation_op():
    """Typographic quotes and dashes get normalized; a clean column gets no op."""
    dirty_names = ["palm’s thai", "joe‘s “grill”", "a–b — c", "plain's ok"]
    frame = pd.DataFrame({"name": dirty_names})
    plan = mock_plan(frame)
    emitted = [operation["op"]
               for column in plan["columns"]
               for operation in column["operations"]]
    assert "normalize_punctuation" in emitted

    cleaned, _unused = apply_plan(frame, plan)
    expected_prefix = ["palm's thai", 'joe\'s "grill"', "a-b - c"]
    for position, wanted in enumerate(expected_prefix):
        assert cleaned["name"][position] == wanted

    # a clean column must NOT get the op
    clean_plan = mock_plan(pd.DataFrame({"name": ["plain's ok", "also fine"]}))
    clean_emitted = [operation["op"]
                     for column in clean_plan["columns"]
                     for operation in column["operations"]]
    assert "normalize_punctuation" not in clean_emitted
def test_pair_profile_candidates_and_constraint():
    """candidate_pairs proposes near matches; constrain_plan drops everything else."""
    from scrubdata.pair_profile import candidate_pairs, constrain_plan

    values = ["Boston"] * 8 + ["Chicago"] * 6 + ["Bostn", "Chcago", "Qwortelby"]
    pairs = candidate_pairs(values)
    canons_by_raw = {}
    for pair in pairs:
        canons_by_raw[pair["raw"]] = [cand["canon"] for cand in pair["candidates"]]
    assert "Boston" in canons_by_raw.get("Bostn", [])
    assert "Chicago" in canons_by_raw.get("Chcago", [])
    # garbage gets no candidates
    assert "Qwortelby" not in canons_by_raw
    # frequent values are not suspicious
    assert "Boston" not in canons_by_raw

    proposed = {"Bostn": "Boston", "Chcago": "Dallas", "Qwortelby": "Boston"}
    plan = {
        "columns": [{
            "name": "city",
            "operations": [{"op": "canonicalize_categories", "rationale": "typos",
                            "mapping": proposed}],
        }],
        "flags": [],
    }
    allowed = {
        "city": [
            {"raw": pair["raw"],
             "candidates": [cand["canon"] for cand in pair["candidates"]]}
            for pair in pairs
        ]
    }
    constrained = constrain_plan(plan, allowed)
    kept = constrained["columns"][0]["operations"][0]["mapping"]
    # off-candidate + garbage dropped
    assert kept == {"Bostn": "Boston"}
    flags = constrained["flags"]
    assert flags and flags[0]["issue"] == "outside_candidate_pairs"
def test_jellyfish_prompt_construction():
    """Check the error-detection and imputation prompt text and reply parsers."""
    from eval.baselines_learned import di_prompt, ed_prompt, parse_di, parse_ed

    record = {"city": "Bostn", "state": "MA"}

    detection_prompt = ed_prompt(record, "city")
    assert "Record [city: Bostn, state: MA]" in detection_prompt
    assert "Attribute for Verification: [city: Bostn]" in detection_prompt
    assert detection_prompt.endswith("### Response:\n\n")

    imputation_prompt = di_prompt(record, "city", "geography")
    # flagged attribute removed
    assert "Record: [state: MA]" in imputation_prompt
    # model infers, never copies
    assert "city" in imputation_prompt
    assert "Bostn" not in imputation_prompt

    assert parse_ed("Yes, there is an error")
    assert not parse_ed("No.")

    assert parse_di(" Boston ", "Bostn") == "Boston"
    # abstain on empty
    assert parse_di("", "Bostn") == "Bostn"
    # abstain on rambling
    assert parse_di("The value is\nBoston", "Bostn") == "Bostn"
def test_value_counts_profile():
    """The column profile exposes ``value_counts`` as (value, count) pairs.

    The key-presence check runs before the key is indexed, so a missing
    ``value_counts`` fails as an assertion rather than as a ``KeyError``.
    """
    df = pd.DataFrame({"country": ["USA", "USA", "usa", "Canada"]})
    prof = profile_dataframe(df)
    column_profile = prof["columns"][0]
    assert "value_counts" in column_profile
    vc = {value: count for value, count in column_profile["value_counts"]}
    # Counting is case-sensitive here: "usa" is not folded into "USA".
    assert vc["USA"] == 2
def test_cli(tmp_path):
    """The CLI cleans the sample CSV, writes both outputs and returns 0."""
    from scrubdata.cli import main

    cleaned_path = tmp_path / "clean.csv"
    plan_path = tmp_path / "plan.json"
    argv = [
        "samples/dirty_contacts.csv",
        "-o", str(cleaned_path),
        "--plan", str(plan_path),
        "--quiet",
    ]
    exit_code = main(argv)
    assert exit_code == 0
    assert cleaned_path.exists()
    assert plan_path.exists()

    cleaned = pd.read_csv(cleaned_path)
    assert "notes2" not in cleaned.columns
    assert len(cleaned) == 13