Spaces:

build-small-hackathon
/

scrubdata

Running

File size: 25,036 Bytes

16dc556

"""Engine regression tests: profiler detection, every executor op, end-to-end."""

import math

import pandas as pd
import pytest

from scrubdata import apply_plan, mock_plan, profile_dataframe
from scrubdata import detect
from eval.metrics import is_valid


def _col_plan(name, ops):
    return {"table_operations": [], "flags": [],
            "columns": [{"name": name, "operations": ops}]}


def _apply(series_vals, ops, col="x"):
    """Run ``ops`` on a one-column frame and return the resulting column as a list.

    ``series_vals`` become the values of column ``col``; the second value
    returned by ``apply_plan`` is discarded.
    """
    frame = pd.DataFrame({col: series_vals})
    plan = _col_plan(col, ops)
    cleaned, _log = apply_plan(frame, plan)
    return cleaned[col].tolist()


# ---- value-level ops --------------------------------------------------------

def test_strip_whitespace():
    """strip_whitespace trims both ends and collapses the inner double space."""
    cleaned = _apply(["  a  b ", "c"], [{"op": "strip_whitespace"}])
    assert cleaned == ["a b", "c"]


def test_parse_currency_us_eu_accounting():
    """parse_currency reads US, EU-separator, parenthesised and bare amounts."""
    raw_amounts = ["$1,200.50", "1.200,50", "(500)", "950"]
    expected = [1200.50, 1200.50, -500.0, 950.0]
    assert _apply(raw_amounts, [{"op": "parse_currency"}]) == expected


def test_parse_percent():
    """parse_percent converts "N%" strings into fractions."""
    fractions = _apply(["12.5%", "100%"], [{"op": "parse_percent"}])
    assert fractions[0] == pytest.approx(0.125)
    assert fractions[1] == pytest.approx(1.0)


def test_parse_date_formats():
    """parse_date emits ISO dates for ISO, slash, month-name and Excel-serial input."""
    raw_dates = ["2023-01-05", "1/6/2023", "5 Jan 2023", "44931"]
    parsed = _apply(raw_dates, [{"op": "parse_date"}])
    # The final input ("44931") is an Excel serial number.
    expected = ["2023-01-05", "2023-01-06", "2023-01-05", "2023-01-05"]
    for position, iso_date in enumerate(expected):
        assert parsed[position] == iso_date


def test_standardize_boolean():
    """standardize_boolean maps common yes/no spellings onto True/False."""
    truthy = ["Yes", "Y", "TRUE", "1"]
    falsy = ["No", "N", "FALSE", "0"]
    converted = _apply(truthy + falsy, [{"op": "standardize_boolean"}])
    assert converted == [True] * 4 + [False] * 4


def test_standardize_phone_us():
    """A dotted 10-digit number is rewritten as "(AAA) BBB-CCCC"."""
    formatted = _apply(["555.123.4567"], [{"op": "standardize_phone"}])
    assert formatted == ["(555) 123-4567"]


def test_normalize_email():
    """normalize_email strips surrounding spaces and lowercases the address."""
    normalized = _apply([" Bob@X.COM "], [{"op": "normalize_email"}])
    assert normalized == ["bob@x.com"]


def test_standardize_case():
    """standardize_case with case="title" title-cases every word."""
    title_op = {"op": "standardize_case", "case": "title"}
    assert _apply(["hello WORLD"], [title_op]) == ["Hello World"]


def test_normalize_disguised_nulls():
    """normalize_disguised_nulls blanks "N/A" and leaves a real value alone."""
    cleaned = _apply(["N/A", "-", "real"], [{"op": "normalize_disguised_nulls"}])
    first = cleaned[0]
    # NOTE(review): the "-" input (index 1) is fed in but never asserted on.
    assert first is None or pd.isna(first)
    assert cleaned[2] == "real"


def test_canonicalize_categories():
    """canonicalize_categories rewrites each raw value via the supplied mapping."""
    mapping = {"usa": "United States", "U.S.A": "United States"}
    canonicalize = {"op": "canonicalize_categories", "mapping": mapping}
    result = _apply(["usa", "U.S.A"], [canonicalize])
    assert result == ["United States", "United States"]


# ---- table-level + end-to-end ----------------------------------------------

def test_drop_dupes_empty_rows_cols():
    """Table ops drop the empty column, the empty row and the exact duplicate."""
    frame = pd.DataFrame({"a": ["1", "1", ""], "junk": ["", "", ""]})
    table_ops = [
        {"op": "drop_empty_columns", "columns": ["junk"]},
        {"op": "drop_empty_rows"},
        {"op": "drop_exact_duplicates"},
    ]
    plan = {"table_operations": table_ops, "columns": [], "flags": []}
    cleaned, _log = apply_plan(frame, plan)
    remaining_columns = list(cleaned.columns)
    assert remaining_columns == ["a"]
    assert cleaned["a"].tolist() == ["1"]


def test_sample_end_to_end():
    """Profile, plan and clean the bundled sample CSV, then check the outcome."""
    raw = pd.read_csv("samples/dirty_contacts.csv", dtype=str, keep_default_na=False)
    profile = profile_dataframe(raw)
    plan = mock_plan(raw, profile)
    cleaned, _log = apply_plan(raw, plan)
    # The empty column is dropped.
    assert "notes2" not in cleaned.columns
    # 16 rows shrink to 13 once duplicates and empty rows are removed.
    assert len(cleaned) == 13
    # The generated plan conforms to the schema.
    assert is_valid(plan)


def test_phone_conservatism():
    """A consistently formatted phone column must not get standardize_phone."""
    frame = pd.DataFrame({"phone": ["5551234567", "5559876543", "5550001111"]})
    plan = mock_plan(frame)
    emitted = []
    for column in plan["columns"]:
        for operation in column["operations"]:
            emitted.append(operation["op"])
    assert "standardize_phone" not in emitted


def test_detect_types():
    """detect_semantic_type recognises an email column and a yes/no column."""
    cases = [
        ("email", ["a@b.com", "c@d.com"], "email"),
        ("x", ["Yes", "No", "Y"], "boolean"),
    ]
    for column_name, values, expected in cases:
        assert detect.detect_semantic_type(column_name, values) == expected


def test_batched_planner():
    """The column-batching wrapper merges per-batch plans plus table operations."""
    from scrubdata.model_planner import make_batched_planner
    from scrubdata.planner import mock_plan
    frame = pd.read_csv("samples/dirty_contacts.csv", dtype=str, keep_default_na=False)
    batched = make_batched_planner(mock_plan, batch_size=3)
    plan = batched(frame)
    covered = {column["name"] for column in plan["columns"]}
    # Spot-check two columns of the merged plan.
    assert {"country", "amount"} <= covered
    has_drop_empty = any(operation["op"] == "drop_empty_columns"
                         for operation in plan["table_operations"])
    assert has_drop_empty
    assert is_valid(plan)


def test_reconcile_grounds_and_abstains():
    """The reference index resolves known and misspelt names, else returns None."""
    from scrubdata.reconcile import default_index
    index = default_index()
    usa_hit = index.reconcile("USA", "country")
    assert usa_hit[0] == "United States"
    # Fuzzy match on a misspelling.
    germany_hit = index.reconcile("Germny", "country")
    assert germany_hit[0] == "Germany"
    # Unknown value: the index abstains.
    assert index.reconcile("Xyzzylandia", "country") is None
    california_hit = index.reconcile("Califrnia", "state")
    assert california_hit[0] == "California"


def test_grounded_planner_no_wrong_merge():
    """A town absent from the reference must not be merged into a look-alike city.

    'guntxrsvillx' must never be mapped to "Huntsville", while the typo
    'birminghxm' is still repaired to "Birmingham".
    """
    locations = ["birminghxm", "Birmingham", "guntxrsvillx", "Chicago",
                 "Chcago", "Birmingham", "Chicago", "Birmingham"]
    plan = mock_plan(pd.DataFrame({"loc": locations}))
    mapping = {}
    for column in plan["columns"]:
        for operation in column["operations"]:
            if operation["op"] != "canonicalize_categories":
                continue
            for raw, canonical in operation["mapping"].items():
                mapping[raw] = canonical
    assert mapping.get("birminghxm") == "Birmingham"
    assert mapping.get("guntxrsvillx", "") != "Huntsville"


def test_grounded_wrapper_overrides_model_overcorrection():
    """The grounded wrapper corrects a model plan that invents and wrong-merges."""
    from scrubdata.grounded import make_grounded_planner

    # Stand-in model that over-corrects: an invented canonical plus a wrong merge.
    def fake_model(df, *_extra):
        bad_mapping = {"birminghxm": "Birmingham City USA",
                       "guntxrsvillx": "Huntsville"}
        city_column = {
            "name": "city", "detected_semantic_type": "categorical", "issues": [],
            "operations": [{"op": "canonicalize_categories", "mapping": bad_mapping}],
        }
        return {"table_operations": [], "flags": [], "columns": [city_column]}

    cities = ["birminghxm", "Birmingham", "guntxrsvillx", "Chicago",
              "Chcago", "Birmingham", "Chicago", "Birmingham"]
    grounded = make_grounded_planner(fake_model)
    plan = grounded(pd.DataFrame({"city": cities}))
    merged = {}
    for column in plan["columns"]:
        for operation in column["operations"]:
            if operation["op"] != "canonicalize_categories":
                continue
            for raw, canonical in operation["mapping"].items():
                merged[raw] = canonical
    # Grounded target, not the invented "Birmingham City USA".
    assert merged.get("birminghxm") == "Birmingham"
    # The wrong merge is blocked.
    assert merged.get("guntxrsvillx", "") != "Huntsville"
    # Abstaining leaves a review flag on the column.
    assert any(flag["column"] == "city" for flag in plan["flags"])


def test_pii_validators():
    """Checksum validators accept valid identifiers and reject altered ones."""
    from scrubdata.pii import luhn_ok, _is_credit_card, _is_iban
    good_card, altered_card = "4532015112830366", "4532015112830367"
    assert luhn_ok(good_card)
    assert not luhn_ok(altered_card)
    assert _is_credit_card("4532-0151-1283-0366")
    # Right length, but fails the Luhn check.
    assert not _is_credit_card("1234567890123456")
    good_iban, altered_iban = "DE89370400440532013000", "DE89370400440532013001"
    assert _is_iban(good_iban)
    # Fails the mod-97 check.
    assert not _is_iban(altered_iban)


def test_pii_column_detection_and_negatives():
    """Column-level PII detection flags cards and SSNs, not cities or quantities."""
    from scrubdata.pii import detect_column_pii
    card_numbers = ["4532015112830366", "4716461583322103",
                    "5425233430109903", "4024007103939509"]
    card_hit = detect_column_pii("card", card_numbers)
    assert card_hit
    assert card_hit["pii_type"] == "credit_card"
    assert card_hit["checksum"]
    ssn_hit = detect_column_pii("ssn", ["123-45-6789", "987-65-4321", "111-22-3333"])
    assert ssn_hit
    assert ssn_hit["pii_type"] == "ssn"
    # Negative cases: ordinary columns yield no finding.
    city_hit = detect_column_pii("city", ["Boston", "Chicago", "Dallas", "Boston"])
    assert city_hit is None
    qty_hit = detect_column_pii("qty", ["1", "2", "3", "4", "5"])
    assert qty_hit is None


def test_pii_planner_masks_and_never_reformats_identifiers():
    """Card numbers are flagged and masked; emails are flagged but kept intact."""
    from scrubdata.pii import detect_column_pii
    # The fifth card number fails Luhn, so 80% of the column is checksum-valid.
    card_values = ["4532015112830366", "4716461583322103", "5425233430109903",
                   "4024007103939509", "370434978549371"]
    email_values = ["ana@corp.io", "luis@mail.com", "sofia@test.org", "raul@corp.io",
                    "mia@mail.com"]
    city_values = ["Boston", "Chicago", "Boston", "Dallas", "Chicago"]
    frame = pd.DataFrame({"card": card_values, "email": email_values,
                          "city": city_values})
    plan = mock_plan(frame)
    assert is_valid(plan)
    ops_by_column = {}
    for column in plan["columns"]:
        ops_by_column[column["name"]] = [operation["op"]
                                         for operation in column["operations"]]
    # Checksum-confirmed at 80% coverage: still auto-masked, never parse_number'd.
    assert ops_by_column["card"] == ["flag_pii", "mask_pii"]
    assert "flag_pii" in ops_by_column["email"]
    assert "mask_pii" not in ops_by_column["email"]
    city_ops = ops_by_column.get("city", [])
    assert not any("pii" in op_name for op_name in city_ops)
    cleaned, _log = apply_plan(frame, plan)
    # Leak-free: the masked column no longer registers as PII.
    assert detect_column_pii("card", cleaned["card"].tolist()) is None
    first_masked = cleaned["card"][0]
    assert first_masked.endswith("0366")
    assert first_masked.startswith("*")
    # Flagged, not destroyed.
    assert cleaned["email"][0] == "ana@corp.io"


def test_pii_hash_and_pseudonymize_deterministic():
    """Hashing and pseudonymisation are repeatable for the same value and salt."""
    from scrubdata.pii import hash_value, pseudonymize_value
    card = "4532015112830366"
    assert hash_value(card, "s1") == hash_value(card, "s1")
    assert hash_value(card, "s1") != hash_value(card, "s2")
    pseudonym = pseudonymize_value("ana@corp.io", "s1", "email")
    # Join-stable: a second call yields the same token.
    assert pseudonym == pseudonymize_value("ana@corp.io", "s1", "email")
    assert pseudonym.startswith("EMAIL_")
    assert "ana" not in pseudonym


def test_active_planner_defaults_to_heuristic(monkeypatch):
    """With SCRUBDATA_MODEL unset, get_planner returns mock_plan itself."""
    monkeypatch.delenv("SCRUBDATA_MODEL", raising=False)
    from scrubdata.active import get_planner
    from scrubdata.planner import mock_plan
    active = get_planner()
    assert active is mock_plan


def test_union_plans_model_wins_and_heuristic_extends():
    """union_plans keeps primary mappings on conflict and adds secondary ones."""
    from scrubdata.verifier import union_plans

    def _canon(mapping):
        # One canonicalize_categories op wrapping the given mapping.
        return {"op": "canonicalize_categories", "mapping": mapping}

    primary = {
        "columns": [{"name": "city", "operations": [_canon({"bostn": "Boston"})]}],
        "flags": [],
    }
    secondary = {"columns": [
        {"name": "city",
         "operations": [_canon({"bostn": "BOSTON", "chcago": "Chicago"})]},
        {"name": "state", "operations": [_canon({"texs": "Texas"})]},
    ]}
    merged = union_plans(primary, secondary)
    mapping_by_column = {}
    for column in merged["columns"]:
        mapping_by_column[column["name"]] = column["operations"][0]["mapping"]
    # Primary wins on conflict.
    assert mapping_by_column["city"]["bostn"] == "Boston"
    # Secondary extends coverage.
    assert mapping_by_column["city"]["chcago"] == "Chicago"
    # A secondary-only column is added.
    assert mapping_by_column["state"] == {"texs": "Texas"}
    # The primary input is not mutated.
    primary_mapping = primary["columns"][0]["operations"][0]["mapping"]
    assert primary_mapping == {"bostn": "Boston"}


def test_active_planner_is_verified_union(monkeypatch):
    """With a model configured, the produced plan carries the fallback label."""
    monkeypatch.setenv("SCRUBDATA_MODEL", "test-model")
    from scrubdata.active import get_planner
    active = get_planner()
    # The model backend isn't reachable in tests, so every batch falls back to the
    # heuristic. get_planner must return the verified-union wrapper (only it emits
    # this honest label) and tag the plan as a fallback rather than claiming the
    # model ran.
    cities = ["Boston", "Boston", "Bostn", "Chicago", "Chicago"]
    plan = active(pd.DataFrame({"city": cities}))
    expected_label = "deterministic (model unavailable, fell back)"
    assert plan["_generated_by"] == expected_label
    assert is_valid(plan)


def test_convention_gates_regression():
    """Regression checks for the format-convention gates.

    Covers the date and percent consistency detectors, ``_parse_percent``
    abstaining on bare values, semantic-type detection for zip-like and
    id-like columns, 7-digit phone formatting, and the end-to-end effect on
    ``mock_plan`` for a consistent versus a mixed date column.
    """
    from scrubdata import detect
    from scrubdata.executor import _parse_percent, _standardize_phone
    # date gate: uniform slash / uniform month-name = consistent; mixed = not
    assert detect.date_formats_consistent(["1/4/2016", "12/23/2015", "3/7/2014"])
    assert detect.date_formats_consistent(["28 July 2016", "4 May 2015"])
    assert not detect.date_formats_consistent(["1/4/2016", "2015-12-23", "3/7/2014",
                                               "2014-01-02"])
    # 90% boundary: 1 stray in 20 stays consistent
    assert detect.date_formats_consistent(["1/4/2016"] * 19 + ["2016-01-04"])
    # percent gate: uniform-% gated; one stray of 20 still gated (no cliff)
    assert detect.percent_formats_consistent(["10%", "20%", "30%"])
    assert detect.percent_formats_consistent(["10%"] * 19 + ["0.6"])
    assert not detect.percent_formats_consistent(["10%", "0.2", "0.3"])
    # parse_percent abstains on bare values instead of /100 corruption
    assert _parse_percent("0.6") == "0.6"
    # Compare the float result with a tolerance, as test_parse_percent does,
    # so the check does not depend on how the division is carried out.
    assert _parse_percent("45%") == pytest.approx(0.45)
    # zip guard + Excel-serial name gate + 7-digit phone
    assert detect.detect_semantic_type("zipcode(long)", ["40231", "40213"] * 10) == "text"
    assert detect.detect_semantic_type("zcta", ["48371", "48380"] * 10) == "text"
    assert detect.detect_semantic_type("record_id", ["40231", "40213"] * 10) == "number"
    assert _standardize_phone("454.1763") == "454-1763"
    # end-to-end: consistent date column -> NO parse_date op + minority flagged
    df = pd.DataFrame({"issue_date": ["1/4/2016"] * 18 + ["1/5/2016", "2016-01-04"]})
    plan = mock_plan(df)
    ops = [o["op"] for c in plan["columns"] for o in c["operations"]]
    assert "parse_date" not in ops
    assert any(f["issue"] == "off_convention_dates" for f in plan["flags"])
    # mixed date column -> op present
    df2 = pd.DataFrame({"start": ["1/4/2016", "2015-12-23", "Apr-2014", "04/16/23"] * 5})
    ops2 = [o["op"] for c in mock_plan(df2)["columns"] for o in c["operations"]]
    assert "parse_date" in ops2


def test_verifier_gates_model_format_ops():
    """verify_plan strips model format ops on consistent columns and flags them."""
    from scrubdata.verifier import verify_plan
    frame = pd.DataFrame({"d": ["1/4/2016", "2/5/2016", "3/6/2016"] * 4,
                          "p": ["10%", "20%", "30%"] * 4})
    date_column = {"name": "d",
                   "operations": [{"op": "parse_date", "rationale": "x"}]}
    percent_column = {"name": "p",
                      "operations": [{"op": "parse_percent", "rationale": "x"}]}
    model_plan = {"table_operations": [], "flags": [],
                  "columns": [date_column, percent_column]}
    verified = verify_plan(frame, model_plan, tau=0.5)
    surviving = [operation["op"]
                 for column in verified["columns"]
                 for operation in column["operations"]]
    assert "parse_date" not in surviving
    assert "parse_percent" not in surviving
    preserved = [flag for flag in verified["flags"]
                 if flag["issue"] == "convention_preserved"]
    assert len(preserved) == 2


def test_voting_guards_regression():
    """Regression checks for the guards around majority voting.

    Scenarios, in order: a numeric column is excluded from detected entity
    groups and planning/executing does not raise; a missing-like key ("N/A")
    leaves its rows untouched; plan-supplied ``min_share`` / ``min_group``
    cannot force rewrites; groups with a 1-in-4 minority are declined while
    1-in-10 minorities are resolved; date-shaped keys yield no entity group.
    """
    from scrubdata.planner import detect_entity_groups
    from scrubdata.executor import apply_plan
    # numeric votable column: detection excludes it; executor never crashes
    rows = []
    # 25 SKUs x 5 sources; only (f == 2, s == 1) gets the deviating "okk" label.
    for f in range(25):
        for s in range(5):
            rows.append({"sku": f"SKU-{f}", "src": f"s{s}",
                         "label": ("ok" if not (f == 2 and s == 1) else "okk")
                                  + str(f % 4),
                         "qty": f * 10 + s})
    df = pd.DataFrame(rows)
    df["qty"] = df["qty"].astype("int64")
    eg = detect_entity_groups(df)
    # The membership check only runs when a group was detected at all.
    if eg:
        assert "qty" not in eg[1]
    apply_plan(df, mock_plan(df))                  # must not raise
    # missing-like keys never form an entity group
    plan = {"table_operations": [{"op": "resolve_by_majority", "key_column": "k",
                                  "columns": ["v"]}], "columns": [], "flags": []}
    df2 = pd.DataFrame({"k": ["N/A"] * 6 + ["X-1"] * 3,
                        "v": ["a", "a", "a", "a", "b", "c", "z", "z", "y"]})
    cleaned, _ = apply_plan(df2, plan)
    assert list(cleaned["v"][:6]) == ["a", "a", "a", "a", "b", "c"]   # untouched
    # plan params are clamped: model-emitted min_share=0 cannot force rewrites
    plan2 = {"table_operations": [{"op": "resolve_by_majority", "key_column": "k",
                                   "columns": ["v"], "min_share": 0.0,
                                   "min_group": 1}], "columns": [], "flags": []}
    df3 = pd.DataFrame({"k": ["G1"] * 4, "v": ["a", "b", "b", "c"]})   # 50% max
    cleaned3, _ = apply_plan(df3, plan2)
    assert list(cleaned3["v"]) == ["a", "b", "b", "c"]
    # false-consensus guard: fat minorities (1 of 4 = legitimate updates) decline;
    # thin minorities (1 of 10 = reporting errors) proceed
    # df4: 10 groups of 4 rows, each "m", "m", "m", "x".
    df4 = pd.DataFrame({"k": [f"G{i//4}" for i in range(40)],
                        "v": ["m", "m", "m", "x"] * 10})
    plan4 = {"table_operations": [{"op": "resolve_by_majority", "key_column": "k",
                                   "columns": ["v"]}], "columns": [], "flags": []}
    cleaned4, log4 = apply_plan(df4, plan4)
    entry = next(e for e in log4 if e["op"] == "resolve_by_majority")
    assert entry["cells_changed"] == 0 and "declined" in entry["detail"]
    # df5: 4 groups of 10 rows, each with nine "m" and one "x"; plan4 is reused.
    df5 = pd.DataFrame({"k": [f"G{i//10}" for i in range(40)],
                        "v": (["m"] * 9 + ["x"]) * 4})
    cleaned5, log5 = apply_plan(df5, plan4)
    entry5 = next(e for e in log5 if e["op"] == "resolve_by_majority")
    assert entry5["cells_changed"] == 4                  # thin dissenters resolved
    # date-shaped keys are rejected
    rows = [{"date": f"2024-01-{d+1:02d}", "site": f"site-{r % 3}", "crew": f"c{r % 4}",
             "reading": f"v{r}"} for d in range(25) for r in range(5)]
    assert detect_entity_groups(pd.DataFrame(rows)) is None


def test_union_inherits_vote_op_and_preserves_op_order():
    """union_plans carries over the secondary vote op and keeps column-op order."""
    from scrubdata.verifier import union_plans
    vote_op = {"op": "resolve_by_majority", "key_column": "k",
               "columns": ["v"], "rationale": "vote"}
    text_column = {"name": "t", "operations": [
        {"op": "fix_encoding", "rationale": "enc"},
        {"op": "normalize_punctuation", "rationale": "punct"},
    ]}
    primary = {"table_operations": [], "columns": [], "flags": []}
    secondary = {"table_operations": [vote_op], "columns": [text_column],
                 "flags": []}
    merged = union_plans(primary, secondary)
    assert any(operation["op"] == "resolve_by_majority"
               for operation in merged["table_operations"])
    text_ops = []
    for column in merged["columns"]:
        if column["name"] == "t":
            text_ops.extend(operation["op"] for operation in column["operations"])
    encoding_at = text_ops.index("fix_encoding")
    punctuation_at = text_ops.index("normalize_punctuation")
    assert encoding_at < punctuation_at


def test_fix_encoding_op():
    """_fix_encoding repairs mis-decoded UTF-8 and the planner schedules the op."""
    from scrubdata.executor import _fix_encoding
    via_cp1252 = "café".encode("utf-8").decode("cp1252")
    assert _fix_encoding(via_cp1252) == "café"
    via_latin1 = "naïve résumé".encode("utf-8").decode("latin-1")
    assert _fix_encoding(via_latin1) == "naïve résumé"
    # Clean text passes through untouched.
    assert _fix_encoding("plain text") == "plain text"
    frame = pd.DataFrame({"title": ["cafÃ© latte", "normal row"] * 6})
    plan = mock_plan(frame)
    planned = [operation["op"]
               for column in plan["columns"]
               for operation in column["operations"]]
    assert "fix_encoding" in planned
    cleaned, _log = apply_plan(frame, plan)
    assert cleaned["title"][0] == "café latte"


def test_resolve_by_majority_voting():
    """The planner emits a vote op keyed on ``flight`` and it repairs a minority."""
    # Reports given a deviating value, as (flight, source) pairs.
    corrupted_dep = ((3, 4), (9, 1))
    corrupted_arr = ((7, 2), (12, 0))
    records = []
    for flight in range(25):                  # 25 flights x 5 source reports
        for source in range(5):
            report = (flight, source)
            if report in corrupted_dep:
                dep = "7:59 p.m."
            else:
                dep = f"{(flight % 12) + 1}:58 p.m."
            if report in corrupted_arr:
                arr = "9:40 a.m."
            else:
                arr = f"{(flight % 11) + 1}:10 a.m."
            records.append({"flight": f"AA-{1000+flight}", "src": f"src{source}",
                            "dep": dep, "arr": arr, "gate": f"G{flight}"})
    frame = pd.DataFrame(records)
    plan = mock_plan(frame)
    vote_ops = [operation for operation in plan["table_operations"]
                if operation["op"] == "resolve_by_majority"]
    assert vote_ops
    assert vote_ops[0]["key_column"] == "flight"
    cleaned, log = apply_plan(frame, plan)
    flight_1003 = cleaned[cleaned["flight"] == "AA-1003"]
    assert set(flight_1003["dep"]) == {"4:58 p.m."}
    vote_entry = next(item for item in log if item["op"] == "resolve_by_majority")
    # The minority report was resolved.
    assert vote_entry["cells_changed"] >= 1
    # No key regime: the planner emits no vote op.
    keyless = pd.DataFrame({"a": [str(i) for i in range(40)], "b": ["x"] * 40})
    keyless_plan = mock_plan(keyless)
    assert not any(operation["op"] == "resolve_by_majority"
                   for operation in keyless_plan["table_operations"])


def test_suspects_visibility_high_cardinality():
    """A near-duplicate in a high-cardinality text column is surfaced and repaired."""
    from scrubdata.profiler import profile_column
    # 57 one-off names, one name repeated twice, and a single near-duplicate of it.
    one_offs = [f"unique business {i}" for i in range(57)]
    values = one_offs + ["acme holdings", "acme holdings", "acme holdngs"]
    profile = profile_column(pd.Series(values, name="business"))
    assert profile["detected_semantic_type"] == "text"
    candidates_by_raw = {}
    for suspect in profile["suspect_values"]:
        candidates_by_raw[suspect["raw"]] = suspect["candidates"]
    assert "acme holdngs" in candidates_by_raw
    assert "acme holdings" in candidates_by_raw["acme holdngs"]
    # The suspect list is bounded.
    assert len(profile["suspect_values"]) <= 25
    # The heuristic planner maps the near-duplicate onto the repeated value.
    frame = pd.DataFrame({"business": values})
    plan = mock_plan(frame)
    repairs = {}
    for column in plan["columns"]:
        for operation in column["operations"]:
            if operation["op"] != "canonicalize_categories":
                continue
            for raw, canonical in operation["mapping"].items():
                repairs[raw] = canonical
    assert repairs.get("acme holdngs") == "acme holdings"
    cleaned, _log = apply_plan(frame, plan)
    assert "acme holdngs" not in set(cleaned["business"])
    # The plan is still schema-valid.
    assert is_valid(plan)


def test_suspects_garbage_flagged_not_mapped():
    """A garbage value must not be given a canonicalization target by the planner.

    The column holds 40 distinct "item N" values plus a repeated garbage
    string; the plan must not list that string as a mapping key and must
    still be schema-valid.
    """
    # NOTE(review): the name says "flagged", but no flag is asserted here.
    col = [f"item {i}" for i in range(40)] + ["it€m ’junk", "it€m ’junk"]
    df = pd.DataFrame({"thing": col})
    plan = mock_plan(df)
    # Raw values that any canonicalize_categories op would rewrite.
    mapped_raw = {raw for c in plan["columns"] for o in c["operations"]
                  if o["op"] == "canonicalize_categories" for raw in o["mapping"]}
    assert "it€m ’junk" not in mapped_raw               # no invented target
    assert is_valid(plan)


def test_normalize_punctuation_op():
    """Typographic quotes and dashes are normalised, and only where present."""
    names = ["palm’s thai", "joe‘s “grill”", "a–b — c", "plain's ok"]
    frame = pd.DataFrame({"name": names})
    plan = mock_plan(frame)
    planned = [operation["op"]
               for column in plan["columns"]
               for operation in column["operations"]]
    assert "normalize_punctuation" in planned
    cleaned, _log = apply_plan(frame, plan)
    normalized = cleaned["name"]
    assert normalized[0] == "palm's thai"
    assert normalized[1] == 'joe\'s "grill"'
    assert normalized[2] == "a-b - c"
    # A clean column must NOT get the op.
    clean_plan = mock_plan(pd.DataFrame({"name": ["plain's ok", "also fine"]}))
    clean_ops = [operation["op"]
                 for column in clean_plan["columns"]
                 for operation in column["operations"]]
    assert "normalize_punctuation" not in clean_ops


def test_pair_profile_candidates_and_constraint():
    """candidate_pairs proposes typo repairs; constrain_plan drops the rest."""
    from scrubdata.pair_profile import candidate_pairs, constrain_plan
    values = ["Boston"] * 8 + ["Chicago"] * 6 + ["Bostn", "Chcago", "Qwortelby"]
    pairs = candidate_pairs(values)
    canon_by_raw = {}
    for pair in pairs:
        canon_by_raw[pair["raw"]] = [cand["canon"] for cand in pair["candidates"]]
    assert "Boston" in canon_by_raw.get("Bostn", [])
    assert "Chicago" in canon_by_raw.get("Chcago", [])
    # Garbage gets no candidates.
    assert "Qwortelby" not in canon_by_raw
    # Frequent values are not suspicious.
    assert "Boston" not in canon_by_raw
    proposed = {"Bostn": "Boston", "Chcago": "Dallas", "Qwortelby": "Boston"}
    plan = {
        "columns": [{"name": "city", "operations": [{
            "op": "canonicalize_categories", "rationale": "typos",
            "mapping": proposed}]}],
        "flags": [],
    }
    allowed = [{"raw": pair["raw"],
                "candidates": [cand["canon"] for cand in pair["candidates"]]}
               for pair in pairs]
    constrained = constrain_plan(plan, {"city": allowed})
    kept = constrained["columns"][0]["operations"][0]["mapping"]
    # The off-candidate target and the garbage entry are dropped.
    assert kept == {"Bostn": "Boston"}
    assert constrained["flags"]
    assert constrained["flags"][0]["issue"] == "outside_candidate_pairs"


def test_jellyfish_prompt_construction():
    """Prompt builders embed the record as expected; parsers abstain on bad output."""
    from eval.baselines_learned import di_prompt, ed_prompt, parse_di, parse_ed
    record = {"city": "Bostn", "state": "MA"}
    detection_prompt = ed_prompt(record, "city")
    assert "Record [city: Bostn, state: MA]" in detection_prompt
    assert "Attribute for Verification: [city: Bostn]" in detection_prompt
    assert detection_prompt.endswith("### Response:\n\n")
    imputation_prompt = di_prompt(record, "city", "geography")
    # The flagged attribute is removed from the record.
    assert "Record: [state: MA]" in imputation_prompt
    # The model infers the value and is never shown it.
    assert "city" in imputation_prompt
    assert "Bostn" not in imputation_prompt
    assert parse_ed("Yes, there is an error")
    assert not parse_ed("No.")
    assert parse_di(" Boston ", "Bostn") == "Boston"
    # Abstain on an empty answer.
    assert parse_di("", "Bostn") == "Bostn"
    # Abstain on a rambling, multi-line answer.
    assert parse_di("The value is\nBoston", "Bostn") == "Bostn"


def test_value_counts_profile():
    """The column profile exposes case-sensitive value counts.

    "USA" appears twice and "usa" once, so a count of 2 for "USA" shows the
    two spellings are tallied separately.
    """
    df = pd.DataFrame({"country": ["USA", "USA", "usa", "Canada"]})
    column_profile = profile_dataframe(df)["columns"][0]
    # Check for the key before reading it, so a missing key is reported as an
    # assertion failure instead of a KeyError from the lookup below.
    assert "value_counts" in column_profile
    vc = {value: count for value, count in column_profile["value_counts"]}
    assert vc["USA"] == 2


def test_cli(tmp_path):
    """The CLI exits 0 and writes both the cleaned CSV and the plan file."""
    from scrubdata.cli import main
    cleaned_path = tmp_path / "clean.csv"
    plan_path = tmp_path / "plan.json"
    argv = ["samples/dirty_contacts.csv", "-o", str(cleaned_path),
            "--plan", str(plan_path), "--quiet"]
    exit_code = main(argv)
    assert exit_code == 0
    assert cleaned_path.exists()
    assert plan_path.exists()
    cleaned = pd.read_csv(cleaned_path)
    assert "notes2" not in cleaned.columns
    assert len(cleaned) == 13