Spaces:

build-small-hackathon
/

scrubdata

Running

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 11 days ago

25 kB

	"""Engine regression tests: profiler detection, every executor op, end-to-end."""

	import math

	import pandas as pd
	import pytest

	from scrubdata import apply_plan, mock_plan, profile_dataframe
	from scrubdata import detect
	from eval.metrics import is_valid


	def _col_plan(name, ops):
	return {"table_operations": [], "flags": [],
	"columns": [{"name": name, "operations": ops}]}


	def _apply(series_vals, ops, col="x"):
	    """Run *ops* against a one-column frame and return that column as a list.

	    The frame holds *series_vals* under the column name *col*; the execution
	    log returned by ``apply_plan`` is discarded.
	    """
	    frame = pd.DataFrame({col: series_vals})
	    single_column_plan = _col_plan(col, ops)
	    cleaned, _log = apply_plan(frame, single_column_plan)
	    return cleaned[col].tolist()


	# ---- value-level ops --------------------------------------------------------

	def test_strip_whitespace():
	    """strip_whitespace trims both ends and keeps the interior space."""
	    cleaned = _apply([" a b ", "c"], [{"op": "strip_whitespace"}])
	    assert cleaned == ["a b", "c"]


	def test_parse_currency_us_eu_accounting():
	    """parse_currency handles US, EU, accounting-negative and bare amounts."""
	    raw = ["$1,200.50", "1.200,50", "(500)", "950"]
	    expected = [1200.50, 1200.50, -500.0, 950.0]
	    assert _apply(raw, [{"op": "parse_currency"}]) == expected


	def test_parse_percent():
	    """parse_percent turns "12.5%" / "100%" into the fractions 0.125 / 1.0."""
	    parsed = _apply(["12.5%", "100%"], [{"op": "parse_percent"}])
	    assert parsed[0] == pytest.approx(0.125)
	    assert parsed[1] == pytest.approx(1.0)


	def test_parse_date_formats():
	    """parse_date normalises ISO, slash, month-name and Excel-serial inputs."""
	    raw = ["2023-01-05", "1/6/2023", "5 Jan 2023", "44931"]
	    # the last input is an Excel serial number
	    expected = ["2023-01-05", "2023-01-06", "2023-01-05", "2023-01-05"]
	    parsed = _apply(raw, [{"op": "parse_date"}])
	    for position, want in enumerate(expected):
	        assert parsed[position] == want


	def test_standardize_boolean():
	    """standardize_boolean maps the common yes/no spellings to real booleans."""
	    truthy = ["Yes", "Y", "TRUE", "1"]
	    falsy = ["No", "N", "FALSE", "0"]
	    converted = _apply(truthy + falsy, [{"op": "standardize_boolean"}])
	    assert converted == [True] * 4 + [False] * 4


	def test_standardize_phone_us():
	    """A dotted 10-digit number is rendered as "(AAA) BBB-CCCC"."""
	    formatted = _apply(["555.123.4567"], [{"op": "standardize_phone"}])
	    assert formatted == ["(555) 123-4567"]


	def test_normalize_email():
	    """normalize_email trims the padding and lower-cases the address."""
	    normalized = _apply([" Bob@X.COM "], [{"op": "normalize_email"}])
	    assert normalized == ["bob@x.com"]


	def test_standardize_case():
	    """standardize_case with case="title" title-cases every word."""
	    title_op = {"op": "standardize_case", "case": "title"}
	    assert _apply(["hello WORLD"], [title_op]) == ["Hello World"]


	def test_normalize_disguised_nulls():
	    """"N/A" becomes a missing value while a real value is left unchanged."""
	    values = _apply(["N/A", "-", "real"], [{"op": "normalize_disguised_nulls"}])
	    first, last = values[0], values[2]
	    assert first is None or pd.isna(first)
	    assert last == "real"


	def test_canonicalize_categories():
	    """canonicalize_categories rewrites each raw value through the given mapping."""
	    mapping = {"usa": "United States", "U.S.A": "United States"}
	    canonicalize = {"op": "canonicalize_categories", "mapping": mapping}
	    assert _apply(["usa", "U.S.A"], [canonicalize]) == ["United States"] * 2


	# ---- table-level + end-to-end ----------------------------------------------

	def test_drop_dupes_empty_rows_cols():
	    """After the three table ops only column "a" with the single value "1" remains."""
	    frame = pd.DataFrame({"a": ["1", "1", ""], "junk": ["", "", ""]})
	    table_ops = [
	        {"op": "drop_empty_columns", "columns": ["junk"]},
	        {"op": "drop_empty_rows"},
	        {"op": "drop_exact_duplicates"},
	    ]
	    plan = {"table_operations": table_ops, "columns": [], "flags": []}
	    cleaned, _ = apply_plan(frame, plan)
	    assert list(cleaned.columns) == ["a"]
	    assert cleaned["a"].tolist() == ["1"]


	def test_sample_end_to_end():
	    """Profile -> heuristic plan -> execute on the bundled dirty-contacts sample."""
	    raw = pd.read_csv("samples/dirty_contacts.csv", dtype=str, keep_default_na=False)
	    profile = profile_dataframe(raw)
	    plan = mock_plan(raw, profile)
	    cleaned, _log = apply_plan(raw, plan)
	    # the empty column is dropped
	    assert "notes2" not in cleaned.columns
	    # 16 rows -> dedup / empty-row removal -> 13 rows
	    assert len(cleaned) == 13
	    # the generated plan conforms to the schema
	    assert is_valid(plan)


	def test_phone_conservatism():
	    """An already-consistent phone column must not get a standardize_phone op."""
	    frame = pd.DataFrame({"phone": ["5551234567", "5559876543", "5550001111"]})
	    plan = mock_plan(frame)
	    emitted = []
	    for column in plan["columns"]:
	        emitted.extend(operation["op"] for operation in column["operations"])
	    assert "standardize_phone" not in emitted


	def test_detect_types():
	    """detect_semantic_type recognises an email column and a yes/no column."""
	    email_type = detect.detect_semantic_type("email", ["a@b.com", "c@d.com"])
	    assert email_type == "email"
	    boolean_type = detect.detect_semantic_type("x", ["Yes", "No", "Y"])
	    assert boolean_type == "boolean"


	def test_batched_planner():
	    """The column-batching wrapper merges per-batch plans plus table-level ops."""
	    from scrubdata.model_planner import make_batched_planner
	    from scrubdata.planner import mock_plan
	    frame = pd.read_csv("samples/dirty_contacts.csv", dtype=str, keep_default_na=False)
	    batched = make_batched_planner(mock_plan, batch_size=3)
	    plan = batched(frame)
	    planned_columns = {column["name"] for column in plan["columns"]}
	    # columns from different batches are all present in the merged plan
	    assert {"country", "amount"} <= planned_columns
	    table_op_names = [operation["op"] for operation in plan["table_operations"]]
	    assert "drop_empty_columns" in table_op_names
	    assert is_valid(plan)


	def test_reconcile_grounds_and_abstains():
	    """The default index resolves known/near-miss values and abstains on unknowns."""
	    from scrubdata.reconcile import default_index
	    index = default_index()
	    exact = index.reconcile("USA", "country")
	    assert exact[0] == "United States"
	    fuzzy = index.reconcile("Germny", "country")
	    assert fuzzy[0] == "Germany"
	    # nothing close enough in the reference -> abstain with None
	    assert index.reconcile("Xyzzylandia", "country") is None
	    state = index.reconcile("Califrnia", "state")
	    assert state[0] == "California"


	def test_grounded_planner_no_wrong_merge():
	    """A typo of a present city is repaired; an unknown town is not merged away.

	    'guntxrsvillx' must NOT be mapped to "Huntsville" (the regression being
	    guarded is guntxrsvillx->huntsville).
	    """
	    locations = ["birminghxm", "Birmingham", "guntxrsvillx", "Chicago",
	                 "Chcago", "Birmingham", "Chicago", "Birmingham"]
	    plan = mock_plan(pd.DataFrame({"loc": locations}))
	    mapping = {}
	    for column in plan["columns"]:
	        for operation in column["operations"]:
	            if operation["op"] == "canonicalize_categories":
	                mapping.update(operation["mapping"])
	    assert mapping.get("birminghxm") == "Birmingham"
	    assert mapping.get("guntxrsvillx", "") != "Huntsville"


	def test_grounded_wrapper_overrides_model_overcorrection():
	    """The grounded wrapper corrects an over-correcting model and flags the column."""
	    from scrubdata.grounded import make_grounded_planner
	    # what the fake model proposes: an invented canonical and a wrong merge
	    overcorrected = {"birminghxm": "Birmingham City USA",
	                     "guntxrsvillx": "Huntsville"}
	    def fake_model(df, *a):
	        city_column = {
	            "name": "city",
	            "detected_semantic_type": "categorical",
	            "issues": [],
	            "operations": [{"op": "canonicalize_categories",
	                            "mapping": overcorrected}],
	        }
	        return {"table_operations": [], "flags": [], "columns": [city_column]}
	    cities = ["birminghxm", "Birmingham", "guntxrsvillx", "Chicago",
	              "Chcago", "Birmingham", "Chicago", "Birmingham"]
	    plan = make_grounded_planner(fake_model)(pd.DataFrame({"city": cities}))
	    merged = {}
	    for column in plan["columns"]:
	        for operation in column["operations"]:
	            if operation["op"] == "canonicalize_categories":
	                merged.update(operation["mapping"])
	    # grounded to the value present in the data, not "Birmingham City USA"
	    assert merged.get("birminghxm") == "Birmingham"
	    # the wrong merge is blocked
	    assert merged.get("guntxrsvillx", "") != "Huntsville"
	    # the column carries a review flag
	    assert any(flag["column"] == "city" for flag in plan["flags"])


	def test_pii_validators():
	    """Checksum validators accept valid card/IBAN numbers and reject altered ones."""
	    from scrubdata.pii import luhn_ok, _is_credit_card, _is_iban
	    valid_card = "4532015112830366"
	    assert luhn_ok(valid_card)
	    assert not luhn_ok("4532015112830367")
	    assert _is_credit_card("4532-0151-1283-0366")
	    # sixteen digits that fail the Luhn check
	    assert not _is_credit_card("1234567890123456")
	    assert _is_iban("DE89370400440532013000")
	    # same IBAN with the last digit changed fails mod-97
	    assert not _is_iban("DE89370400440532013001")


	def test_pii_column_detection_and_negatives():
	    """Card and SSN columns are detected; city and quantity columns are not."""
	    from scrubdata.pii import detect_column_pii
	    card_numbers = ["4532015112830366", "4716461583322103",
	                    "5425233430109903", "4024007103939509"]
	    card_hit = detect_column_pii("card", card_numbers)
	    assert card_hit
	    assert card_hit["pii_type"] == "credit_card"
	    assert card_hit["checksum"]
	    ssn_hit = detect_column_pii("ssn", ["123-45-6789", "987-65-4321", "111-22-3333"])
	    assert ssn_hit
	    assert ssn_hit["pii_type"] == "ssn"
	    assert detect_column_pii("city", ["Boston", "Chicago", "Dallas", "Boston"]) is None
	    assert detect_column_pii("qty", ["1", "2", "3", "4", "5"]) is None


	def test_pii_planner_masks_and_never_reformats_identifiers():
	    """Card numbers are flagged and masked; emails are flagged only; city is untouched."""
	    from scrubdata.pii import detect_column_pii
	    # the last card value fails Luhn, so 4 of 5 (80%) pass the checksum
	    cards = ["4532015112830366", "4716461583322103", "5425233430109903",
	             "4024007103939509", "370434978549371"]
	    emails = ["ana@corp.io", "luis@mail.com", "sofia@test.org", "raul@corp.io",
	              "mia@mail.com"]
	    cities = ["Boston", "Chicago", "Boston", "Dallas", "Chicago"]
	    frame = pd.DataFrame({"card": cards, "email": emails, "city": cities})
	    plan = mock_plan(frame)
	    assert is_valid(plan)
	    ops = {}
	    for column in plan["columns"]:
	        ops[column["name"]] = [operation["op"] for operation in column["operations"]]
	    # checksum-confirmed at 80% coverage -> still auto-masked, never parse_number'd
	    assert ops["card"] == ["flag_pii", "mask_pii"]
	    assert "flag_pii" in ops["email"]
	    assert "mask_pii" not in ops["email"]
	    city_ops = ops.get("city", [])
	    assert not any("pii" in name for name in city_ops)
	    cleaned, _ = apply_plan(frame, plan)
	    # no card number is detectable after masking
	    assert detect_column_pii("card", cleaned["card"].tolist()) is None
	    first_card = cleaned["card"][0]
	    assert first_card.endswith("0366")
	    assert first_card.startswith("*")
	    # email is flagged but its value is left intact
	    assert cleaned["email"][0] == "ana@corp.io"


	def test_pii_hash_and_pseudonymize_deterministic():
	    """Hashing and pseudonymisation are repeatable for equal inputs; the hash varies with its second argument."""
	    from scrubdata.pii import hash_value, pseudonymize_value
	    card = "4532015112830366"
	    digest = hash_value(card, "s1")
	    assert digest == hash_value(card, "s1")
	    assert hash_value(card, "s1") != hash_value(card, "s2")
	    alias = pseudonymize_value("ana@corp.io", "s1", "email")
	    # the same input yields the same pseudonym, so joins stay stable
	    assert alias == pseudonymize_value("ana@corp.io", "s1", "email")
	    assert alias.startswith("EMAIL_")
	    assert "ana" not in alias


	def test_active_planner_defaults_to_heuristic(monkeypatch):
	    """With SCRUBDATA_MODEL unset, get_planner returns mock_plan itself."""
	    # clear the variable before importing scrubdata.active
	    monkeypatch.delenv("SCRUBDATA_MODEL", raising=False)
	    from scrubdata.active import get_planner
	    from scrubdata.planner import mock_plan
	    selected = get_planner()
	    assert selected is mock_plan


	def test_union_plans_model_wins_and_heuristic_extends():
	    """union_plans keeps primary mappings on conflict and adds secondary coverage."""
	    from scrubdata.verifier import union_plans
	    primary_city = {"name": "city", "operations": [
	        {"op": "canonicalize_categories", "mapping": {"bostn": "Boston"}}]}
	    primary = {"columns": [primary_city], "flags": []}
	    secondary_city = {"name": "city", "operations": [
	        {"op": "canonicalize_categories",
	         "mapping": {"bostn": "BOSTON", "chcago": "Chicago"}}]}
	    secondary_state = {"name": "state", "operations": [
	        {"op": "canonicalize_categories", "mapping": {"texs": "Texas"}}]}
	    secondary = {"columns": [secondary_city, secondary_state]}
	    merged = union_plans(primary, secondary)
	    maps = {}
	    for column in merged["columns"]:
	        maps[column["name"]] = column["operations"][0]["mapping"]
	    # primary wins on conflict
	    assert maps["city"]["bostn"] == "Boston"
	    # secondary extends coverage
	    assert maps["city"]["chcago"] == "Chicago"
	    # a secondary-only column is added
	    assert maps["state"] == {"texs": "Texas"}
	    # the primary input was not mutated
	    assert primary["columns"][0]["operations"][0]["mapping"] == {"bostn": "Boston"}


	def test_active_planner_is_verified_union(monkeypatch):
	    """With SCRUBDATA_MODEL set, the planner labels its output as a fallback.

	    The model backend isn't reachable in tests, so every batch falls back to the
	    heuristic; get_planner must return the verified-union wrapper (only it emits
	    this honest label) and tag the plan as fallback rather than claiming the
	    model ran.
	    """
	    monkeypatch.setenv("SCRUBDATA_MODEL", "test-model")
	    from scrubdata.active import get_planner
	    planner = get_planner()
	    cities = ["Boston", "Boston", "Bostn", "Chicago", "Chicago"]
	    plan = planner(pd.DataFrame({"city": cities}))
	    expected_label = "deterministic (model unavailable, fell back)"
	    assert plan["_generated_by"] == expected_label
	    assert is_valid(plan)


	def test_convention_gates_regression():
	    """Consistent date/percent conventions are gated; mixed columns still get parsed."""
	    from scrubdata import detect
	    from scrubdata.executor import _parse_percent, _standardize_phone
	    dates_consistent = detect.date_formats_consistent
	    percents_consistent = detect.percent_formats_consistent
	    # date gate: uniform slash / uniform month-name = consistent; mixed = not
	    assert dates_consistent(["1/4/2016", "12/23/2015", "3/7/2014"])
	    assert dates_consistent(["28 July 2016", "4 May 2015"])
	    mixed_dates = ["1/4/2016", "2015-12-23", "3/7/2014", "2014-01-02"]
	    assert not dates_consistent(mixed_dates)
	    # 90% boundary: 1 stray in 20 stays consistent
	    assert dates_consistent(["1/4/2016"] * 19 + ["2016-01-04"])
	    # percent gate: uniform-% gated; one stray of 20 still gated (no cliff)
	    assert percents_consistent(["10%", "20%", "30%"])
	    assert percents_consistent(["10%"] * 19 + ["0.6"])
	    assert not percents_consistent(["10%", "0.2", "0.3"])
	    # parse_percent abstains on bare values instead of /100 corruption
	    assert _parse_percent("0.6") == "0.6"
	    assert _parse_percent("45%") == 0.45
	    # zip guard + Excel-serial name gate + 7-digit phone
	    type_cases = (
	        ("zipcode(long)", ["40231", "40213"] * 10, "text"),
	        ("zcta", ["48371", "48380"] * 10, "text"),
	        ("record_id", ["40231", "40213"] * 10, "number"),
	    )
	    for header, values, expected in type_cases:
	        assert detect.detect_semantic_type(header, values) == expected
	    assert _standardize_phone("454.1763") == "454-1763"
	    # end-to-end: consistent date column -> NO parse_date op + minority flagged
	    consistent = pd.DataFrame(
	        {"issue_date": ["1/4/2016"] * 18 + ["1/5/2016", "2016-01-04"]})
	    plan = mock_plan(consistent)
	    emitted = []
	    for column in plan["columns"]:
	        emitted.extend(operation["op"] for operation in column["operations"])
	    assert "parse_date" not in emitted
	    assert any(flag["issue"] == "off_convention_dates" for flag in plan["flags"])
	    # mixed date column -> op present
	    mixed = pd.DataFrame(
	        {"start": ["1/4/2016", "2015-12-23", "Apr-2014", "04/16/23"] * 5})
	    emitted_mixed = []
	    for column in mock_plan(mixed)["columns"]:
	        emitted_mixed.extend(operation["op"] for operation in column["operations"])
	    assert "parse_date" in emitted_mixed


	def test_verifier_gates_model_format_ops():
	    """verify_plan removes parse_date/parse_percent on uniform columns and flags both."""
	    from scrubdata.verifier import verify_plan
	    frame = pd.DataFrame({
	        "d": ["1/4/2016", "2/5/2016", "3/6/2016"] * 4,
	        "p": ["10%", "20%", "30%"] * 4,
	    })
	    proposed_columns = [
	        {"name": "d", "operations": [{"op": "parse_date", "rationale": "x"}]},
	        {"name": "p", "operations": [{"op": "parse_percent", "rationale": "x"}]},
	    ]
	    model_plan = {"table_operations": [], "flags": [], "columns": proposed_columns}
	    verified = verify_plan(frame, model_plan, tau=0.5)
	    surviving = []
	    for column in verified["columns"]:
	        surviving.extend(operation["op"] for operation in column["operations"])
	    assert "parse_date" not in surviving
	    assert "parse_percent" not in surviving
	    preserved = [flag for flag in verified["flags"]
	                 if flag["issue"] == "convention_preserved"]
	    assert len(preserved) == 2


	def test_voting_guards_regression():
	    """Guard rails around entity-group detection and resolve_by_majority."""
	    from scrubdata.planner import detect_entity_groups
	    from scrubdata.executor import apply_plan
	    # numeric votable column: detection excludes it; executor never crashes
	    rows = [
	        {
	            "sku": f"SKU-{f}",
	            "src": f"s{s}",
	            "label": ("okk" if (f == 2 and s == 1) else "ok") + str(f % 4),
	            "qty": f * 10 + s,
	        }
	        for f in range(25)
	        for s in range(5)
	    ]
	    df = pd.DataFrame(rows)
	    df["qty"] = df["qty"].astype("int64")
	    eg = detect_entity_groups(df)
	    if eg:
	        assert "qty" not in eg[1]
	    # must not raise
	    apply_plan(df, mock_plan(df))
	    # missing-like keys never form an entity group
	    default_vote = {"op": "resolve_by_majority", "key_column": "k", "columns": ["v"]}
	    plan = {"table_operations": [default_vote], "columns": [], "flags": []}
	    df2 = pd.DataFrame({
	        "k": ["N/A"] * 6 + ["X-1"] * 3,
	        "v": ["a", "a", "a", "a", "b", "c", "z", "z", "y"],
	    })
	    cleaned, _ = apply_plan(df2, plan)
	    # the six "N/A"-keyed rows are untouched
	    assert list(cleaned["v"][:6]) == ["a", "a", "a", "a", "b", "c"]
	    # plan params are clamped: model-emitted min_share=0 cannot force rewrites
	    permissive_vote = {"op": "resolve_by_majority", "key_column": "k",
	                       "columns": ["v"], "min_share": 0.0, "min_group": 1}
	    plan2 = {"table_operations": [permissive_vote], "columns": [], "flags": []}
	    # the most common value holds only 50% of the group
	    df3 = pd.DataFrame({"k": ["G1"] * 4, "v": ["a", "b", "b", "c"]})
	    cleaned3, _ = apply_plan(df3, plan2)
	    assert list(cleaned3["v"]) == ["a", "b", "b", "c"]
	    # false-consensus guard: fat minorities (1 of 4 = legitimate updates) decline;
	    # thin minorities (1 of 10 = reporting errors) proceed
	    df4 = pd.DataFrame({
	        "k": [f"G{i//4}" for i in range(40)],
	        "v": ["m", "m", "m", "x"] * 10,
	    })
	    plan4 = {
	        "table_operations": [
	            {"op": "resolve_by_majority", "key_column": "k", "columns": ["v"]},
	        ],
	        "columns": [],
	        "flags": [],
	    }
	    cleaned4, log4 = apply_plan(df4, plan4)
	    entry = next(e for e in log4 if e["op"] == "resolve_by_majority")
	    assert entry["cells_changed"] == 0
	    assert "declined" in entry["detail"]
	    df5 = pd.DataFrame({
	        "k": [f"G{i//10}" for i in range(40)],
	        "v": (["m"] * 9 + ["x"]) * 4,
	    })
	    # the same plan object is reused for the thin-minority frame
	    cleaned5, log5 = apply_plan(df5, plan4)
	    entry5 = next(e for e in log5 if e["op"] == "resolve_by_majority")
	    # one dissenter per group, four groups -> four cells resolved
	    assert entry5["cells_changed"] == 4
	    # date-shaped keys are rejected
	    dated_rows = [
	        {
	            "date": f"2024-01-{d+1:02d}",
	            "site": f"site-{r % 3}",
	            "crew": f"c{r % 4}",
	            "reading": f"v{r}",
	        }
	        for d in range(25)
	        for r in range(5)
	    ]
	    assert detect_entity_groups(pd.DataFrame(dated_rows)) is None


	def test_union_inherits_vote_op_and_preserves_op_order():
	    """union_plans carries over the secondary's vote op and keeps column-op order."""
	    from scrubdata.verifier import union_plans
	    primary = {"table_operations": [], "columns": [], "flags": []}
	    vote_op = {"op": "resolve_by_majority", "key_column": "k",
	               "columns": ["v"], "rationale": "vote"}
	    text_column = {"name": "t", "operations": [
	        {"op": "fix_encoding", "rationale": "enc"},
	        {"op": "normalize_punctuation", "rationale": "punct"},
	    ]}
	    secondary = {"table_operations": [vote_op], "columns": [text_column], "flags": []}
	    merged = union_plans(primary, secondary)
	    table_op_names = [operation["op"] for operation in merged["table_operations"]]
	    assert "resolve_by_majority" in table_op_names
	    t_ops = []
	    for column in merged["columns"]:
	        if column["name"] == "t":
	            t_ops.extend(operation["op"] for operation in column["operations"])
	    assert t_ops.index("fix_encoding") < t_ops.index("normalize_punctuation")


	def test_fix_encoding_op():
	    """_fix_encoding repairs mojibake, and the planner emits fix_encoding for it."""
	    from scrubdata.executor import _fix_encoding
	    via_cp1252 = "café".encode("utf-8").decode("cp1252")
	    assert _fix_encoding(via_cp1252) == "café"
	    via_latin1 = "naïve résumé".encode("utf-8").decode("latin-1")
	    assert _fix_encoding(via_latin1) == "naïve résumé"
	    # clean text passes through untouched
	    assert _fix_encoding("plain text") == "plain text"
	    frame = pd.DataFrame({"title": ["cafÃ© latte", "normal row"] * 6})
	    plan = mock_plan(frame)
	    emitted = []
	    for column in plan["columns"]:
	        emitted.extend(operation["op"] for operation in column["operations"])
	    assert "fix_encoding" in emitted
	    cleaned, _ = apply_plan(frame, plan)
	    assert cleaned["title"][0] == "café latte"


	def test_resolve_by_majority_voting():
	    """The planner emits a vote keyed on "flight" and the executor fixes outliers."""
	    # two corrupted departure reports and two corrupted arrival reports,
	    # each in a different flight group
	    bad_departures = ((3, 4), (9, 1))
	    bad_arrivals = ((7, 2), (12, 0))
	    rows = []
	    # 25 flights x 5 source reports
	    for f in range(25):
	        for s in range(5):
	            report = (f, s)
	            dep = "7:59 p.m." if report in bad_departures else f"{(f % 12) + 1}:58 p.m."
	            arr = "9:40 a.m." if report in bad_arrivals else f"{(f % 11) + 1}:10 a.m."
	            rows.append({"flight": f"AA-{1000+f}", "src": f"src{s}", "dep": dep,
	                         "arr": arr, "gate": f"G{f}"})
	    frame = pd.DataFrame(rows)
	    plan = mock_plan(frame)
	    vote = [operation for operation in plan["table_operations"]
	            if operation["op"] == "resolve_by_majority"]
	    assert vote
	    assert vote[0]["key_column"] == "flight"
	    cleaned, log = apply_plan(frame, plan)
	    flight_1003 = cleaned[cleaned["flight"] == "AA-1003"]
	    assert set(flight_1003["dep"]) == {"4:58 p.m."}
	    entry = next(e for e in log if e["op"] == "resolve_by_majority")
	    # at least the minority report was resolved
	    assert entry["cells_changed"] >= 1
	    # no key regime -> no vote op
	    keyless = pd.DataFrame({"a": [str(i) for i in range(40)], "b": ["x"] * 40})
	    keyless_ops = [operation["op"] for operation in mock_plan(keyless)["table_operations"]]
	    assert "resolve_by_majority" not in keyless_ops


	def test_suspects_visibility_high_cardinality():
	    """A near-duplicate in a high-cardinality text column is surfaced and repaired."""
	    from scrubdata.profiler import profile_column
	    # 60 rows: 57 unique names, a value repeated twice, and one near-dup of it
	    values = [f"unique business {i}" for i in range(57)]
	    values = values + ["acme holdings", "acme holdings", "acme holdngs"]
	    prof = profile_column(pd.Series(values, name="business"))
	    assert prof["detected_semantic_type"] == "text"
	    suspects = {}
	    for suspect in prof["suspect_values"]:
	        suspects[suspect["raw"]] = suspect["candidates"]
	    assert "acme holdngs" in suspects
	    assert "acme holdings" in suspects["acme holdngs"]
	    # the suspect list is bounded
	    assert len(prof["suspect_values"]) <= 25
	    # the heuristic plan maps the near-dup onto the repeated value
	    frame = pd.DataFrame({"business": values})
	    plan = mock_plan(frame)
	    repairs = {}
	    for column in plan["columns"]:
	        for operation in column["operations"]:
	            if operation["op"] == "canonicalize_categories":
	                repairs.update(operation["mapping"])
	    assert repairs.get("acme holdngs") == "acme holdings"
	    cleaned, _ = apply_plan(frame, plan)
	    assert "acme holdngs" not in set(cleaned["business"])
	    # plan still schema-valid
	    assert is_valid(plan)


	def test_suspects_garbage_flagged_not_mapped():
	    """A garbage value must not appear as a key in any canonicalize mapping.

	    The unused function-local import of ``profile_column`` was removed; this
	    test only exercises ``mock_plan`` and ``is_valid``.
	    """
	    values = [f"item {i}" for i in range(40)] + ["it€m ’junk", "it€m ’junk"]
	    plan = mock_plan(pd.DataFrame({"thing": values}))
	    mapped_raws = set()
	    for column in plan["columns"]:
	        for operation in column["operations"]:
	            if operation["op"] == "canonicalize_categories":
	                mapped_raws.update(operation["mapping"])
	    # no invented target for the garbage value
	    assert "it€m ’junk" not in mapped_raws
	    assert is_valid(plan)


	def test_normalize_punctuation_op():
	    """Curly quotes and long dashes are normalised; a clean column gets no op."""
	    dirty = ["palm’s thai", "joe‘s “grill”", "a–b — c", "plain's ok"]
	    frame = pd.DataFrame({"name": dirty})
	    plan = mock_plan(frame)
	    emitted = []
	    for column in plan["columns"]:
	        emitted.extend(operation["op"] for operation in column["operations"])
	    assert "normalize_punctuation" in emitted
	    cleaned, _ = apply_plan(frame, plan)
	    names = cleaned["name"]
	    assert names[0] == "palm's thai"
	    assert names[1] == 'joe\'s "grill"'
	    assert names[2] == "a-b - c"
	    # a clean column must NOT get the op
	    clean_plan = mock_plan(pd.DataFrame({"name": ["plain's ok", "also fine"]}))
	    clean_emitted = []
	    for column in clean_plan["columns"]:
	        clean_emitted.extend(operation["op"] for operation in column["operations"])
	    assert "normalize_punctuation" not in clean_emitted


	def test_pair_profile_candidates_and_constraint():
	    """candidate_pairs proposes typo targets; constrain_plan drops off-candidate maps."""
	    from scrubdata.pair_profile import candidate_pairs, constrain_plan
	    values = ["Boston"] * 8 + ["Chicago"] * 6 + ["Bostn", "Chcago", "Qwortelby"]
	    pairs = candidate_pairs(values)
	    by_raw = {}
	    for pair in pairs:
	        by_raw[pair["raw"]] = [candidate["canon"] for candidate in pair["candidates"]]
	    assert "Boston" in by_raw.get("Bostn", [])
	    assert "Chicago" in by_raw.get("Chcago", [])
	    # garbage gets no candidates
	    assert "Qwortelby" not in by_raw
	    # frequent values are not suspicious
	    assert "Boston" not in by_raw
	    proposed = {"Bostn": "Boston", "Chcago": "Dallas", "Qwortelby": "Boston"}
	    city_column = {"name": "city", "operations": [{
	        "op": "canonicalize_categories", "rationale": "typos",
	        "mapping": proposed}]}
	    plan = {"columns": [city_column], "flags": []}
	    allowed = []
	    for pair in pairs:
	        canon_names = [candidate["canon"] for candidate in pair["candidates"]]
	        allowed.append({"raw": pair["raw"], "candidates": canon_names})
	    constrained = constrain_plan(plan, {"city": allowed})
	    kept = constrained["columns"][0]["operations"][0]["mapping"]
	    # the off-candidate target and the garbage key are dropped
	    assert kept == {"Bostn": "Boston"}
	    assert constrained["flags"]
	    assert constrained["flags"][0]["issue"] == "outside_candidate_pairs"


	def test_jellyfish_prompt_construction():
	    """Error-detection / imputation prompts and their response parsers."""
	    from eval.baselines_learned import di_prompt, ed_prompt, parse_di, parse_ed
	    record = {"city": "Bostn", "state": "MA"}
	    detection = ed_prompt(record, "city")
	    assert "Record [city: Bostn, state: MA]" in detection
	    assert "Attribute for Verification: [city: Bostn]" in detection
	    assert detection.endswith("### Response:\n\n")
	    imputation = di_prompt(record, "city", "geography")
	    # the flagged attribute is removed from the record shown to the model
	    assert "Record: [state: MA]" in imputation
	    # the attribute name is present but its raw value is not
	    assert "city" in imputation
	    assert "Bostn" not in imputation
	    assert parse_ed("Yes, there is an error")
	    assert not parse_ed("No.")
	    assert parse_di(" Boston ", "Bostn") == "Boston"
	    # an empty reply keeps the original value
	    assert parse_di("", "Bostn") == "Bostn"
	    # a multi-line reply keeps the original value
	    assert parse_di("The value is\nBoston", "Bostn") == "Bostn"


	def test_value_counts_profile():
	    """The column profile exposes value_counts as (value, count) pairs."""
	    frame = pd.DataFrame({"country": ["USA", "USA", "usa", "Canada"]})
	    first_column = profile_dataframe(frame)["columns"][0]
	    counts = {value: count for value, count in first_column["value_counts"]}
	    assert counts["USA"] == 2
	    assert "value_counts" in first_column


	def test_cli(tmp_path):
	    """The CLI writes the cleaned CSV and the plan JSON and returns 0."""
	    from scrubdata.cli import main
	    out = tmp_path / "clean.csv"
	    plan = tmp_path / "plan.json"
	    argv = ["samples/dirty_contacts.csv", "-o", str(out), "--plan", str(plan), "--quiet"]
	    exit_code = main(argv)
	    assert exit_code == 0
	    assert out.exists()
	    assert plan.exists()
	    cleaned = pd.read_csv(out)
	    assert "notes2" not in cleaned.columns
	    assert len(cleaned) == 13