"""Generate samples/maria_crm_export.csv — the hero demo dataset. Persona: Maria, ops coordinator, exported the company CRM on Monday morning. The file is engineered against the SHIPPED pipeline (scrubdata.planner.mock_plan + reconcile.grounded_mapping + pii.detect_column_pii) so that: - country / state / plan_tier canonicalize (auto-fix) - dates / mrr / phone normalize (auto-fix) - cp1252->utf8 mojibake repairs - a dedicated credit-card column (Luhn-valid) -> flag_pii + mask_pii - exact + near-duplicate rows -> drop_exact_duplicates / left as judgment - 3 GENUINE 'YOUR CALL' abstentions actually fire as review flags: Slovia -> Slovakia vs Slovenia (score 0.857, margin 0.0) ABSTAIN Austrai -> Australia vs Austria (score 0.875, margin 0.018) ABSTAIN Indai -> India near-miss (score 0.80 < 0.84) ABSTAIN - PLUS the Acme Corp vs ACME Corporation entity near-tie, which the tool correctly does NOT auto-merge (documented as the human judgment call). Deterministic: fixed seed, fixed planted rows. Re-runnable. """ from __future__ import annotations import csv import random random.seed(20260612) HEADER = ["company", "contact_name", "email", "phone", "country", "state", "signup_date", "plan_tier", "mrr", "status", "cc_on_file", "notes"] # --- Luhn-valid fake cards (verified by pii.luhn_ok) ------------------------- CARDS = [ "4539578763621486", "4916 3385 0608 2832", "5500-0055-5555-5559", "4111 1111 1111 1111", "6011000990139424", "3782 822463 10005", "4485275742308327", "5105105105105100", ] # clearly-fake SSN-shaped values for the notes field SSNS = ["123-45-6789", "078-05-1120", "219-09-9999", "457-55-5462"] # canonical pools (each value carries its target frequency intent via how often we draw it) PHONE_STYLES = [ "(555) {a}-{b}", "555.{a}.{b}", "+1 555 {a} {b}", "555{a}{b}", "1-555-{a}-{b}", ] DATE_STYLES = ["iso", "slash", "text", "dmy"] PLAN_VARIANTS = ["Premium", "premium", "PREMIUM", "Prem", "Basic", "basic", "BASIC", "Enterprise", "enterprise", "Ent", "Free", "free"] STATUS_VARIANTS = ["Active", "active", "ACTIVE", "Churned", "churned", "Trial", "trial", "Lapsed", "lapsed"] # Large, distinctive pools so random First+Last pairs DON'T collide into spurious # one-edit "typo" clusters (those wrongly trip the high-cardinality suspect path and # pollute the demo with fake name merges). Distinctive surnames keep edit-distance high. FIRST = ["Maria", "James", "Wei", "Aisha", "Carlos", "Priya", "Tom", "Sofia", "Liam", "Yuki", "Hassan", "Elena", "Jonathan", "Fatima", "Diego", "Anneliese", "Raj", "Chloe", "Omar", "Ingrid", "Pedro", "Gabrielle", "Ivan", "Lucia", "Samuel", "Zoe", "Benjamin", "Mei", "Olivier", "Tatiana", "Kwame", "Beatriz", "Soren", "Yasmin", "Dmitri", "Imani", "Lorenzo", "Saoirse", "Mateo", "Freya"] LAST = ["Hawthorne", "Underwood", "Castellanos", "Okonkwo", "Lindqvist", "Patelkar", "Nakamura", "Rossellini", "Fitzgerald", "Abramovich", "Whitfield", "Delacroix", "Montgomery", "Vasquez", "Kowalski", "Brennan", "Sandoval", "Thornton", "Eriksson", "Bukowski", "Calloway", "Mwangi", "Domingo", "Ferreira", "Halvorsen", "Mackenzie", "Petrosyan", "Yamamoto", "Cavendish", "Olszewski"] # Country surfaces. KEY: every non-canonical spelling must stay rare (freq<3 over the # WHOLE column) so grounded_mapping folds it; the dominant clean surface carries the bulk. # 'USA' is intentionally repeated a LOT so the 4-spellings demo is visible, but because # freq>=3 surfaces are treated as data, the canonicalization is shown via the RARE variants. CLEAN_COUNTRIES = ["United States", "United Kingdom", "Canada", "Germany", "France", "Spain", "Italy", "Netherlands", "Brazil", "Japan", "Australia", "India", "Mexico", "Sweden", "Poland"] # rare misspellings that SHOULD auto-fold (each used exactly once or twice): COUNTRY_TYPOS_AUTOFIX = { "U.S.A": "United States", "us": "United States", "USA.": "United States", "Nigeia": "Nigeria", "germny": "Germany", "Brasil": "Brazil", "Polnd": "Poland", "Swedn": "Sweden", "Frnace": "France", } # THE YOUR-CALL country ambiguities (each used once, must abstain): COUNTRY_ABSTAIN = ["Slovia", "Austrai", "Indai"] CLEAN_STATES = ["CA", "NY", "TX", "WA", "FL", "IL", "MA", "CO", "GA", "OR", "NJ", "AZ"] STATE_TYPOS_AUTOFIX = {"Calfornia": "California", "Wahsington": "Washington", "Virgina": "Virginia", "Mississipi": "Mississippi"} STATE_CLEAN_LONG = ["California", "Texas", "New York", "Florida"] # Company pools. Acme is the planted entity near-tie. COMPANIES = ["Globex", "Initech", "Hooli", "Umbrella", "Stark Industries", "Wayne Enterprises", "Wonka", "Cyberdyne", "Soylent", "Tyrell", "Massive Dynamic", "Vandelay", "Pied Piper", "Aperture", "Black Mesa", "Gekko & Co", "Bluth Company", "Dunder Mifflin", "Prestige Worldwide", "Sterling Cooper"] def _mk_phone() -> str: a, b = random.randint(200, 998), random.randint(1000, 9998) return random.choice(PHONE_STYLES).format(a=a, b=b) def _mk_date() -> str: y, m, d = 2024, random.randint(1, 12), random.randint(1, 28) style = random.choice(DATE_STYLES) months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"] if style == "iso": return f"{y}-{m:02d}-{d:02d}" if style == "slash": return f"{m}/{d}/{str(y)[2:]}" if style == "text": return f"{months[m-1]} {d} {y}" return f"{d:02d}-{m:02d}-{y}" # dmy: 01-03-2024 def _mk_mrr() -> str: base = random.choice([99, 199, 299, 499, 999, 1200, 2400, 4999]) style = random.randint(0, 3) if style == 0: return f"${base:,}.00" if style == 1: return f"{base}.00" if style == 2: return f"{base:,} USD" return str(base) def _mk_name() -> str: n = f"{random.choice(FIRST)} {random.choice(LAST)}" r = random.random() if r < 0.12: return n.upper() if r < 0.22: return n.lower() if r < 0.30: return " " + n + " " # whitespace chaos return n def _mk_email(name: str) -> str: base = name.strip().lower().replace(" ", " ").replace(" ", ".") dom = random.choice(["example.com", "acme.co", "globex.io", "mail.com", "corp.net"]) e = f"{base}@{dom}" if random.random() < 0.15: e = " " + e.upper() + " " # case + whitespace chaos for normalize_email return e def base_row() -> dict: name = _mk_name() return { "company": random.choice(COMPANIES), "contact_name": name, "email": _mk_email(name), "phone": _mk_phone(), "country": random.choice(CLEAN_COUNTRIES), "state": random.choice(CLEAN_STATES), "signup_date": _mk_date(), "plan_tier": random.choice(PLAN_VARIANTS), "mrr": _mk_mrr(), "status": random.choice(STATUS_VARIANTS), "cc_on_file": random.choice(CARDS), "notes": "", } rows: list[dict] = [] # --- bulk filler rows -------------------------------------------------------- for _ in range(330): rows.append(base_row()) # --- planted: Acme entity near-tie (genuine YOUR CALL, tool should NOT merge) - for _ in range(7): r = base_row(); r["company"] = "Acme Corp"; rows.append(r) for _ in range(2): r = base_row(); r["company"] = "acme corp "; rows.append(r) # case+ws -> folds to Acme Corp r = base_row(); r["company"] = "ACME Corp"; rows.append(r) # case -> folds for _ in range(4): r = base_row(); r["company"] = "ACME Corporation"; rows.append(r) # NEAR-TIE: should NOT auto-merge # --- planted: country canonicalization (auto-fix) + abstentions -------------- for surf in COUNTRY_TYPOS_AUTOFIX: r = base_row(); r["country"] = surf; rows.append(r) # extra visible 4-ways for USA so the demo "sees" the chaos (these are freq>=3 = data) for _ in range(4): r = base_row(); r["country"] = "USA"; rows.append(r) for _ in range(3): r = base_row(); r["country"] = "U.S.A."; rows.append(r) # THE YOUR CALL country ambiguities — one row each, rare so they reach the abstain path for surf in COUNTRY_ABSTAIN: r = base_row(); r["country"] = surf; rows.append(r) # --- planted: state canonicalization ----------------------------------------- for surf in STATE_TYPOS_AUTOFIX: r = base_row(); r["state"] = surf; rows.append(r) for surf in STATE_CLEAN_LONG: r = base_row(); r["state"] = surf; rows.append(r) # --- planted: mojibake (cp1252 -> utf8) -------------------------------------- MOJIBAKE = [ ("Café Noir SARL", "André Mercier"), # Café Noir / André ("Naïve Ventures", "Björn Anderssøn"), # Naïve / Björn ("Müller GmbH", "Günther Müller"), # Müller / Günther ("Crème Brûlée Co", "François Dubois"), # Crème / François ] for comp, person in MOJIBAKE: r = base_row(); r["company"] = comp; r["contact_name"] = person; rows.append(r) # --- planted: PII in notes (SSN-shaped) -------------------------------------- for ssn in SSNS: r = base_row(); r["notes"] = f"follow up re: SSN {ssn} on file"; rows.append(r) # --- planted: exact + near duplicate rows ------------------------------------ dup_seed = base_row() dup_seed["company"] = "Initech"; dup_seed["contact_name"] = "Peter Gibbons" dup_seed["email"] = "peter.gibbons@initech.com"; dup_seed["country"] = "United States" dup_seed["state"] = "TX"; dup_seed["cc_on_file"] = "4111 1111 1111 1111" rows.append(dict(dup_seed)) rows.append(dict(dup_seed)) # EXACT duplicate near = dict(dup_seed); near["phone"] = "(555) 867-5309" # near-dup: one field differs rows.append(near) near2 = dict(dup_seed); near2["contact_name"] = " Peter Gibbons " # near-dup: whitespace rows.append(near2) # --- shuffle so planted rows aren't clustered (keep exact dups adjacent-able) - random.shuffle(rows) with open("samples/maria_crm_export.csv", "w", newline="", encoding="utf-8") as fh: w = csv.DictWriter(fh, fieldnames=HEADER) w.writeheader() for r in rows: w.writerow(r) print(f"wrote samples/maria_crm_export.csv ({len(rows)} rows x {len(HEADER)} cols)")