scrubdata / samples /make_maria_crm.py
OpenAI Codex
deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build
16dc556
Raw
History Blame Contribute Delete
10.4 kB
"""Generate samples/maria_crm_export.csv — the hero demo dataset.
Persona: Maria, ops coordinator, exported the company CRM on Monday morning.
The file is engineered against the SHIPPED pipeline (scrubdata.planner.mock_plan +
reconcile.grounded_mapping + pii.detect_column_pii) so that:
- country / state / plan_tier canonicalize (auto-fix)
- dates / mrr / phone normalize (auto-fix)
- cp1252->utf8 mojibake repairs
- a dedicated credit-card column (Luhn-valid) -> flag_pii + mask_pii
- exact + near-duplicate rows -> drop_exact_duplicates / left as judgment
- 3 GENUINE 'YOUR CALL' abstentions actually fire as review flags:
Slovia -> Slovakia vs Slovenia (score 0.857, margin 0.0) ABSTAIN
Austrai -> Australia vs Austria (score 0.875, margin 0.018) ABSTAIN
Indai -> India near-miss (score 0.80 < 0.84) ABSTAIN
- PLUS the Acme Corp vs ACME Corporation entity near-tie, which the tool
correctly does NOT auto-merge (documented as the human judgment call).
Deterministic: fixed seed, fixed planted rows. Re-runnable.
"""
from __future__ import annotations
import csv
import random
random.seed(20260612)
HEADER = ["company", "contact_name", "email", "phone", "country", "state",
"signup_date", "plan_tier", "mrr", "status", "cc_on_file", "notes"]
# --- Luhn-valid fake cards (verified by pii.luhn_ok) -------------------------
CARDS = [
"4539578763621486", "4916 3385 0608 2832", "5500-0055-5555-5559",
"4111 1111 1111 1111", "6011000990139424", "3782 822463 10005",
"4485275742308327", "5105105105105100",
]
# clearly-fake SSN-shaped values for the notes field
SSNS = ["123-45-6789", "078-05-1120", "219-09-9999", "457-55-5462"]
# canonical pools (each value carries its target frequency intent via how often we draw it)
PHONE_STYLES = [
"(555) {a}-{b}", "555.{a}.{b}", "+1 555 {a} {b}", "555{a}{b}", "1-555-{a}-{b}",
]
DATE_STYLES = ["iso", "slash", "text", "dmy"]
PLAN_VARIANTS = ["Premium", "premium", "PREMIUM", "Prem", "Basic", "basic", "BASIC",
"Enterprise", "enterprise", "Ent", "Free", "free"]
STATUS_VARIANTS = ["Active", "active", "ACTIVE", "Churned", "churned", "Trial", "trial",
"Lapsed", "lapsed"]
# Large, distinctive pools so random First+Last pairs DON'T collide into spurious
# one-edit "typo" clusters (those wrongly trip the high-cardinality suspect path and
# pollute the demo with fake name merges). Distinctive surnames keep edit-distance high.
FIRST = ["Maria", "James", "Wei", "Aisha", "Carlos", "Priya", "Tom", "Sofia", "Liam",
"Yuki", "Hassan", "Elena", "Jonathan", "Fatima", "Diego", "Anneliese", "Raj",
"Chloe", "Omar", "Ingrid", "Pedro", "Gabrielle", "Ivan", "Lucia", "Samuel",
"Zoe", "Benjamin", "Mei", "Olivier", "Tatiana", "Kwame", "Beatriz", "Soren",
"Yasmin", "Dmitri", "Imani", "Lorenzo", "Saoirse", "Mateo", "Freya"]
LAST = ["Hawthorne", "Underwood", "Castellanos", "Okonkwo", "Lindqvist", "Patelkar",
"Nakamura", "Rossellini", "Fitzgerald", "Abramovich", "Whitfield", "Delacroix",
"Montgomery", "Vasquez", "Kowalski", "Brennan", "Sandoval", "Thornton",
"Eriksson", "Bukowski", "Calloway", "Mwangi", "Domingo", "Ferreira",
"Halvorsen", "Mackenzie", "Petrosyan", "Yamamoto", "Cavendish", "Olszewski"]
# Country surfaces. KEY: every non-canonical spelling must stay rare (freq<3 over the
# WHOLE column) so grounded_mapping folds it; the dominant clean surface carries the bulk.
# 'USA' is intentionally repeated a LOT so the 4-spellings demo is visible, but because
# freq>=3 surfaces are treated as data, the canonicalization is shown via the RARE variants.
CLEAN_COUNTRIES = ["United States", "United Kingdom", "Canada", "Germany", "France",
"Spain", "Italy", "Netherlands", "Brazil", "Japan", "Australia",
"India", "Mexico", "Sweden", "Poland"]
# rare misspellings that SHOULD auto-fold (each used exactly once or twice):
COUNTRY_TYPOS_AUTOFIX = {
"U.S.A": "United States", "us": "United States", "USA.": "United States",
"Nigeia": "Nigeria", "germny": "Germany", "Brasil": "Brazil",
"Polnd": "Poland", "Swedn": "Sweden", "Frnace": "France",
}
# THE YOUR-CALL country ambiguities (each used once, must abstain):
COUNTRY_ABSTAIN = ["Slovia", "Austrai", "Indai"]
CLEAN_STATES = ["CA", "NY", "TX", "WA", "FL", "IL", "MA", "CO", "GA", "OR", "NJ", "AZ"]
STATE_TYPOS_AUTOFIX = {"Calfornia": "California", "Wahsington": "Washington",
"Virgina": "Virginia", "Mississipi": "Mississippi"}
STATE_CLEAN_LONG = ["California", "Texas", "New York", "Florida"]
# Company pools. Acme is the planted entity near-tie.
COMPANIES = ["Globex", "Initech", "Hooli", "Umbrella", "Stark Industries", "Wayne Enterprises",
"Wonka", "Cyberdyne", "Soylent", "Tyrell", "Massive Dynamic", "Vandelay",
"Pied Piper", "Aperture", "Black Mesa", "Gekko & Co", "Bluth Company",
"Dunder Mifflin", "Prestige Worldwide", "Sterling Cooper"]
def _mk_phone() -> str:
a, b = random.randint(200, 998), random.randint(1000, 9998)
return random.choice(PHONE_STYLES).format(a=a, b=b)
def _mk_date() -> str:
y, m, d = 2024, random.randint(1, 12), random.randint(1, 28)
style = random.choice(DATE_STYLES)
months = ["January", "February", "March", "April", "May", "June", "July", "August",
"September", "October", "November", "December"]
if style == "iso":
return f"{y}-{m:02d}-{d:02d}"
if style == "slash":
return f"{m}/{d}/{str(y)[2:]}"
if style == "text":
return f"{months[m-1]} {d} {y}"
return f"{d:02d}-{m:02d}-{y}" # dmy: 01-03-2024
def _mk_mrr() -> str:
base = random.choice([99, 199, 299, 499, 999, 1200, 2400, 4999])
style = random.randint(0, 3)
if style == 0:
return f"${base:,}.00"
if style == 1:
return f"{base}.00"
if style == 2:
return f"{base:,} USD"
return str(base)
def _mk_name() -> str:
n = f"{random.choice(FIRST)} {random.choice(LAST)}"
r = random.random()
if r < 0.12:
return n.upper()
if r < 0.22:
return n.lower()
if r < 0.30:
return " " + n + " " # whitespace chaos
return n
def _mk_email(name: str) -> str:
base = name.strip().lower().replace(" ", " ").replace(" ", ".")
dom = random.choice(["example.com", "acme.co", "globex.io", "mail.com", "corp.net"])
e = f"{base}@{dom}"
if random.random() < 0.15:
e = " " + e.upper() + " " # case + whitespace chaos for normalize_email
return e
def base_row() -> dict:
name = _mk_name()
return {
"company": random.choice(COMPANIES),
"contact_name": name,
"email": _mk_email(name),
"phone": _mk_phone(),
"country": random.choice(CLEAN_COUNTRIES),
"state": random.choice(CLEAN_STATES),
"signup_date": _mk_date(),
"plan_tier": random.choice(PLAN_VARIANTS),
"mrr": _mk_mrr(),
"status": random.choice(STATUS_VARIANTS),
"cc_on_file": random.choice(CARDS),
"notes": "",
}
rows: list[dict] = []
# --- bulk filler rows --------------------------------------------------------
for _ in range(330):
rows.append(base_row())
# --- planted: Acme entity near-tie (genuine YOUR CALL, tool should NOT merge) -
for _ in range(7):
r = base_row(); r["company"] = "Acme Corp"; rows.append(r)
for _ in range(2):
r = base_row(); r["company"] = "acme corp "; rows.append(r) # case+ws -> folds to Acme Corp
r = base_row(); r["company"] = "ACME Corp"; rows.append(r) # case -> folds
for _ in range(4):
r = base_row(); r["company"] = "ACME Corporation"; rows.append(r) # NEAR-TIE: should NOT auto-merge
# --- planted: country canonicalization (auto-fix) + abstentions --------------
for surf in COUNTRY_TYPOS_AUTOFIX:
r = base_row(); r["country"] = surf; rows.append(r)
# extra visible 4-ways for USA so the demo "sees" the chaos (these are freq>=3 = data)
for _ in range(4):
r = base_row(); r["country"] = "USA"; rows.append(r)
for _ in range(3):
r = base_row(); r["country"] = "U.S.A."; rows.append(r)
# THE YOUR CALL country ambiguities — one row each, rare so they reach the abstain path
for surf in COUNTRY_ABSTAIN:
r = base_row(); r["country"] = surf; rows.append(r)
# --- planted: state canonicalization -----------------------------------------
for surf in STATE_TYPOS_AUTOFIX:
r = base_row(); r["state"] = surf; rows.append(r)
for surf in STATE_CLEAN_LONG:
r = base_row(); r["state"] = surf; rows.append(r)
# --- planted: mojibake (cp1252 -> utf8) --------------------------------------
MOJIBAKE = [
("Café Noir SARL", "André Mercier"), # Café Noir / André
("Naïve Ventures", "Björn Anderssøn"), # Naïve / Björn
("Müller GmbH", "Günther Müller"), # Müller / Günther
("Crème Brûlée Co", "François Dubois"), # Crème / François
]
for comp, person in MOJIBAKE:
r = base_row(); r["company"] = comp; r["contact_name"] = person; rows.append(r)
# --- planted: PII in notes (SSN-shaped) --------------------------------------
for ssn in SSNS:
r = base_row(); r["notes"] = f"follow up re: SSN {ssn} on file"; rows.append(r)
# --- planted: exact + near duplicate rows ------------------------------------
dup_seed = base_row()
dup_seed["company"] = "Initech"; dup_seed["contact_name"] = "Peter Gibbons"
dup_seed["email"] = "peter.gibbons@initech.com"; dup_seed["country"] = "United States"
dup_seed["state"] = "TX"; dup_seed["cc_on_file"] = "4111 1111 1111 1111"
rows.append(dict(dup_seed))
rows.append(dict(dup_seed)) # EXACT duplicate
near = dict(dup_seed); near["phone"] = "(555) 867-5309" # near-dup: one field differs
rows.append(near)
near2 = dict(dup_seed); near2["contact_name"] = " Peter Gibbons " # near-dup: whitespace
rows.append(near2)
# --- shuffle so planted rows aren't clustered (keep exact dups adjacent-able) -
random.shuffle(rows)
with open("samples/maria_crm_export.csv", "w", newline="", encoding="utf-8") as fh:
w = csv.DictWriter(fh, fieldnames=HEADER)
w.writeheader()
for r in rows:
w.writerow(r)
print(f"wrote samples/maria_crm_export.csv ({len(rows)} rows x {len(HEADER)} cols)")