Spaces:

build-small-hackathon
/

scrubdata

Running

OpenAI Codex OpenAI Codex commited on 18 days ago

Commit

16dc556

0 Parent(s):

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

Tags the submission for OpenAI's Best Use of Codex prize — backed by real
Codex-attributed commits (@codex in the connected GitHub repo + this Space's history).
Same human-verified Codex-hardened build (84 tests green).

Co-authored-by: OpenAI Codex <codex@openai.com>

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +47 -0
.gitignore +37 -0
.python-version +1 -0
PRODUCT.md +162 -0
README.md +231 -0
TRANSFER.md +69 -0
app.py +90 -0
design/mockups/calm/index.html +430 -0
design/mockups/cozy/index.html +526 -0
design/mockups/helper/index.html +517 -0
design/mockups/office/index.html +219 -0
docs/DATASETS.md +57 -0
docs/DEGENERATE_BASELINES.md +30 -0
docs/FIELD_NOTES.md +128 -0
docs/GITTABLES_AUDIT.md +24 -0
docs/PAIRED_BENCH.md +49 -0
docs/PAPER.md +66 -0
docs/SCALING_ARM.md +46 -0
docs/TOOL_REFERENCE.md +251 -0
docs/WILD_BENCH.md +41 -0
docs/assets/space_landing.png +3 -0
docs/assets/space_results.png +3 -0
docs/paper/fig_label_curve.pdf +3 -0
docs/paper/fig_label_curve.png +3 -0
docs/paper/fig_precision_coverage.pdf +3 -0
docs/paper/fig_precision_coverage.png +3 -0
docs/paper/fig_risk_coverage.pdf +3 -0
docs/paper/fig_risk_coverage.png +3 -0
docs/paper/main.aux +59 -0
docs/paper/main.log +269 -0
docs/paper/main.pdf +3 -0
docs/paper/main.tex +1021 -0
docs/paper/numbers.tex +146 -0
eval/README.md +136 -0
eval/__init__.py +12 -0
eval/ablations.py +64 -0
eval/baselines_learned.py +145 -0
eval/calibration.py +119 -0
eval/capture_plan_local.py +90 -0
eval/contamination_probe.py +57 -0
eval/cross_scoring.py +294 -0
eval/degenerate.py +172 -0
eval/diagnose_model.py +91 -0
eval/equivalence.py +119 -0
eval/generalization.py +180 -0
eval/gittables_audit.py +95 -0
eval/gold.jsonl +0 -0
eval/gold.py +64 -0
eval/inject.py +103 -0
eval/inject_validity.py +317 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,47 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+docs/assets/space_results.png filter=lfs diff=lfs merge=lfs -text
+docs/paper/main.pdf filter=lfs diff=lfs merge=lfs -text
+docs/paper-eab/main.pdf filter=lfs diff=lfs merge=lfs -text
+docs/paper-pvldb/main.pdf filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,37 @@

+# Python
+__pycache__/
+*.py[cod]
+.venv/
+*.egg-info/
+# uv
+.uv/
+# Gradio
+.gradio/
+flagged/
+# Models / data (keep large artifacts out of git; push to the Hub instead)
+*.gguf
+*.bin
+*.safetensors
+models/
+data/
+# Env / secrets
+.env
+.env.*
+# OS / editor
+.DS_Store
+.idea/
+.vscode/
+.gstack/
+# internal: working memory + agent/skill defs — never publish
+project-memory/
+.claude/
+_private/
+# demo video assets (local only)
+_video/

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.12

PRODUCT.md ADDED Viewed

	@@ -0,0 +1,162 @@

+# ScrubData — Product Research & Spec
+> What does an office worker actually mean by "just clean my data"? This doc
+> pins down the expectations so the cleaning-plan schema and UX aren't guesses.
+> (Living doc — refine when the deep-research workflows land.)
+## 1. The user & the moment
+**Who:** an operations / sales-ops / finance / admin person. Lives in
+spreadsheets exported from a CRM, an ERP, a Google Form, a POS, a bank portal.
+Not a pandas user. Competent with Excel but doesn't want to write `=PROPER()`
+across 40 columns or learn Power Query.
+**The moment of pain:** they exported a file to do their actual job —
+build a report, upload to another system, send a mail-merge, reconcile numbers —
+and the file is dirty enough that the next step breaks or lies. The import fails,
+the pivot double-counts, the vlookup misses, the "total revenue" is wrong because
+amounts are text.
+**What they want:** drop the file in, get a *trustworthy* clean file back, and
+a plain sentence telling them what was wrong so they can vouch for it to their
+boss. They do **not** want 30 config toggles. Hands-off is the whole pitch.
+**What they fear (must design against):** that the tool silently changed
+something it shouldn't have. Trust is the product. Every change must be
+**visible, explained, and reversible**.
+## 2. Taxonomy of "dirty" — what we must detect & fix
+Grouped by how an office worker would describe it. This list *is* the operation
+set the planner emits and the executor implements.
+### A. Structural / table-level
+- **Exact duplicate rows** — "this person is in here 3 times."
+- **Near-duplicate rows** — same entity, trivial differences (later/stretch).
+- **Empty rows & empty columns** — junk from the export.
+- **Header problems** — header not in row 1, merged cells, `Unnamed: 0`,
+  duplicated column names, units baked into headers (`Amount (USD)`).
+- **Inconsistent column naming** — `First Name` vs `first_name` (normalize to
+  snake_case as an option, off by default — it's a rename, higher-trust-risk).
+### B. Whitespace & casing (the silent killers behind failed joins)
+- Leading/trailing whitespace; doubled internal spaces; non-breaking spaces.
+- Inconsistent casing (`ACME`, `Acme`, `acme corp`).
+- Invisible characters (zero-width, BOM), smart quotes.
+### C. Missing values, disguised
+- Real blanks **plus** disguised nulls: `N/A`, `na`, `-`, `--`, `null`, `None`,
+  `#N/A`, `TBD`, `?`, `0` (context-dependent — risky, don't auto-assume).
+- Decision: normalize disguised nulls → true missing; **imputation is opt-in**,
+  never silent (filling values is a claim about reality).
+### D. Type & format inconsistency (where the model earns its keep)
+- **Numbers stored as text:** `"$1,200.50"`, `"1.200,50"` (EU), `"(500)"`
+  (accounting negative), `"12%"`, `"1,2k"`.
+- **Dates in mixed formats:** `2023-01-05`, `01/05/2023`, `5 Jan 2023`,
+  `Jan-23`, Excel serial `44931`. Ambiguous DMY vs MDY must be detected, not
+  guessed blindly — infer from the column's evidence, flag if undecidable.
+- **Booleans:** `Yes/No`, `Y/N`, `TRUE/FALSE`, `1/0`, `T/F`, `✓`.
+- **Phone numbers:** wildly inconsistent; standardize to E.164-ish where region
+  is inferable, else just strip to digits + canonical format.
+- **Emails:** casing, whitespace, obvious typos (`@gmial.com`), trailing junk.
+### E. Categorical canonicalization (the headline AI feature)
+- Inconsistent labels for the same thing: `USA / U.S.A. / United States / us`,
+  `M/F vs Male/Female`, `NY / New York / new york`, status fields, product
+  names. Rules can't enumerate these — **the small model proposes the mapping**,
+  the executor applies it, the report shows the mapping for approval.
+### F. Validity / anomaly flags (flag, don't auto-delete)
+- Out-of-range numbers (age 999, negative price), impossible dates (1899-12-31
+  Excel epoch), malformed emails/phones, values that don't match the column's
+  inferred type. Default action = **flag in the report**, not silent edit.
+## 3. The trust contract (design principles)
+1. **Visible** — every operation appears in a before/after diff and the report.
+2. **Explained** — plain-English rationale per operation ("standardized 4 date
+   formats into ISO `YYYY-MM-DD`").
+3. **Conservative by default** — destructive/assumptive ops (imputation, row
+   deletion beyond exact dups, renames) are surfaced as suggestions, applied
+   only if the user keeps them on. Safe ops (trim whitespace, normalize disguised
+   nulls, parse types) are on by default.
+4. **Reversible** — original file untouched; output is a new file + a machine-
+   readable plan the user could replay or undo.
+5. **No config to start** — sensible defaults run immediately on upload; the
+   plan is editable *after* the user sees it, not a wall of options before.
+## 4. Competitive landscape (what to learn / what to beat)
+| Tool | What it does well | Why an office worker bounces |
+|------|-------------------|------------------------------|
+| **Excel / Power Query** | Ubiquitous, trusted | Manual; canonicalization is hand-built; steep |
+| **OpenRefine** | Powerful clustering/canonicalization (key-collision, kNN) | Intimidating UI, GREL expressions, local Java app |
+| **ydata-profiling / pandas-profiling** | Great *profiling* report | Diagnoses, doesn't *fix* |
+| **Trifacta / Tableau Prep / Alteryx** | Visual prep pipelines | Enterprise, paid, config-heavy |
+| **OpenRefine reconciliation** | Entity canonicalization | Manual, needs setup |
+**Our wedge:** OpenRefine's clustering *automated and explained by a small
+model*, with zero config and a one-screen trust-preserving UX. We borrow
+OpenRefine's clustering idea but the model proposes the clusters/mappings and
+narrates them, so the user never learns a tool — they just approve sentences.
+## 5. Cleaning-plan schema (v0 — drives the mock & later the model)
+The model outputs this JSON; the executor consumes it. Designed so the model
+only does *semantic/fuzzy* judgment, and all execution is deterministic.
+```json
+{
+  "dataset_summary": "Contacts export, 38 rows × 9 cols; sales-lead data.",
+  "table_operations": [
+    {"op": "drop_exact_duplicates", "rationale": "5 identical rows."},
+    {"op": "drop_empty_rows"},
+    {"op": "drop_empty_columns", "columns": ["notes2"]}
+  ],
+  "columns": [
+    {
+      "name": "country",
+      "detected_semantic_type": "country",
+      "issues": ["inconsistent_categories", "whitespace", "casing"],
+      "operations": [
+        {"op": "strip_whitespace"},
+        {"op": "canonicalize_categories",
+         "mapping": {"usa": "United States", "u.s.a.": "United States",
+                     "us": "United States", "uk": "United Kingdom"},
+         "rationale": "Unified 4 spellings into 2 canonical country names."}
+      ],
+      "confidence": 0.93
+    },
+    {
+      "name": "amount",
+      "detected_semantic_type": "currency",
+      "issues": ["numeric_stored_as_text", "currency_symbols"],
+      "operations": [
+        {"op": "parse_currency", "rationale": "Stripped $ and thousands separators; → float."}
+      ],
+      "confidence": 0.97
+    }
+  ],
+  "flags": [
+    {"column": "age", "row_hint": "value 999", "issue": "out_of_range",
+     "action": "flag_only", "rationale": "Likely placeholder; left for human review."}
+  ]
+}
+```
+### Operation vocabulary (executor must implement)
+Safe-by-default: `strip_whitespace`, `collapse_internal_whitespace`,
+`normalize_disguised_nulls`, `standardize_case`, `parse_currency`,
+`parse_number`, `parse_percent`, `parse_date`, `standardize_boolean`,
+`standardize_phone`, `normalize_email`, `drop_exact_duplicates`,
+`drop_empty_rows`, `drop_empty_columns`, `canonicalize_categories`.
+Opt-in (assumptive): `impute_missing`, `drop_near_duplicates`,
+`rename_columns_snake_case`, `coerce_outliers`.
+Flag-only: `flag_out_of_range`, `flag_invalid_format`, `flag_type_mismatch`.
+## 6. Success metric for the demo (Backyard AI judging)
+A real office person uploads a real ugly export, clicks one button, and says
+"oh thank god" — then trusts the result enough to use it, because the report
+told them exactly what changed. That sentence is the bar.

README.md ADDED Viewed

	@@ -0,0 +1,231 @@

+---
+title: ScrubData
+emoji: 🏔️
+colorFrom: green
+colorTo: indigo
+sdk: gradio
+sdk_version: 6.16.0
+app_file: server.py
+pinned: true
+license: mit
+tags:
+  - track:backyard
+  - sponsor:openai
+  - sponsor:modal
+  - achievement:offgrid
+  - achievement:welltuned
+  - achievement:offbrand
+  - achievement:llama
+  - achievement:sharing
+  - achievement:fieldnotes
+---
+# ScrubData — hands-off data cleaning, with the receipts
+Entry for the **Build Small Hackathon** (Gradio · Hugging Face), 🏡 Backyard AI track.
+Runs a ≤4B model — a local-runnable GGUF, no third-party AI APIs → also in the running for
+**Tiny Titan**, **Off-Brand**, **Best Demo**, **Best Agent**, and **Bonus Quest Champion**
+(all six quests claimed above).
+<!-- SUBMISSION LINKS (all set for June 15):
+  Demo video: https://www.loom.com/share/2fa868147527496e8097d82dd546d663  [DONE]
+  Social post: https://x.com/ric_alanis/status/2066598533738692983  [DONE]
+  These links + this write-up are required by the build-small-hackathon /submit tool. -->
+> **Hosted demo vs. local — read this.** This Space is a **no-install demo** that cleans with
+> the real **Qwen3-4B fine-tune** by default (served on an A100 GPU, ~1 min/clean warm; first
+> run after idle ~2 min on cold start) — the whole point
+> is the small model doing the work. Your file is processed on Hugging Face / the GPU endpoint
+> (sent to no third-party API, not stored); untick the box for an instant deterministic pass.
+> The **privacy story is a property of running it yourself**: `SCRUBDATA_MODEL=scrubdata-ft uv
+> run server.py` reads and cleans your file on-device with the same fine-tune — nothing leaves
+> your machine. The app labels its own mode honestly (the ribbon says which one you're using).
+> Same auditable plan→verify→execute pipeline either way.
+> **Modal** (`sponsor:modal`): the hosted Space cleans with the Qwen3-4B fine-tune served from a
+> **scale-to-zero Modal GPU endpoint** (`scripts/modal_serve.py`, Ollama on an A100; $0 when idle,
+> pre-warmed on page load to hide the cold start). Modal also drove the headless training +
+> evaluation loop behind the published model. The deterministic planner is the silent fallback
+> if the GPU is cold or down, so the demo never hard-fails.
+> **Drop a messy export. Get clean data back — every change named, reversible, and
+> explained. Anything sensitive is protected locally. The judgment calls stay yours.**
+>
+> For the office/ops person trying to do their job while their data is a mess.
+**Built by:** [@ricalanis](https://huggingface.co/ricalanis) (solo) · 🤗 Hugging Face: `ricalanis`
+**Live Space:** https://huggingface.co/spaces/build-small-hackathon/scrubdata
+**Code (open source):** https://github.com/ricalanis/scrubdata-hackathon
+**Demo video:** https://www.loom.com/share/2fa868147527496e8097d82dd546d663
+**Write-up / post:** https://x.com/ric_alanis/status/2066598533738692983
+## How it works
+A small local model is the **planner**, never a row-by-row editor:
+1. **Profile** — pandas aggregates each column into a value–frequency distribution
+   (scale-invariant: a million rows profile like a hundred).
+2. **Plan** — the model reads the profile and emits a structured JSON cleaning plan:
+   canonicalization mappings, format fixes, dedup, anomaly flags.
+3. **Ground** — canonical forms are never invented: values reconcile against reference
+   taxonomies (GeoNames 196k cities, ISO countries/states, and a pluggable **entity
+   reference** built from harvested vocabularies — ToughTables/MusicBrainz/Wikidata/ROR,
+   ~100k entities) with fuzzy retrieval; ambiguous matches **abstain** and surface for
+   human review (calibrated: 90% precision at the default threshold, ≥95% at 0.91).
+   Profiles carry **suspect_values** — rare anomalous surfaces with evidence-backed
+   candidates — so high-cardinality columns are no longer invisible to the planner
+   (measured: five all-unique-surface benchmark tables went 0.0 → 0.96 F1 at zero damage).
+4. **Verify** — every model-proposed mapping is scored by deterministic evidence
+   (errors-are-rare frequency gates, variant similarity, reference agreement); entries
+   below the confidence threshold (`SCRUBDATA_TAU`, default 0.5) become review flags
+   instead of edits. The shipped **verified union planner** (gated model plan ∪ grounded
+   heuristic) measures **0.905 precision @ 0.413 coverage** on hospital's 509 real errors
+   — the gated model plan alone is 0.993 @ 0.287.
+5. **Protect** — PII is detected locally (Luhn/IBAN checksums + a 44M OpenMed-PII
+   classifier): cards/SSNs masked format-preservingly, contacts flagged, **0/360 residual
+   PII** after masking in our leak test.
+6. **Execute** — deterministic pandas applies the plan. No silent edits, by construction;
+   every run exports an audit trail (OpenTelemetry-GenAI spans + open traces).
+**Model:** `Qwen3-4B-Instruct-2507` (Tiny Titan), QLoRA fine-tuned on **execution-verified**
+synthetic + real-derived data (every training plan provably recovers the clean table),
+runnable via llama.cpp GGUF.
+## The app (what judges see)
+A custom `gr.Server` frontend (no default Gradio chrome — the **Off-Brand** quest), built
+around the trust story:
+- **YOUR CALL cards** — when the model is genuinely torn (e.g. *Slovia → Slovakia 86% vs
+  Slovenia 86%*) it abstains and hands you the tie with both candidates; pick the right one
+  and **stage several decisions**, then "✓ Clean now" replays them as one plan.
+- **Named, reversible receipts** — every edit shows as a row in the audit grid with its op +
+  rationale and a before/after diff; nothing is silent.
+- **PII review cards** — embedded cards/SSNs (Luhn/strict-regex) flagged and masked
+  format-preservingly, on-device.
+- **Save / replay recipe** — export the cleaning plan as JSON and re-apply it to next week's
+  export in one click (the "Monday ritual").
+- **Honest, self-aware copy** — the app injects its own runtime state and the ribbon says
+  exactly which planner ran and where your data was processed.
+- **A fun, size-aware ETA timer** + cold-start readiness gate + page-load GPU pre-warm, so
+  the model path feels responsive and never lies about progress.
+- Drag-and-drop, two bundled sample exports, mobile-responsive layout.
+## What real users told us (and what we changed)
+Before submission we put the live Space in front of people who **aren't** data folks — the
+exact audience the tool is for — and sent the link with one line: *"if you have a messy
+spreadsheet, try it."* The most useful finding wasn't a bug. It was that the word
+**"cleaning" didn't land**:
+- One tester read "clean my Excel" as *deleting* data:
+  *"¿Te refieres a que elimine algo de algún archivo?"* — "You mean it removes something
+  from the file?"
+- Another didn't know where to begin:
+  *"¿eso del Excel te lo subimos ahí o cómo?"* — "the Excel thing, do we upload it there,
+  or how?"
+- The clearest explanation in the whole thread was one we had to type by hand in chat:
+  *"it fixes text errors — names, phones, emails, cities."* That sentence wasn't anywhere
+  in the product.
+So we changed the product to **show** what cleaning means instead of naming it:
+- the hero now leads with a literal before→after strip
+  (`nigeia → Nigeria`, `Calfornia → California`, `Ana@GMAIL.com → ana@gmail.com`,
+  `415.555.0192 → (415) 555-0192`) so the value is obvious *before* any upload;
+- the headline is the sentence that worked in chat — **"Fix the messy text in your
+  spreadsheet"** — and the copy says plainly **"I never delete your data"** (killing the
+  "does it erase things?" misread);
+- a one-click **"watch it run on a sample file"** path removes the "where do I start?" wall;
+- jargon labels are gone ("HR payroll (with PII)" → "an HR file with sensitive data").
+n is small and informal (friends-and-network, ~3 people), so this isn't a usability *study* —
+but the feedback was real, it pointed at a failure of the *framing* rather than the engine,
+and it changed the build. The persona "Maria" below is the controlled walk-through; the
+quotes above are verbatim from people we know.
+## Measured (not vibes)
+- **Canonicalization micro-F1 0.90 (best single run; 0.80 ± 0.01 over 3 training seeds)** for the 4B
+  fine-tune vs **0.45** for a much larger generic model vs **0.15** for rules.
+- Real errors (5-benchmark macro): grounded cleaning reaches REAL-F1 **0.225**, 3.9×
+  OpenRefine kNN (0.058) and 5.7× fingerprint (0.039); the verified-union gate repairs
+  41% of hospital's 509 real errors at **0.905 precision**, every declined merge
+  surfaced for review.
+- Evaluated on a **65-dataset suite** (Raha benchmarks + seeded error injection over 15
+  open-data domains) with a churn-neutral metric that can't be gamed by mass rewriting.
+- Full write-up: `docs/paper/` (preprint draft) · details in `eval/README.md`.
+## Run it
+```bash
+uv sync
+uv run server.py                                   # gr.Server + custom UI (grounded heuristic)
+# fine-tuned model as planner (needs Ollama + the GGUF, see notebooks/Modelfile):
+ollama pull hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8:Q8_0
+ollama create scrubdata-ft -f notebooks/Modelfile
+SCRUBDATA_MODEL=scrubdata-ft uv run server.py      # model planner, heuristic fallback (on-device)
+SCRUBDATA_PII_NER=1 uv run server.py               # +44M NER for name/address columns
+uv run python -m scrubdata.cli messy.csv -o clean.csv --plan plan.json
+uv run pytest tests/                               # engine + scorer tests (69)
+```
+The hosted Space serves the same fine-tune from a scale-to-zero **Modal A100**
+(`scripts/modal_serve.py`) and the planner adds `format=json` on that path
+(`SCRUBDATA_OLLAMA_FORMAT_JSON=1`) to grammar-constrain the GGUF on the A100's kernels.
+`scripts/modal_warm.py on|off` pins/un-pins a warm container (no cold start) without a
+redeploy — leave it `off` (scale-to-zero, $0 idle), flip `on` for a live judging window.
+## Repo map
+- `scrubdata/` — `profiler` · `planner` · `reconcile` (reference grounding + abstain) ·
+  `grounded` (RACOON wrapper) · `verifier` (selective prediction + union planner) ·
+  `pair_profile` (candidate-constrained canonicalization, opt-in) · `pii` (checksum +
+  NER tiers, mask/hash/pseudonymize) · `executor` · `observability` · `trace` ·
+  `baselines` (OpenRefine) · `cli`.
+- `training/` — execution-verified synthetic generator + real-data derivation
+  (`real_data.py`: paired benchmarks + frequency-derived unpaired open data).
+- `eval/` — frozen gold · wide suite + double-macro north-star (`run_real_multi.py`) ·
+  ablations · calibration (risk–coverage) · PII leak test.
+- `docs/paper/` — preprint: *Verified Cleaning Plans: Plan-Level Selective Prediction
+  Turns Local LLM Planners into Trustworthy Table Cleaners*.
+- `scripts/` — Modal train/eval (headless GPU loop), trace publishing.
+## Research & resources
+Everything behind the demo is public:
+- 🚀 **Live Space** — https://huggingface.co/spaces/build-small-hackathon/scrubdata
+- 💻 **Code (open source)** — https://github.com/ricalanis/scrubdata-hackathon
+- 🧠 **Fine-tuned model** — https://huggingface.co/ricalanis/scrubdata-qwen3-4b
+  (Q8_0 GGUF: https://huggingface.co/ricalanis/scrubdata-qwen3-4b-v6-q8)
+- 📊 **WildClean dataset** (real-world dirty tables + injected-error benches) —
+  https://huggingface.co/datasets/ricalanis/wildclean
+- 🔍 **Agent traces** (OpenTelemetry-GenAI spans from real runs) —
+  https://huggingface.co/datasets/build-small-hackathon/scrubdata-traces
+- 📄 **Preprint** — *Verified Cleaning Plans: Plan-Level Selective Prediction Turns Local
+  LLM Planners into Trustworthy Table Cleaners* (`docs/paper/main.pdf`)
+- 📓 **Field notes** (the build story, failures included) — `docs/FIELD_NOTES.md`
+- 🛠️ **Tool reference** (the whole system, end to end) — `docs/TOOL_REFERENCE.md`
+## Built with Codex
+The final review-and-refine pass used **OpenAI Codex** (gpt-5.5) as a reviewer / last
+refiner — not to write the product, but to harden it. It added the executor's
+never-corrupt-clean-data regression tests, made column sanitization collision-proof,
+did the accessibility pass (ARIA + keyboard + reduced-motion + focus-visible), and wrote
+characterization tests for the reference matcher. Every change was human-reviewed and
+verified green (84 tests, golden behavior unchanged) before commit; the commits are
+attributed to `@codex` in the git history.
+## Submission checklist (verified against the build-small-hackathon `/submit` tool)
+- [x] Public Gradio Space in the `build-small-hackathon` org
+- [x] Every model ≤ 32B (here ≤ 4B → **Tiny Titan**-eligible): `Qwen3-4B-Instruct-2507`
+- [x] README `tags:` set — `track:backyard` + all six `achievement:*` quests (above)
+- [x] **Off the Grid** (`offgrid`) — no third-party AI APIs; the planner is a local-runnable GGUF (Qwen3-4B). Self-hosted = fully on-device (zero external egress); the hosted demo serves the *same* model from a self-managed Modal GPU, not a SaaS API
+- [x] **Well-Tuned** (`welltuned`) — fine-tune published: `ricalanis/scrubdata-qwen3-4b` (+ `-v6-q8` GGUF)
+- [x] **Off-Brand** (`offbrand`) — custom `gr.Server` HTML/CSS frontend, not default Gradio
+- [x] **Llama Champion** (`llama`) — runs through llama.cpp (Q8_0 GGUF)
+- [x] **Sharing is Caring** (`sharing`) — agent traces on the Hub: `build-small-hackathon/scrubdata-traces`
+- [x] **Field Notes** (`fieldnotes`) — build report: `docs/FIELD_NOTES.md`
+- [x] Write-up in this README (idea + tech)
+- [x] **Demo video** link in README: https://www.loom.com/share/2fa868147527496e8097d82dd546d663
+- [x] **Social post** link in README: https://x.com/ric_alanis/status/2066598533738692983
+- [x] Confirm deadline time/timezone — **June 15 2026, 23:59 UTC** (confirmed on the hackathon page)
+Judged (no tag needed, just qualify): Tiny Titan · Off-Brand prize · Best Demo · Best Agent · Bonus Quest Champion.

TRANSFER.md ADDED Viewed

	@@ -0,0 +1,69 @@

+# Machine transfer guide
+Everything needed to continue this project on a new machine.
+## 1. Clone + deps
+```bash
+git clone https://github.com/ricalanis/scrubdata-hackathon.git ~/Dev/hackaton-small
+cd ~/Dev/hackaton-small && uv sync
+uv run pytest tests/   # 25 tests should pass
+```
+## 2. Restore Claude Code memory (IMPORTANT)
+The agent's persistent memory is bundled in `project-memory/`. On the new machine, after
+opening the project in Claude Code once (so the project dir exists):
+```bash
+cp project-memory/*.md ~/.claude/projects/-Users-<USER>-Dev-hackaton-small/memory/
+```
+(Adjust the path-keyed directory name to the new machine's project path. `MEMORY.md` is the
+index; the rest are the knowledge base — data-loop-playbook.md and arxiv-paper.md are the
+operational core.)
+## 3. Cloud auth (state lives in the cloud, just re-authenticate)
+```bash
+uv run modal token new        # Modal: adapters in volume scrubdata-v5-adapter
+                              #   (/v5 = v5, /v5_seed21 = v6/mixA winner, seeds 1-3,25,26)
+                              # results Dicts: scrubdata-train-results (seedN keys),
+                              #   scrubdata-eval-v5-results, scrubdata-suite-results
+hf auth login                 # HF: Space build-small-hackathon/scrubdata, model repos
+                              #   ricalanis/scrubdata-qwen3-4b{,-v6-q8}, traces dataset
+gh auth login                 # GitHub
+```
+## 4. Local model (optional, 4.3GB)
+```bash
+ollama pull hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8:Q8_0
+ollama create scrubdata-ft-v6 -f notebooks/Modelfile
+SCRUBDATA_MODEL=scrubdata-ft-v6 uv run server.py
+```
+## 5. Regenerable data (data/ is gitignored)
+Harvested alias vocabularies + paired examples are PRESERVED in `training/harvests/` —
+copy them back so the generator finds them:
+```bash
+mkdir -p data && cp training/harvests/*.jsonl data/
+```
+Big training mixes are regenerable:
+```bash
+uv run python -m training.build_dataset --n 1600 --out data/v5_synth.jsonl --seed 5
+uv run python -m training.real_data --datasets hospital beers movies_1 --per-dataset 80 --out data/v6_paired_big.jsonl
+# mix recipe (mixA = winner): synth + paired*4, shuffled -> data/v5_train.jsonl
+```
+The eval suite re-fetches Raha benchmarks automatically; harvested gov/GitHub CSVs
+(data/real/cache) re-download via training/unpaired_sources.json.
+## 6. In-flight at transfer time
+- mixH (additive-composition test, seed 30): Modal call `fc-01KTRXTHJKW3G81BT4Q0FZET8G`,
+  result lands in Dict `scrubdata-train-results` key `seed30`. Retrieve from any machine:
+  ```bash
+  uv run python -c "import modal; print(modal.Dict.from_name('scrubdata-train-results').get('seed30'))"
+  ```
+- Open question it answers: whether the vocab-mix regressions (mixE/F/G ~0.57-0.59 vs mixA
+  0.748) were eval-coverage shift. See project-memory/data-loop-playbook.md.
+## 7. Where everything lives
+- Paper: `docs/paper/main.tex` (+ numbers.tex, fig) — compiles with pdflatex; COMPLETE.
+- Submission kit: `docs/SUBMISSION.md` (demo script + social post), `docs/FIELD_NOTES.md`.
+- Live Space: https://huggingface.co/spaces/build-small-hackathon/scrubdata
+- arXiv next steps: cs.DB endorser etc. — project-memory/arxiv-paper.md.
+- Hackathon deadline: 2026-06-15 (demo video + social post remain).

app.py ADDED Viewed

	@@ -0,0 +1,90 @@

+"""ScrubData — hands-off data cleaning (Gradio app).
+Runnable MOCK demo on gr.Blocks: upload → profile → plan → clean → diff +
+report → download. The planner is a heuristic stand-in for the fine-tuned ≤4B
+model; the rest of the pipeline is real. Final version will port this flow to
+gr.Server + a custom HTML frontend for the Off-Brand bonus quest.
+"""
+from __future__ import annotations
+import tempfile
+from pathlib import Path
+import gradio as gr
+import pandas as pd
+from scrubdata import apply_plan, mock_plan, profile_dataframe, render_report
+from scrubdata.active import get_planner
+from scrubdata.trace import log_run
+PLANNER = get_planner()   # fine-tuned model if SCRUBDATA_MODEL is set, else heuristic
+SAMPLE = Path(__file__).parent / "samples" / "dirty_contacts.csv"
+def _read_any(path: str) -> pd.DataFrame:
+    """Read CSV or Excel as raw strings (cleaning decides the real types)."""
+    p = Path(path)
+    if p.suffix.lower() in {".xlsx", ".xls"}:
+        return pd.read_excel(p, dtype=str)
+    return pd.read_csv(p, dtype=str, keep_default_na=False)
+def clean(file_path: str):
+    if not file_path:
+        return (gr.update(), gr.update(), "Upload a CSV or Excel file to begin.", None)
+    raw = _read_any(file_path)
+    before = profile_dataframe(raw)
+    plan = PLANNER(raw)
+    cleaned, log = apply_plan(raw, plan)
+    after = profile_dataframe(cleaned)
+    report = render_report(plan, log, before, after)
+    out = Path(tempfile.gettempdir()) / "scrubbed.csv"
+    cleaned.to_csv(out, index=False)
+    try:  # best-effort agent-trace capture (Open trace bonus quest)
+        log_run(before, raw, plan, log, model=plan.get("_generated_by", "mock_planner"))
+    except Exception:
+        pass
+    return raw, cleaned, report, str(out)
+def load_sample():
+    return str(SAMPLE)
+with gr.Blocks(title="ScrubData") as demo:
+    gr.Markdown(
+        "# 🧽 ScrubData\n"
+        "**Upload your dirty spreadsheet. Get clean data back. No config.**\n\n"
+        "_Mock demo — heuristic planner standing in for the fine-tuned model._"
+    )
+    with gr.Row():
+        file_in = gr.File(label="Upload CSV / Excel", file_types=[".csv", ".xlsx", ".xls"],
+                          type="filepath")
+        with gr.Column():
+            run_btn = gr.Button("🧽 Clean it", variant="primary")
+            sample_btn = gr.Button("Use the messy sample")
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### Before")
+            before_df = gr.Dataframe(label="Original", interactive=False, wrap=True)
+        with gr.Column():
+            gr.Markdown("### After")
+            after_df = gr.Dataframe(label="Cleaned", interactive=False, wrap=True)
+    report_md = gr.Markdown()
+    download = gr.File(label="Download cleaned file")
+    run_btn.click(clean, inputs=file_in, outputs=[before_df, after_df, report_md, download])
+    sample_btn.click(load_sample, outputs=file_in)
+if __name__ == "__main__":
+    demo.launch(theme=gr.themes.Soft())

design/mockups/calm/index.html ADDED Viewed

	@@ -0,0 +1,430 @@

+<!DOCTYPE html>
+<html lang="es">
+<head>
+<meta charset="UTF-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1.0" />
+<title>ScrubData — Tu lista, ordenada con calma</title>
+<style>
+  :root{
+    --paper:#fbf7f0;
+    --paper-2:#fffdf9;
+    --ink:#3a3530;
+    --ink-soft:#6f675d;
+    --line:#ece4d6;
+    --accent:#7ba087;      /* single calm sage accent */
+    --accent-soft:#e8f0ea;
+    --accent-deep:#5e8470;
+    --warm:#d8a25e;        /* gentle merit-badge gold, used sparingly */
+    --shadow:0 14px 40px -22px rgba(80,70,55,.45);
+    --radius:26px;
+  }
+  *{box-sizing:border-box;}
+  html,body{margin:0;padding:0;}
+  body{
+    font-family:"Iowan Old Style","Palatino Linotype",Palatino,Georgia,"Times New Roman",serif;
+    background:
+      radial-gradient(120% 80% at 50% -10%, #fffdf9 0%, var(--paper) 55%, #f5efe4 100%);
+    color:var(--ink);
+    line-height:1.6;
+    -webkit-font-smoothing:antialiased;
+    min-height:100vh;
+    display:flex;
+    flex-direction:column;
+    align-items:center;
+    padding:34px 20px 70px;
+  }
+  ::selection{background:var(--accent-soft);}
+  /* ---------- top bar ---------- */
+  .topbar{
+    width:100%;
+    max-width:760px;
+    display:flex;
+    align-items:center;
+    justify-content:space-between;
+    margin-bottom:30px;
+  }
+  .brand{display:flex;align-items:center;gap:11px;}
+  .leaf{width:34px;height:34px;flex:none;}
+  .brand-name{font-size:1.18rem;font-weight:600;letter-spacing:.2px;}
+  .brand-name small{display:block;font-size:.72rem;color:var(--ink-soft);letter-spacing:.4px;font-weight:400;}
+  .lang{
+    display:flex;background:var(--paper-2);border:1px solid var(--line);
+    border-radius:999px;padding:3px;font-family:system-ui,sans-serif;font-size:.8rem;
+  }
+  .lang button{
+    border:none;background:transparent;color:var(--ink-soft);
+    padding:6px 14px;border-radius:999px;cursor:pointer;font-weight:600;letter-spacing:.3px;
+  }
+  .lang button.on{background:var(--accent);color:#fff;}
+  /* ---------- shared card ---------- */
+  .stage{width:100%;max-width:760px;}
+  .card{
+    background:var(--paper-2);
+    border:1px solid var(--line);
+    border-radius:var(--radius);
+    box-shadow:var(--shadow);
+    padding:46px 44px;
+  }
+  .screen{display:none;}
+  .screen.active{display:block;animation:rise .6s ease both;}
+  @keyframes rise{from{opacity:0;transform:translateY(14px);}to{opacity:1;transform:none;}}
+  h1{font-size:2.05rem;line-height:1.25;margin:0 0 12px;font-weight:600;letter-spacing:.2px;}
+  .lede{font-size:1.18rem;color:var(--ink-soft);margin:0 0 30px;max-width:46ch;}
+  /* persistent safety strip */
+  .safety{
+    display:flex;align-items:center;gap:12px;
+    background:var(--accent-soft);
+    border-radius:18px;
+    padding:14px 18px;
+    margin-top:26px;
+    font-family:system-ui,sans-serif;
+    font-size:.95rem;
+    color:var(--accent-deep);
+  }
+  .safety svg{flex:none;}
+  .safety b{font-weight:600;}
+  /* ---------- screen 1: drop ---------- */
+  .drop{
+    border:2px dashed #cdbfa6;
+    background:linear-gradient(180deg,#fffefb,#fbf6ec);
+    border-radius:24px;
+    padding:54px 30px;
+    text-align:center;
+    cursor:pointer;
+    transition:border-color .25s, background .25s, transform .25s;
+  }
+  .drop:hover{border-color:var(--accent);background:#fbfaf4;transform:translateY(-2px);}
+  .drop .basket{font-size:2.6rem;display:block;margin-bottom:10px;}
+  .drop .big{font-size:1.32rem;font-weight:600;margin-bottom:4px;}
+  .drop .sub{color:var(--ink-soft);font-family:system-ui,sans-serif;font-size:.95rem;}
+  .filechip{
+    display:inline-flex;align-items:center;gap:9px;margin-top:22px;
+    background:#fff;border:1px solid var(--line);border-radius:14px;
+    padding:9px 15px;font-family:system-ui,sans-serif;font-size:.9rem;color:var(--ink);
+  }
+  .filechip .dot{width:9px;height:9px;border-radius:50%;background:var(--accent);}
+  .btn{
+    font-family:system-ui,sans-serif;font-size:1.06rem;font-weight:600;
+    border:none;border-radius:16px;cursor:pointer;padding:16px 30px;
+    transition:transform .15s, box-shadow .25s, background .2s;
+  }
+  .btn-primary{
+    background:var(--accent);color:#fff;
+    box-shadow:0 10px 24px -12px rgba(94,132,112,.9);
+    width:100%;margin-top:26px;
+  }
+  .btn-primary:hover{background:var(--accent-deep);transform:translateY(-2px);}
+  .btn-ghost{
+    background:transparent;color:var(--accent-deep);border:1px solid #cfe0d4;
+  }
+  .btn-ghost:hover{background:var(--accent-soft);}
+  /* ---------- screen 2: working ---------- */
+  .working{text-align:center;padding:30px 10px 14px;}
+  .breath{
+    width:120px;height:120px;margin:6px auto 26px;border-radius:50%;
+    background:radial-gradient(circle at 50% 50%, var(--accent-soft), #fff);
+    border:1px solid var(--line);
+    display:flex;align-items:center;justify-content:center;
+    animation:breathe 3.4s ease-in-out infinite;
+  }
+  .breath span{font-size:2.4rem;}
+  @keyframes breathe{0%,100%{transform:scale(1);box-shadow:0 0 0 0 rgba(123,160,135,.25);}50%{transform:scale(1.07);box-shadow:0 0 0 18px rgba(123,160,135,0);}}
+  .working h1{font-size:1.7rem;}
+  .steps{list-style:none;padding:0;margin:24px auto 0;max-width:380px;text-align:left;font-family:system-ui,sans-serif;}
+  .steps li{
+    display:flex;align-items:center;gap:12px;padding:9px 0;color:var(--ink-soft);font-size:1rem;
+    opacity:.35;transition:opacity .4s;
+  }
+  .steps li.done{opacity:1;color:var(--ink);}
+  .steps li .tick{
+    width:22px;height:22px;border-radius:50%;border:2px solid #d8cdb8;flex:none;
+    display:flex;align-items:center;justify-content:center;font-size:.8rem;color:#fff;background:transparent;
+  }
+  .steps li.done .tick{background:var(--accent);border-color:var(--accent);}
+  /* ---------- screen 3: result ---------- */
+  .result-head{display:flex;align-items:flex-start;gap:16px;margin-bottom:8px;}
+  .badge{
+    width:62px;height:62px;flex:none;
+  }
+  .h-eyebrow{font-family:system-ui,sans-serif;font-size:.82rem;letter-spacing:1.4px;text-transform:uppercase;color:var(--accent-deep);font-weight:700;}
+  .summary{
+    background:var(--paper);
+    border:1px solid var(--line);
+    border-radius:20px;
+    padding:24px 26px;
+    margin:22px 0 8px;
+    font-size:1.12rem;
+  }
+  .summary p{margin:0 0 14px;}
+  .summary p:last-child{margin-bottom:0;}
+  .summary .num{color:var(--accent-deep);font-weight:600;}
+  .section-title{
+    font-family:system-ui,sans-serif;font-size:.95rem;font-weight:700;
+    color:var(--ink-soft);letter-spacing:.4px;margin:34px 0 14px;
+    display:flex;align-items:center;gap:9px;
+  }
+  .section-title .pill{font-size:.7rem;background:var(--accent-soft);color:var(--accent-deep);padding:3px 9px;border-radius:999px;font-weight:700;}
+  /* change cards */
+  .change{
+    border:1px solid var(--line);border-radius:18px;background:#fff;
+    padding:18px 20px;margin-bottom:14px;
+  }
+  .change .lead{font-size:1.08rem;margin:0 0 12px;}
+  .change .lead b{color:var(--ink);}
+  .ba{display:flex;gap:10px;flex-wrap:wrap;font-family:system-ui,sans-serif;font-size:.9rem;}
+  .chip{
+    padding:7px 13px;border-radius:12px;border:1px solid var(--line);
+    background:var(--paper);color:var(--ink-soft);
+  }
+  .chip.after{background:var(--accent-soft);border-color:#cfe0d4;color:var(--accent-deep);font-weight:600;}
+  .arrow{align-self:center;color:#bdb3a1;font-family:system-ui,sans-serif;}
+  /* gentle question card */
+  .ask{
+    border:1px solid #e7dcc4;background:linear-gradient(180deg,#fffdf6,#fbf3e3);
+    border-radius:18px;padding:20px 22px;margin-bottom:14px;
+  }
+  .ask .q{font-size:1.1rem;margin:0 0 6px;}
+  .ask .why{font-family:system-ui,sans-serif;font-size:.92rem;color:var(--ink-soft);margin:0 0 16px;}
+  .ask .row{display:flex;gap:10px;}
+  .ask .btn{padding:11px 20px;font-size:.95rem;}
+  /* honest flags */
+  .flag{
+    display:flex;gap:12px;align-items:flex-start;
+    background:#fff;border:1px dashed #d8cdb8;border-radius:16px;padding:16px 18px;margin-bottom:12px;
+  }
+  .flag .mark{font-size:1.2rem;flex:none;}
+  .flag p{margin:0;font-size:1rem;}
+  .flag .small{font-family:system-ui,sans-serif;font-size:.88rem;color:var(--ink-soft);}
+  /* bonus card */
+  .bonus{
+    background:linear-gradient(180deg,#f4faf6,#eaf3ed);
+    border:1px solid #d3e6da;border-radius:20px;padding:22px 24px;margin-top:8px;
+    display:flex;gap:16px;align-items:center;
+  }
+  .bonus .ic{font-size:2rem;flex:none;}
+  .bonus h3{margin:0 0 4px;font-size:1.15rem;}
+  .bonus p{margin:0;color:var(--ink-soft);font-size:1.02rem;}
+  /* download zone */
+  .download{
+    margin-top:30px;text-align:center;
+    border-top:1px solid var(--line);padding-top:30px;
+  }
+  .download .btn-primary{width:auto;display:inline-block;padding:18px 44px;font-size:1.12rem;}
+  .download .aside{font-family:system-ui,sans-serif;font-size:.92rem;color:var(--ink-soft);margin-top:14px;}
+  .download .aside a{color:var(--accent-deep);text-decoration:underline;cursor:pointer;}
+  .reset{display:block;margin:26px auto 0;background:none;border:none;color:var(--ink-soft);
+    font-family:system-ui,sans-serif;font-size:.85rem;cursor:pointer;text-decoration:underline;}
+  @media(max-width:560px){
+    .card{padding:32px 24px;}
+    h1{font-size:1.7rem;}
+    .lede{font-size:1.05rem;}
+  }
+</style>
+</head>
+<body>
+  <div class="topbar">
+    <div class="brand">
+      <svg class="leaf" viewBox="0 0 40 40" fill="none">
+        <path d="M20 36C8 30 6 16 12 8c8 2 18 8 16 22-1 4-4 6-8 6z" fill="#e8f0ea" stroke="#7ba087" stroke-width="1.6"/>
+        <path d="M20 34c-1-8 0-16 6-22" stroke="#7ba087" stroke-width="1.6" stroke-linecap="round"/>
+        <path d="M18 24c-2-1-4-3-5-6M22 18c2 0 5 0 7-1" stroke="#7ba087" stroke-width="1.4" stroke-linecap="round"/>
+      </svg>
+      <div class="brand-name">ScrubData<small>tu lista, ordenada con calma</small></div>
+    </div>
+    <div class="lang" aria-label="idioma">
+      <button class="on">ES</button>
+      <button>EN</button>
+    </div>
+  </div>
+  <div class="stage">
+    <!-- ============ SCREEN 1 : WELCOME + DROP ============ -->
+    <section class="screen active" id="s1">
+      <div class="card">
+        <h1>Hola, Doña Lupe.<br/>Vamos a ordenar su lista, sin prisa.</h1>
+        <p class="lede">Suelte aquí su archivo y yo le echo un ojo. Usted no tiene que configurar nada.</p>
+        <div class="drop" onclick="goWork()">
+          <span class="basket">🧺</span>
+          <div class="big">Suelte su archivo aquí</div>
+          <div class="sub">o toque para buscarlo en su computadora · Excel o CSV</div>
+          <div class="filechip"><span class="dot"></span> ventas-del-mes.xlsx · listo para revisar</div>
+        </div>
+        <button class="btn btn-primary" onclick="goWork()">Ordénalo por mí</button>
+        <div class="safety">
+          <svg width="22" height="22" viewBox="0 0 24 24" fill="none"><path d="M12 2l8 3v6c0 5-3.5 8-8 9-4.5-1-8-4-8-9V5l8-3z" stroke="#5e8470" stroke-width="1.6"/><path d="M9 12l2 2 4-4" stroke="#5e8470" stroke-width="1.6" stroke-linecap="round" stroke-linejoin="round"/></svg>
+          <div><b>Su original se queda igualito.</b> Hago una copia limpia aparte. Nada sale de esta computadora.</div>
+        </div>
+      </div>
+    </section>
+    <!-- ============ SCREEN 2 : WORKING ============ -->
+    <section class="screen" id="s2">
+      <div class="card working">
+        <div class="breath"><span>🍃</span></div>
+        <h1>Trabajando aquí mismo, en su computadora…</h1>
+        <p class="lede" style="margin:8px auto 0;">Respire tranquila. Su original está a salvo. Esto toma un momentito.</p>
+        <ul class="steps" id="steps">
+          <li data-i="0"><span class="tick">✓</span> Leyendo su lista con cuidado</li>
+          <li data-i="1"><span class="tick">✓</span> Juntando los tacos que están escritos de varias formas</li>
+          <li data-i="2"><span class="tick">✓</span> Revisando teléfonos, fechas y espacios en blanco</li>
+          <li data-i="3"><span class="tick">✓</span> Apuntando lo que no estoy segura, para preguntarle</li>
+        </ul>
+      </div>
+    </section>
+    <!-- ============ SCREEN 3 : RESULT ============ -->
+    <section class="screen" id="s3">
+      <div class="card">
+        <div class="result-head">
+          <svg class="badge" viewBox="0 0 64 64" fill="none">
+            <circle cx="32" cy="32" r="29" fill="#fff" stroke="#d8a25e" stroke-width="2" stroke-dasharray="3 3"/>
+            <circle cx="32" cy="32" r="22" fill="#e8f0ea" stroke="#7ba087" stroke-width="1.6"/>
+            <path d="M24 33l5 5 11-12" stroke="#5e8470" stroke-width="2.6" stroke-linecap="round" stroke-linejoin="round"/>
+          </svg>
+          <div>
+            <div class="h-eyebrow">Listo · su resumen</div>
+            <h1 style="margin-top:2px;">Esto fue lo que encontré.</h1>
+          </div>
+        </div>
+        <div class="summary">
+          <p>Revisé su lista de <b>ventas-del-mes</b> con calma. Esto fue lo que arreglé:</p>
+          <p>· <span class="num">«Al pastor»</span> estaba escrito de 4 maneras distintas. Lo junté todo: <span class="num">1,204 vendidos</span>.</p>
+          <p>· <span class="num">23 personas</span> aparecían dos veces en su lista. Las reuní para que usted las mire.</p>
+          <p>· Puse todos los <span class="num">teléfonos</span> y las <span class="num">fechas</span> escritos igualito, fáciles de leer.</p>
+          <p>· <span class="num">14 espacios</span> decían «N/A» o solo un guion — los tomé como vacíos.</p>
+          <p style="font-family:system-ui,sans-serif;font-size:.98rem;color:var(--ink-soft);">Puede leerlo en voz alta a Yolanda o imprimirlo. Nada de esto tocó su archivo original.</p>
+        </div>
+        <!-- already-done change card (mechanical, safe) -->
+        <div class="section-title">Lo que ya dejé arreglado <span class="pill">hecho</span></div>
+        <div class="change">
+          <p class="lead">El mismo taco, escrito de varias formas — lo conté junto:</p>
+          <div class="ba">
+            <span class="chip">al pastor</span>
+            <span class="chip">Al Pastor</span>
+            <span class="chip">pastor</span>
+            <span class="chip">al pastór</span>
+            <span class="arrow">→</span>
+            <span class="chip after">Al pastor · 1,204</span>
+          </div>
+        </div>
+        <div class="change">
+          <p class="lead">Los teléfonos ahora se ven todos iguales:</p>
+          <div class="ba">
+            <span class="chip">55-1234 5678</span>
+            <span class="chip">5512345678</span>
+            <span class="arrow">→</span>
+            <span class="chip after">(55) 1234-5678</span>
+          </div>
+        </div>
+        <!-- gentle confirms (money / identity) -->
+        <div class="section-title">Antes de seguir, dos preguntitas <span class="pill" style="background:#f6ecd6;color:#a9742f;">usted decide</span></div>
+        <div class="ask">
+          <p class="q">Encontré <b>31 filas en $0.00</b> — parece un error del sistema.</p>
+          <p class="why">Si las dejo dentro, bajan su total del mes. ¿Las saco de la suma?</p>
+          <div class="row">
+            <button class="btn btn-primary" style="width:auto;margin:0;" onclick="this.closest('.ask').style.opacity=.55;this.closest('.row').innerHTML='<span style=&quot;font-family:system-ui;color:#5e8470;font-weight:600&quot;>✓ Hecho — las dejé fuera del total.</span>'">Sí, sácalas</button>
+            <button class="btn btn-ghost" onclick="this.closest('.row').innerHTML='<span style=&quot;font-family:system-ui;color:#6f675d&quot;>De acuerdo, las dejo en la suma.</span>'">No, déjalas</button>
+          </div>
+        </div>
+        <div class="ask">
+          <p class="q">Estas dos parecen <b>la misma persona</b>: «Yolanda R.» y «Yolanda Reyes».</p>
+          <p class="why">¿Las cuento como una sola, o son personas distintas?</p>
+          <div class="row">
+            <button class="btn btn-primary" style="width:auto;margin:0;" onclick="this.closest('.row').innerHTML='<span style=&quot;font-family:system-ui;color:#5e8470;font-weight:600&quot;>✓ Las junté en una.</span>'">Sí, es la misma</button>
+            <button class="btn btn-ghost" onclick="this.closest('.row').innerHTML='<span style=&quot;font-family:system-ui;color:#6f675d&quot;>Las dejé separadas.</span>'">Son distintas</button>
+          </div>
+        </div>
+        <!-- honest flags -->
+        <div class="section-title">No estuve segura de esto — se lo dejé a usted</div>
+        <div class="flag">
+          <span class="mark">🤔</span>
+          <p>Dos teléfonos tenían solo 7 dígitos. No quise inventar los que faltan.<br/>
+            <span class="small">Los dejé tal cual para que usted los revise contra su libreta.</span></p>
+        </div>
+        <div class="flag">
+          <span class="mark">🧮</span>
+          <p>El total de su caja dice <b>$48,920</b>, pero su lista suma <b>$48,655</b>.<br/>
+            <span class="small">No cuadran por $265 — aquí se lo marco para que lo compare con su efectivo.</span></p>
+        </div>
+        <!-- bonus -->
+        <div class="bonus">
+          <span class="ic">🌶️</span>
+          <div>
+            <h3>Ah, y una cosita más…</h3>
+            <p>Se le está acabando el <b>adobo de pastor</b> — fue el más vendido del mes. Quizá conviene pedir más antes del finde.</p>
+          </div>
+        </div>
+        <!-- download -->
+        <div class="download">
+          <button class="btn btn-primary">Descargar mi copia limpia</button>
+          <p class="aside">
+            Su original sigue a salvo en su USB. ·
+            <a onclick="alert('Su archivo original nunca se tocó — está justo donde lo dejó.')">Devolver todo como estaba</a><br/>
+            También puede <a onclick="window.print()">imprimir este resumen</a> para Yolanda.
+          </p>
+        </div>
+        <div class="safety" style="margin-top:30px;">
+          <svg width="22" height="22" viewBox="0 0 24 24" fill="none"><path d="M12 2l8 3v6c0 5-3.5 8-8 9-4.5-1-8-4-8-9V5l8-3z" stroke="#5e8470" stroke-width="1.6"/><path d="M9 12l2 2 4-4" stroke="#5e8470" stroke-width="1.6" stroke-linecap="round" stroke-linejoin="round"/></svg>
+          <div><b>Buen trabajo, Doña Lupe.</b> Su lista quedó en buen estado, y usted la revisó con sus propios ojos. Nada salió de esta computadora.</div>
+        </div>
+        <button class="reset" onclick="reset()">Empezar de nuevo con otro archivo</button>
+      </div>
+    </section>
+  </div>
+<script>
+  function show(id){
+    document.querySelectorAll('.screen').forEach(s=>s.classList.remove('active'));
+    document.getElementById(id).classList.add('active');
+    window.scrollTo({top:0,behavior:'smooth'});
+  }
+  function goWork(){
+    show('s2');
+    const lis=document.querySelectorAll('#steps li');
+    lis.forEach(l=>l.classList.remove('done'));
+    let i=0;
+    const t=setInterval(()=>{
+      if(i<lis.length){lis[i].classList.add('done');i++;}
+      else{clearInterval(t);setTimeout(()=>show('s3'),650);}
+    },720);
+  }
+  function reset(){show('s1');}
+</script>
+</body>
+</html>

design/mockups/cozy/index.html ADDED Viewed

	@@ -0,0 +1,526 @@

+<!DOCTYPE html>
+<html lang="es">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>ScrubData — tu ayudante de cocina para los números</title>
+<style>
+  :root{
+    --paper:#fbf4e7;
+    --paper-2:#f5ead4;
+    --card:#fffaf0;
+    --ink:#4a3a2c;
+    --ink-soft:#7a6a58;
+    --line:#e6d6ba;
+    --moss:#6f8f5a;
+    --moss-deep:#52733f;
+    --berry:#c4694e;
+    --gold:#d9a441;
+    --sky:#8aa9b8;
+    --shadow:0 10px 30px rgba(120,90,50,.12);
+    --shadow-soft:0 4px 14px rgba(120,90,50,.10);
+    --radius:22px;
+  }
+  *{box-sizing:border-box}
+  html,body{margin:0}
+  body{
+    font-family:"Iowan Old Style","Palatino Linotype","Book Antiqua",Georgia,"Segoe UI",serif;
+    color:var(--ink);
+    background:
+      radial-gradient(circle at 15% 8%, #fdf8ee 0%, transparent 40%),
+      radial-gradient(circle at 90% 92%, #f6ecd6 0%, transparent 45%),
+      var(--paper);
+    line-height:1.55;
+    -webkit-font-smoothing:antialiased;
+    min-height:100vh;
+  }
+  /* faint paper grain + dotted-trail texture */
+  body::before{
+    content:"";position:fixed;inset:0;pointer-events:none;z-index:0;opacity:.5;
+    background-image:radial-gradient(rgba(180,150,100,.10) 1px, transparent 1.4px);
+    background-size:22px 22px;
+  }
+  .wrap{position:relative;z-index:1;max-width:880px;margin:0 auto;padding:28px 20px 80px}
+  /* ---------- top bar ---------- */
+  .topbar{display:flex;align-items:center;justify-content:space-between;gap:12px;margin-bottom:14px}
+  .brand{display:flex;align-items:center;gap:12px}
+  .logo{width:46px;height:46px;flex:0 0 auto}
+  .brand h1{font-size:1.32rem;margin:0;letter-spacing:.2px}
+  .brand .tag{margin:0;font-size:.86rem;color:var(--ink-soft);font-style:italic}
+  .lang{display:flex;background:var(--card);border:1.5px solid var(--line);border-radius:999px;padding:3px;box-shadow:var(--shadow-soft)}
+  .lang button{border:0;background:transparent;font:inherit;font-size:.85rem;color:var(--ink-soft);padding:5px 13px;border-radius:999px;cursor:pointer}
+  .lang button.on{background:var(--moss);color:#fff;box-shadow:0 2px 6px rgba(80,110,60,.35)}
+  /* ---------- persistent safety ribbon ---------- */
+  .safe{
+    display:flex;align-items:center;gap:12px;
+    background:linear-gradient(180deg,#f2f7ec,#eaf2e0);
+    border:1.5px solid #d6e3c4;border-radius:16px;
+    padding:11px 16px;margin-bottom:22px;box-shadow:var(--shadow-soft);
+  }
+  .safe svg{flex:0 0 auto}
+  .safe p{margin:0;font-size:.92rem;color:var(--moss-deep)}
+  .safe b{color:var(--moss-deep)}
+  /* ---------- cards / screens ---------- */
+  .screen{display:none;animation:rise .5s ease both}
+  .screen.active{display:block}
+  @keyframes rise{from{opacity:0;transform:translateY(14px)}to{opacity:1;transform:none}}
+  .card{
+    background:var(--card);border:1.5px solid var(--line);
+    border-radius:var(--radius);box-shadow:var(--shadow);
+    padding:30px 30px 32px;position:relative;
+  }
+  .card + .card{margin-top:20px}
+  /* ---------- welcome / drop ---------- */
+  .hello{text-align:center}
+  .hello h2{font-size:1.75rem;margin:6px 0 6px}
+  .hello .sub{color:var(--ink-soft);font-size:1.05rem;margin:0 auto 24px;max-width:520px}
+  .drop{
+    border:2.5px dashed #d8b873;border-radius:20px;
+    background:linear-gradient(180deg,#fffdf6,#fdf3df);
+    padding:38px 24px;text-align:center;cursor:pointer;transition:.2s;
+  }
+  .drop:hover{border-color:var(--gold);background:#fff8e8;transform:translateY(-2px)}
+  .drop .basket{font-size:0;line-height:0;margin-bottom:10px}
+  .drop h3{margin:8px 0 4px;font-size:1.2rem}
+  .drop p{margin:0;color:var(--ink-soft);font-size:.95rem}
+  .or{color:var(--ink-soft);font-size:.85rem;margin:14px 0 4px}
+  .filechip{
+    display:inline-flex;align-items:center;gap:10px;background:#fff;border:1.5px solid var(--line);
+    border-radius:14px;padding:9px 14px;margin-top:6px;font-size:.92rem;box-shadow:var(--shadow-soft)
+  }
+  .filechip .x{color:var(--ink-soft);font-size:.8rem}
+  .btn{
+    border:0;font:inherit;cursor:pointer;border-radius:16px;font-size:1.06rem;
+    padding:14px 30px;font-weight:600;letter-spacing:.2px;transition:.16s;
+  }
+  .btn-go{background:var(--berry);color:#fff;box-shadow:0 6px 16px rgba(196,105,78,.35);margin-top:24px}
+  .btn-go:hover{transform:translateY(-2px);box-shadow:0 9px 22px rgba(196,105,78,.42)}
+  .btn-ghost{background:#fff;color:var(--ink);border:1.5px solid var(--line)}
+  .btn-ghost:hover{background:#fffdf6}
+  /* ---------- tidying ---------- */
+  .tidy{text-align:center;padding:54px 30px}
+  .tidy h2{font-size:1.5rem;margin:18px 0 6px}
+  .tidy p{color:var(--ink-soft);margin:0 auto;max-width:440px}
+  .scene{width:160px;height:120px;margin:0 auto 6px;position:relative}
+  .broom{position:absolute;left:46px;top:6px;transform-origin:78px 12px;animation:sweep 1.1s ease-in-out infinite}
+  @keyframes sweep{0%,100%{transform:rotate(-13deg)}50%{transform:rotate(13deg)}}
+  .spk{position:absolute;font-size:0;animation:twinkle 1.4s ease-in-out infinite}
+  .spk:nth-child(2){left:24px;top:70px;animation-delay:.1s}
+  .spk:nth-child(3){left:120px;top:54px;animation-delay:.5s}
+  .spk:nth-child(4){left:70px;top:96px;animation-delay:.8s}
+  @keyframes twinkle{0%,100%{opacity:.2;transform:scale(.7)}50%{opacity:1;transform:scale(1.1)}}
+  .bar{height:12px;background:#efe2c8;border-radius:99px;overflow:hidden;max-width:340px;margin:22px auto 0;border:1px solid var(--line)}
+  .bar i{display:block;height:100%;width:0;background:linear-gradient(90deg,var(--moss),var(--gold));border-radius:99px;animation:fill 4.2s ease forwards}
+  @keyframes fill{to{width:100%}}
+  .tidy .micro{font-size:.85rem;color:var(--moss-deep);margin-top:14px}
+  /* ---------- result ---------- */
+  .result-head{text-align:center;margin-bottom:6px}
+  .badge-row{display:flex;justify-content:center;gap:10px;margin-bottom:8px}
+  .merit{display:flex;flex-direction:column;align-items:center;gap:4px;font-size:.72rem;color:var(--moss-deep);width:84px;text-align:center}
+  .result-head h2{font-size:1.6rem;margin:6px 0 2px}
+  .result-head .sub{color:var(--ink-soft);margin:0 0 6px}
+  .summary{background:linear-gradient(180deg,#fffdf6,#fbf3e0);border:1.5px solid var(--line)}
+  .summary h3{margin:0 0 4px;font-size:1.22rem}
+  .summary .read{font-size:.82rem;color:var(--ink-soft);font-style:italic;margin:0 0 14px}
+  .sline{display:flex;gap:13px;align-items:flex-start;padding:11px 0;border-top:1px dotted var(--line)}
+  .sline:first-of-type{border-top:0}
+  .sline .ic{flex:0 0 auto;margin-top:2px}
+  .sline p{margin:0;font-size:1.02rem}
+  .sline b{color:var(--moss-deep)}
+  .secttitle{font-size:1.05rem;color:var(--ink-soft);margin:26px 4px 10px;display:flex;align-items:center;gap:8px;font-style:italic}
+  /* change cards */
+  .chg{padding:18px 20px}
+  .chg.done{border-left:6px solid var(--moss)}
+  .chg.ask{border-left:6px solid var(--gold);background:linear-gradient(180deg,#fffdf3,#fdf6e2)}
+  .chg.flag{border-left:6px solid var(--sky)}
+  .chg h4{margin:0 0 10px;font-size:1.08rem;display:flex;align-items:center;gap:9px}
+  .chk{font-size:.72rem;background:#eaf2e0;color:var(--moss-deep);padding:2px 9px;border-radius:99px;font-weight:600;letter-spacing:.3px}
+  .pill-ask{font-size:.72rem;background:#f7ead0;color:#9a7a2e;padding:2px 9px;border-radius:99px;font-weight:600;letter-spacing:.3px}
+  .pill-flag{font-size:.72rem;background:#e3edf2;color:#5b7d8c;padding:2px 9px;border-radius:99px;font-weight:600;letter-spacing:.3px}
+  .ba{display:flex;gap:12px;align-items:stretch;flex-wrap:wrap}
+  .ba .col{flex:1 1 200px;border:1.5px solid var(--line);border-radius:14px;overflow:hidden;background:#fff}
+  .ba .col .ttl{font-size:.74rem;letter-spacing:.6px;text-transform:uppercase;color:var(--ink-soft);padding:7px 12px;background:#faf4e6;border-bottom:1px solid var(--line)}
+  .ba .col.after .ttl{background:#eef5e6;color:var(--moss-deep)}
+  .row{display:flex;justify-content:space-between;gap:10px;padding:7px 12px;font-size:.95rem;border-top:1px dashed #efe6d2}
+  .row:first-of-type{border-top:0}
+  .row .q{color:var(--ink-soft)}
+  .ba .col.before .was{color:#a9947d}
+  .ba .col.after .now{color:var(--moss-deep);font-weight:600}
+  .arrow{display:flex;align-items:center;color:var(--gold);font-size:1.3rem}
+  @media(max-width:560px){.arrow{transform:rotate(90deg)}}
+  .askbtns{display:flex;gap:10px;margin-top:14px;flex-wrap:wrap}
+  .askbtns .yes{background:var(--moss);color:#fff;padding:9px 18px;border-radius:13px;border:0;font:inherit;font-weight:600;cursor:pointer}
+  .askbtns .no{background:#fff;color:var(--ink);border:1.5px solid var(--line);padding:9px 18px;border-radius:13px;font:inherit;cursor:pointer}
+  .askbtns .yes:hover{background:var(--moss-deep)}
+  .answered{display:none;align-items:center;gap:8px;color:var(--moss-deep);font-size:.92rem;margin-top:12px;background:#eef5e6;padding:8px 12px;border-radius:11px}
+  /* bonus card */
+  .bonus{background:linear-gradient(135deg,#fdf6e6,#f6efe0);border:1.5px solid #ecd9b0}
+  .bonus h4{margin:0 0 6px;font-size:1.12rem;display:flex;align-items:center;gap:9px}
+  .bonus ul{margin:8px 0 0;padding-left:4px;list-style:none}
+  .bonus li{padding:5px 0;font-size:1rem;display:flex;gap:9px;align-items:center}
+  .bonus li .dot{width:9px;height:9px;border-radius:99px;background:var(--berry);flex:0 0 auto}
+  /* download footer */
+  .getit{text-align:center;background:linear-gradient(180deg,#f2f7ec,#e9f1de);border:1.5px solid #d6e3c4}
+  .getit h3{margin:0 0 4px;font-size:1.3rem;color:var(--moss-deep)}
+  .getit p{margin:0 0 18px;color:var(--ink-soft)}
+  .getit .btns{display:flex;gap:12px;justify-content:center;flex-wrap:wrap}
+  .btn-dl{background:var(--moss);color:#fff;box-shadow:0 6px 16px rgba(80,110,60,.32)}
+  .btn-dl:hover{transform:translateY(-2px)}
+  .undo{margin-top:18px;font-size:.9rem;color:var(--moss-deep)}
+  .undo a{color:var(--berry);text-decoration:underline;cursor:pointer}
+  .restart{display:block;margin:26px auto 0;color:var(--ink-soft);background:none;border:0;font:inherit;font-size:.85rem;text-decoration:underline;cursor:pointer}
+  .footnote{text-align:center;color:var(--ink-soft);font-size:.8rem;margin-top:30px;font-style:italic}
+  .es{display:none}
+  body.es-on .en{display:none}
+  body.es-on .es{display:inline}
+  body.es-on .es.block{display:block}
+</style>
+</head>
+<body class="es-on">
+<div class="wrap">
+  <!-- top bar -->
+  <div class="topbar">
+    <div class="brand">
+      <svg class="logo" viewBox="0 0 48 48" fill="none">
+        <path d="M24 4c7 0 12 4 12 4s-2 8-2 14c0 9-5 18-10 18S14 31 14 22c0-6-2-14-2-14s5-4 12-4z" fill="#7e9f63" stroke="#52733f" stroke-width="1.6"/>
+        <path d="M24 9v28" stroke="#52733f" stroke-width="1.4"/>
+        <path d="M24 18l6-5M24 24l-6-5M24 30l6-5" stroke="#52733f" stroke-width="1.3"/>
+      </svg>
+      <div>
+        <h1>ScrubData</h1>
+        <p class="tag"><span class="es">tu ayudante para ordenar tus listas</span><span class="en">your little helper for tidy lists</span></p>
+      </div>
+    </div>
+    <div class="lang">
+      <button id="bES" class="on" onclick="setLang('es')">Español</button>
+      <button id="bEN" onclick="setLang('en')">English</button>
+    </div>
+  </div>
+  <!-- persistent safety ribbon -->
+  <div class="safe">
+    <svg width="26" height="26" viewBox="0 0 24 24" fill="none"><path d="M12 2l8 3v6c0 5-3.5 9-8 11C7.5 20 4 16 4 11V5l8-3z" fill="#cfe0bd" stroke="#52733f" stroke-width="1.4"/><path d="M8.5 12l2.5 2.5L16 9" stroke="#52733f" stroke-width="1.7" stroke-linecap="round" stroke-linejoin="round"/></svg>
+    <p>
+      <span class="es"><b>Tu archivo original queda igualito.</b> Nada sale de esta computadora — todo se hace aquí mismo.</span>
+      <span class="en"><b>Your original stays exactly as it is.</b> Nothing leaves this computer — it all happens right here.</span>
+    </p>
+  </div>
+  <!-- ===================== SCREEN 1: WELCOME ===================== -->
+  <section id="s1" class="screen active">
+    <div class="card hello">
+      <div style="font-size:0;line-height:0">
+        <svg width="86" height="74" viewBox="0 0 86 74" fill="none" style="margin:0 auto">
+          <ellipse cx="43" cy="64" rx="30" ry="6" fill="#ead9b9"/>
+          <path d="M16 40h54l-5 22a4 4 0 0 1-4 3H25a4 4 0 0 1-4-3L16 40z" fill="#e7b86b" stroke="#b9863a" stroke-width="1.6"/>
+          <path d="M16 40h54" stroke="#b9863a" stroke-width="1.6"/>
+          <path d="M22 40c0-12 9-21 21-21s21 9 21 21" stroke="#b9863a" stroke-width="1.6" fill="#f3d9a3"/>
+          <circle cx="34" cy="33" r="3" fill="#c4694e"/><circle cx="50" cy="31" r="3" fill="#6f8f5a"/><circle cx="43" cy="36" r="3" fill="#d9a441"/>
+        </svg>
+      </div>
+      <h2>
+        <span class="es">Hola, Doña Lupe. ¿Le ayudo con su lista?</span>
+        <span class="en">Hi, Lupe. Want a hand with your list?</span>
+      </h2>
+      <p class="sub">
+        <span class="es">Suéltela aquí y yo la reviso con calma — sin botones raros ni cosas que configurar.</span>
+        <span class="en">Drop it here and I'll look it over, calmly — no strange buttons, nothing to set up.</span>
+      </p>
+      <div class="drop" onclick="pick()">
+        <div class="basket">
+          <svg width="58" height="50" viewBox="0 0 58 50" fill="none" style="margin:0 auto">
+            <path d="M6 22h46l-4 22a3 3 0 0 1-3 3H13a3 3 0 0 1-3-3L6 22z" fill="#f3e3c2" stroke="#c79a52" stroke-width="1.6"/>
+            <path d="M6 22h46M16 22l3 25M40 22l-3 25M29 22v25" stroke="#c79a52" stroke-width="1.2"/>
+            <path d="M16 22c0-9 6-15 13-15s13 6 13 15" stroke="#c79a52" stroke-width="1.6"/>
+          </svg>
+        </div>
+        <h3><span class="es">Suelte su archivo aquí</span><span class="en">Drop your file here</span></h3>
+        <p><span class="es">Excel o CSV — yo me encargo del resto.</span><span class="en">Excel or CSV — I'll handle the rest.</span></p>
+        <p class="or"><span class="es">— o —</span><span class="en">— or —</span></p>
+        <span class="filechip">
+          <svg width="16" height="16" viewBox="0 0 24 24" fill="none"><path d="M6 3h8l5 5v13a1 1 0 0 1-1 1H6a1 1 0 0 1-1-1V4a1 1 0 0 1 1-1z" fill="#f3e3c2" stroke="#c79a52" stroke-width="1.4"/></svg>
+          <span class="es">elegir de mi computadora</span><span class="en">choose from my computer</span>
+        </span>
+      </div>
+      <div style="margin-top:8px">
+        <span class="filechip" style="border-color:#cfe0bd;background:#f2f7ec">
+          <svg width="15" height="15" viewBox="0 0 24 24"><circle cx="12" cy="12" r="10" fill="#cfe0bd"/><path d="M8 12l3 3 5-6" stroke="#52733f" stroke-width="2" fill="none" stroke-linecap="round"/></svg>
+          ventas-mayo.xlsx <span class="x">· 1,431 <span class="es">renglones</span><span class="en">lines</span></span>
+        </span>
+      </div>
+      <button class="btn btn-go" onclick="go()">
+        <span class="es">Vamos a ordenarla ✦</span><span class="en">Let's tidy it up ✦</span>
+      </button>
+    </div>
+  </section>
+  <!-- ===================== SCREEN 2: TIDYING ===================== -->
+  <section id="s2" class="screen">
+    <div class="card tidy">
+      <div class="scene">
+        <span class="spk"><svg width="14" height="14" viewBox="0 0 24 24"><path d="M12 2l2 8 8 2-8 2-2 8-2-8-8-2 8-2 2-8z" fill="#d9a441"/></svg></span>
+        <span class="spk"><svg width="11" height="11" viewBox="0 0 24 24"><path d="M12 2l2 8 8 2-8 2-2 8-2-8-8-2 8-2 2-8z" fill="#6f8f5a"/></svg></span>
+        <span class="spk"><svg width="13" height="13" viewBox="0 0 24 24"><path d="M12 2l2 8 8 2-8 2-2 8-2-8-8-2 8-2 2-8z" fill="#c4694e"/></svg></span>
+        <div class="broom">
+          <svg width="64" height="110" viewBox="0 0 64 110" fill="none">
+            <rect x="30" y="2" width="5" height="64" rx="2.5" fill="#b9863a"/>
+            <path d="M18 64h28l6 38c0 3-3 4-6 4H18c-3 0-6-1-6-4l6-38z" fill="#e7b86b" stroke="#b9863a" stroke-width="1.6"/>
+            <path d="M22 78v24M30 78v26M38 78v24M46 78v22" stroke="#b9863a" stroke-width="1.3"/>
+          </svg>
+        </div>
+      </div>
+      <h2><span class="es">Ordenando con cuidado…</span><span class="en">Tidying up, gently…</span></h2>
+      <p>
+        <span class="es">Estoy aquí mismo en su computadora, sin prisas. Su archivo original sigue a salvo.</span>
+        <span class="en">I'm right here on your computer, taking my time. Your original is safe.</span>
+      </p>
+      <div class="bar"><i></i></div>
+      <p class="micro" id="step">
+        <span class="es">Juntando los tacos que están escritos de varias maneras…</span>
+        <span class="en">Gathering the items written a few different ways…</span>
+      </p>
+    </div>
+  </section>
+  <!-- ===================== SCREEN 3: RESULT ===================== -->
+  <section id="s3" class="screen">
+    <!-- merit + hero -->
+    <div class="card result-head">
+      <div class="badge-row">
+        <div class="merit">
+          <svg width="52" height="52" viewBox="0 0 52 52"><circle cx="26" cy="26" r="22" fill="#eef5e6" stroke="#6f8f5a" stroke-width="2"/><path d="M26 6l4 5 6-2-1 6 6 3-5 4 2 6-6-1-3 6-3-6-6 1 2-6-5-4 6-3-1-6 6 2 4-5z" fill="#cfe0bd"/><path d="M20 26l4 4 8-9" stroke="#52733f" stroke-width="2.4" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg>
+          <span><span class="es">lista ordenada</span><span class="en">tidy list</span></span>
+        </div>
+        <div class="merit">
+          <svg width="52" height="52" viewBox="0 0 52 52"><circle cx="26" cy="26" r="22" fill="#fdf2dc" stroke="#d9a441" stroke-width="2"/><path d="M26 14a8 8 0 0 1 8 8c0 5-8 12-8 12s-8-7-8-12a8 8 0 0 1 8-8z" fill="#f3d9a3" stroke="#b9863a" stroke-width="1.4"/><circle cx="26" cy="22" r="3" fill="#c4694e"/></svg>
+          <span><span class="es">nada se subió</span><span class="en">nothing uploaded</span></span>
+        </div>
+      </div>
+      <h2><span class="es">Listo. Esto fue lo que encontré 🌿</span><span class="en">All done. Here's what I found 🌿</span></h2>
+      <p class="sub"><span class="es">Léalo con calma. Usted decide lo que toca el dinero.</span><span class="en">Read it calmly. You decide anything that touches money.</span></p>
+    </div>
+    <!-- THE SUMMARY (hero) -->
+    <div class="card summary">
+      <h3><span class="es">Su resumen, en palabras sencillas</span><span class="en">Your summary, in plain words</span></h3>
+      <p class="read"><span class="es">— puede leerlo en voz alta a Yolanda, o imprimirlo.</span><span class="en">— you can read it aloud to Yolanda, or print it.</span></p>
+      <div class="sline">
+        <span class="ic"><svg width="22" height="22" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#eef5e6"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg></span>
+        <p class="es">El <b>al pastor</b> estaba escrito de <b>4 maneras</b> (al pastor, Al Pastor, pastor, "al pstr"). Los junté todos: <b>1,204 vendidos</b>.</p>
+        <p class="en"><b>Al pastor</b> was written <b>4 ways</b> (al pastor, Al Pastor, pastor, "al pstr"). I counted them together: <b>1,204 sold</b>.</p>
+      </div>
+      <div class="sline">
+        <span class="ic"><svg width="22" height="22" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#eef5e6"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg></span>
+        <p class="es">Unos espacios estaban en blanco (escritos como <b>"N/A"</b> o solo una raya). Los traté como vacíos.</p>
+        <p class="en">Some spots were left blank (written as <b>"N/A"</b> or just a dash). I treated those as empty.</p>
+      </div>
+      <div class="sline">
+        <span class="ic"><svg width="22" height="22" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#eef5e6"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg></span>
+        <p class="es">Puse todos los <b>teléfonos</b> y las <b>fechas</b> igualitos, para que se lean fácil.</p>
+        <p class="en">I made all the <b>phone numbers</b> and <b>dates</b> match, so they're easy to read.</p>
+      </div>
+      <div class="sline">
+        <span class="ic"><svg width="22" height="22" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#fdf2dc"/><path d="M12 6v7M12 16.5v.5" stroke="#b9863a" stroke-width="2.2" fill="none" stroke-linecap="round"/></svg></span>
+        <p class="es">Hay <b>2 cositas</b> que prefiero <b>preguntarle</b> antes de tocar — porque tienen que ver con dinero. Están abajo. 👇</p>
+        <p class="en">There are <b>2 things</b> I'd rather <b>ask you</b> about before touching — because they involve money. They're below. 👇</p>
+      </div>
+    </div>
+    <!-- DONE change card with before/after -->
+    <div class="secttitle">
+      <svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="10" fill="#cfe0bd"/><path d="M8 12l3 3 5-6" stroke="#52733f" stroke-width="2" fill="none" stroke-linecap="round"/></svg>
+      <span class="es">Lo que ya dejé arregladito</span><span class="en">What I already tidied for you</span>
+    </div>
+    <div class="card chg done">
+      <h4><span class="es">El mismo taco, contado junto</span><span class="en">The same taco, counted together</span> <span class="chk"><span class="es">YA HECHO</span><span class="en">DONE</span></span></h4>
+      <div class="ba">
+        <div class="col before">
+          <div class="ttl"><span class="es">Antes</span><span class="en">Before</span></div>
+          <div class="row"><span class="q was">al pastor</span><span class="was">312</span></div>
+          <div class="row"><span class="q was">Al Pastor</span><span class="was">520</span></div>
+          <div class="row"><span class="q was">pastor</span><span class="was">301</span></div>
+          <div class="row"><span class="q was">al pstr</span><span class="was">71</span></div>
+        </div>
+        <div class="arrow">➜</div>
+        <div class="col after">
+          <div class="ttl"><span class="es">Después</span><span class="en">After</span></div>
+          <div class="row"><span class="q">Al pastor</span><span class="now">1,204</span></div>
+          <div class="row" style="color:var(--ink-soft)"><span class="q" style="font-style:italic"><span class="es">una sola fila, bien clara</span><span class="en">one tidy line</span></span><span></span></div>
+        </div>
+      </div>
+    </div>
+    <div class="card chg done">
+      <h4><span class="es">Los blancos disfrazados</span><span class="en">The disguised blanks</span> <span class="chk"><span class="es">YA HECHO</span><span class="en">DONE</span></span></h4>
+      <div class="ba">
+        <div class="col before">
+          <div class="ttl"><span class="es">Antes</span><span class="en">Before</span></div>
+          <div class="row"><span class="q was">tel.</span><span class="was">N/A</span></div>
+          <div class="row"><span class="q was">notas</span><span class="was">—</span></div>
+          <div class="row"><span class="q was">extra</span><span class="was">none</span></div>
+        </div>
+        <div class="arrow">➜</div>
+        <div class="col after">
+          <div class="ttl"><span class="es">Después</span><span class="en">After</span></div>
+          <div class="row"><span class="q">tel.</span><span class="now"><span class="es">(vacío)</span><span class="en">(empty)</span></span></div>
+          <div class="row"><span class="q">notas</span><span class="now"><span class="es">(vacío)</span><span class="en">(empty)</span></span></div>
+          <div class="row"><span class="q">extra</span><span class="now"><span class="es">(vacío)</span><span class="en">(empty)</span></span></div>
+        </div>
+      </div>
+    </div>
+    <!-- ASK cards (money / identity) -->
+    <div class="secttitle">
+      <svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="10" fill="#f3d9a3"/><path d="M12 7v6M12 16v.5" stroke="#b9863a" stroke-width="2" fill="none" stroke-linecap="round"/></svg>
+      <span class="es">Aquí mejor le pregunto a usted</span><span class="en">Here I'd better ask you</span>
+    </div>
+    <div class="card chg ask">
+      <h4><span class="es">31 renglones marcaron $0.00</span><span class="en">31 lines showed $0.00</span> <span class="pill-ask"><span class="es">¿ME DICE?</span><span class="en">YOUR CALL</span></span></h4>
+      <p style="margin:0 0 4px">
+        <span class="es">Encontré <b>31 ventas en $0.00</b> — eso casi siempre es una falla de la caja, no una venta de verdad. ¿Quiere que las <b>deje fuera del total</b> del mes?</span>
+        <span class="en">I found <b>31 sales at $0.00</b> — that's usually a register glitch, not a real sale. Want me to <b>leave them out of the month's total</b>?</span>
+      </p>
+      <div class="askbtns">
+        <button class="yes" onclick="answer(this)"><span class="es">Sí, déjalas fuera</span><span class="en">Yes, leave them out</span></button>
+        <button class="no" onclick="answer(this)"><span class="es">No, déjalas</span><span class="en">No, keep them</span></button>
+      </div>
+      <div class="answered">
+        <svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#cfe0bd"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round"/></svg>
+        <span class="es">Listo — usted decidió. Lo anoté en su resumen.</span><span class="en">Done — your call. I noted it in your summary.</span>
+      </div>
+    </div>
+    <div class="card chg ask">
+      <h4><span class="es">Dos clientes parecen el mismo</span><span class="en">Two customers look like the same one</span> <span class="pill-ask"><span class="es">¿ME DICE?</span><span class="en">YOUR CALL</span></span></h4>
+      <p style="margin:0 0 4px">
+        <span class="es"><b>"Yolanda R."</b> y <b>"Yolanda Reyes"</b> tienen el mismo teléfono. ¿Los <b>cuento como una sola persona</b>?</span>
+        <span class="en"><b>"Yolanda R."</b> and <b>"Yolanda Reyes"</b> share the same phone. Should I <b>count them as one person</b>?</span>
+      </p>
+      <div class="askbtns">
+        <button class="yes" onclick="answer(this)"><span class="es">Sí, es la misma</span><span class="en">Yes, same person</span></button>
+        <button class="no" onclick="answer(this)"><span class="es">No, son distintas</span><span class="en">No, keep both</span></button>
+      </div>
+      <div class="answered">
+        <svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#cfe0bd"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round"/></svg>
+        <span class="es">Listo — usted decidió.</span><span class="en">Done — your call.</span>
+      </div>
+    </div>
+    <!-- HONEST FLAGS -->
+    <div class="secttitle">
+      <svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="10" fill="#e3edf2"/><path d="M12 7v5M12 15v.5" stroke="#5b7d8c" stroke-width="2" fill="none" stroke-linecap="round"/></svg>
+      <span class="es">No estuve segura de esto — lo dejé para usted</span><span class="en">I wasn't sure about these — I left them for you</span>
+    </div>
+    <div class="card chg flag">
+      <h4><span class="es">Dos teléfonos raros y una nota de catering</span><span class="en">Two odd phones and a catering note</span> <span class="pill-flag"><span class="es">PARA REVISAR</span><span class="en">FOR YOU</span></span></h4>
+      <p style="margin:0">
+        <span class="es">Dos teléfonos tienen muy pocos números, y una nota dice "evento — preguntar a Memo". No quise adivinar, así que <b>los dejé tal cual</b> para que usted los vea con calma.</span>
+        <span class="en">Two phones have too few digits, and one note says "event — ask Memo." I didn't want to guess, so I <b>left them exactly as they were</b> for you to peek at.</span>
+      </p>
+    </div>
+    <!-- BONUS -->
+    <div class="card bonus">
+      <h4>
+        <svg width="24" height="24" viewBox="0 0 24 24"><path d="M5 9h14l-1.3 9.2A2 2 0 0 1 15.7 20H8.3a2 2 0 0 1-2-1.8L5 9z" fill="#f3d9a3" stroke="#b9863a" stroke-width="1.3"/><path d="M8 9a4 4 0 0 1 8 0" stroke="#b9863a" stroke-width="1.3" fill="none"/></svg>
+        <span class="es">De pilón: lo que se le está acabando</span><span class="en">A little bonus: what you're running low on</span>
+      </h4>
+      <p style="margin:0;color:var(--ink-soft)">
+        <span class="es">Ya que andábamos en sus números, le aparté esto para el pedido:</span>
+        <span class="en">While I was in your numbers, I set this aside for your reorder:</span>
+      </p>
+      <ul>
+        <li><span class="dot"></span><span class="es"><b>Marinada de pastor</b> — para ~3 días. Tal vez pedir el lunes.</span><span class="en"><b>Pastor marinade</b> — about 3 days left. Maybe order Monday.</span></li>
+        <li><span class="dot"></span><span class="es"><b>Tortillas</b> — bajando rápido este fin de semana.</span><span class="en"><b>Tortillas</b> — going fast this weekend.</span></li>
+      </ul>
+    </div>
+    <!-- GET MY CLEAN COPY -->
+    <div class="card getit">
+      <svg width="58" height="58" viewBox="0 0 58 58" style="margin:0 auto 6px"><circle cx="29" cy="29" r="26" fill="#dcebcb" stroke="#6f8f5a" stroke-width="2"/><path d="M29 16v18M22 28l7 7 7-7" stroke="#52733f" stroke-width="3" fill="none" stroke-linecap="round" stroke-linejoin="round"/><path d="M19 40h20" stroke="#52733f" stroke-width="3" stroke-linecap="round"/></svg>
+      <h3><span class="es">¡Quedó preciosa, Doña Lupe!</span><span class="en">It looks lovely, Lupe!</span></h3>
+      <p><span class="es">Aquí está su copia limpia y su resumen para imprimir o mandar por correo.</span><span class="en">Here's your clean copy and your summary to print or email.</span></p>
+      <div class="btns">
+        <button class="btn btn-dl"><span class="es">Bajar mi copia limpia</span><span class="en">Get my clean copy</span></button>
+        <button class="btn btn-ghost"><span class="es">Imprimir el resumen</span><span class="en">Print the summary</span></button>
+      </div>
+      <p class="undo">
+        <svg width="15" height="15" viewBox="0 0 24 24" style="vertical-align:-2px"><path d="M12 5V2L7 7l5 5V8a6 6 0 1 1-6 6" stroke="#52733f" stroke-width="1.8" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg>
+        <span class="es">Su archivo original sigue a salvo. <a>Déjelo como estaba</a> cuando quiera.</span>
+        <span class="en">Your original is safe. <a>Put it back the way it was</a> any time.</span>
+      </p>
+    </div>
+    <button class="restart" onclick="reset()"><span class="es">↺ empezar de nuevo con otro archivo</span><span class="en">↺ start over with another file</span></button>
+  </section>
+  <p class="footnote">
+    <span class="es">Hecho con cariño para una hora tranquila en la mesa de la cocina · funciona sin internet</span>
+    <span class="en">Made with care for a quiet hour at the kitchen table · works without internet</span>
+  </p>
+</div>
+<script>
+  function setLang(l){
+    document.body.classList.toggle('es-on', l==='es');
+    document.getElementById('bES').classList.toggle('on', l==='es');
+    document.getElementById('bEN').classList.toggle('on', l!=='es');
+    document.documentElement.lang = l;
+  }
+  function show(id){
+    document.querySelectorAll('.screen').forEach(s=>s.classList.remove('active'));
+    document.getElementById(id).classList.add('active');
+    window.scrollTo({top:0,behavior:'smooth'});
+  }
+  function pick(){ /* mock: file already shown as chosen */ }
+  function go(){
+    show('s2');
+    const isES = document.body.classList.contains('es-on');
+    const steps = isES ? [
+      'Juntando los tacos que están escritos de varias maneras…',
+      'Emparejando los teléfonos y las fechas…',
+      'Buscando blancos disfrazados como "N/A" o una raya…',
+      'Apartando lo que mejor le pregunto a usted…'
+    ] : [
+      'Gathering the items written a few different ways…',
+      'Matching up the phone numbers and dates…',
+      'Looking for blanks disguised as "N/A" or a dash…',
+      'Setting aside the things I should ask you about…'
+    ];
+    let i=0;
+    const el = document.getElementById('step');
+    const t = setInterval(()=>{ i++; if(i<steps.length){ el.textContent = steps[i]; } }, 1050);
+    setTimeout(()=>{ clearInterval(t); show('s3'); }, 4400);
+  }
+  function answer(btn){
+    const card = btn.closest('.chg');
+    card.querySelector('.askbtns').style.display='none';
+    card.querySelector('.answered').style.display='flex';
+  }
+  function reset(){ show('s1'); }
+</script>
+</body>
+</html>

design/mockups/helper/index.html ADDED Viewed

	@@ -0,0 +1,517 @@

+<!DOCTYPE html>
+<html lang="es">
+<head>
+<meta charset="UTF-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1.0" />
+<title>ScrubData — tu ayudante de listas</title>
+<style>
+  :root{
+    --paper:#fbf4e7;
+    --paper-2:#f4e9d4;
+    --card:#fffdf8;
+    --ink:#4a3b2e;
+    --ink-soft:#6f5d49;
+    --line:#e6d6b8;
+    --accent:#e07a3f;      /* warm terracotta */
+    --accent-soft:#f6c89a;
+    --leaf:#6e8a5a;        /* trail green */
+    --leaf-soft:#dfe7cf;
+    --sky:#8fb0c4;
+    --shadow:0 10px 28px rgba(120,90,50,.14);
+    --shadow-sm:0 4px 12px rgba(120,90,50,.10);
+    --radius:22px;
+    --font: "Segoe UI", "Helvetica Neue", system-ui, -apple-system, "Trebuchet MS", sans-serif;
+  }
+  *{box-sizing:border-box;}
+  html,body{margin:0;padding:0;}
+  body{
+    font-family:var(--font);
+    color:var(--ink);
+    background:
+      radial-gradient(1200px 600px at 80% -10%, #fdf6e8 0%, rgba(253,246,232,0) 60%),
+      radial-gradient(900px 500px at 0% 100%, #f6ecd6 0%, rgba(246,236,214,0) 55%),
+      var(--paper);
+    -webkit-font-smoothing:antialiased;
+    line-height:1.5;
+    min-height:100vh;
+  }
+  /* tiny hand-drawn paper texture via repeating soft dots */
+  body::before{
+    content:"";position:fixed;inset:0;pointer-events:none;z-index:0;
+    background-image:radial-gradient(rgba(180,150,100,.06) 1px, transparent 1px);
+    background-size:22px 22px;
+  }
+  .wrap{position:relative;z-index:1;max-width:880px;margin:0 auto;padding:26px 20px 80px;}
+  /* ---- top bar ---- */
+  .topbar{display:flex;align-items:center;justify-content:space-between;margin-bottom:18px;}
+  .brand{display:flex;align-items:center;gap:11px;font-weight:800;font-size:20px;letter-spacing:.2px;}
+  .brand .logo{
+    width:40px;height:40px;border-radius:14px;
+    background:linear-gradient(150deg,var(--accent),#f0a05f);
+    display:grid;place-items:center;color:#fff;font-size:20px;
+    box-shadow:var(--shadow-sm);transform:rotate(-4deg);
+  }
+  .brand small{display:block;font-weight:600;font-size:12px;color:var(--ink-soft);letter-spacing:0;}
+  .lang{
+    display:flex;background:var(--card);border:1.5px solid var(--line);
+    border-radius:999px;padding:4px;box-shadow:var(--shadow-sm);font-weight:700;font-size:13px;
+  }
+  .lang button{
+    border:0;background:transparent;color:var(--ink-soft);
+    padding:6px 14px;border-radius:999px;cursor:pointer;font:inherit;font-weight:700;
+  }
+  .lang button.on{background:var(--accent);color:#fff;}
+  /* ---- persistent safety ribbon ---- */
+  .safety{
+    display:flex;align-items:center;gap:10px;
+    background:var(--leaf-soft);color:#41522f;
+    border:1.5px solid #cdd9bb;border-radius:999px;
+    padding:9px 16px;font-size:14px;font-weight:600;margin-bottom:24px;
+    box-shadow:var(--shadow-sm);
+  }
+  .safety .dot{font-size:16px;}
+  /* ---- card base ---- */
+  .card{
+    background:var(--card);border:1.5px solid var(--line);
+    border-radius:var(--radius);box-shadow:var(--shadow);
+    padding:30px;margin-bottom:22px;
+  }
+  h1{font-size:30px;margin:.1em 0 .25em;line-height:1.2;}
+  h2{font-size:22px;margin:.1em 0 .5em;}
+  .lead{font-size:18px;color:var(--ink-soft);margin:0 0 6px;}
+  /* ---- screen toggling ---- */
+  .screen{display:none;}
+  .screen.active{display:block;animation:fade .5s ease;}
+  @keyframes fade{from{opacity:0;transform:translateY(8px);}to{opacity:1;transform:none;}}
+  /* ---- step pills ---- */
+  .steps{display:flex;gap:8px;justify-content:center;margin-bottom:20px;flex-wrap:wrap;}
+  .steps .pill{
+    font-size:12.5px;font-weight:700;color:var(--ink-soft);
+    background:var(--paper-2);border:1.5px solid var(--line);
+    padding:6px 13px;border-radius:999px;cursor:pointer;transition:.2s;
+  }
+  .steps .pill.on{background:var(--accent);color:#fff;border-color:var(--accent);}
+  /* ---- drop zone ---- */
+  .drop{
+    border:2.5px dashed var(--accent-soft);border-radius:26px;
+    background:linear-gradient(180deg,#fffdf7,#fdf3e2);
+    padding:46px 24px;text-align:center;cursor:pointer;transition:.2s;
+  }
+  .drop:hover{border-color:var(--accent);transform:translateY(-2px);box-shadow:var(--shadow);}
+  .drop .big{font-size:54px;line-height:1;margin-bottom:10px;}
+  .drop .title{font-size:21px;font-weight:800;margin-bottom:4px;}
+  .drop .sub{color:var(--ink-soft);font-size:15px;}
+  .file-chip{
+    display:inline-flex;align-items:center;gap:9px;margin-top:18px;
+    background:var(--leaf-soft);border:1.5px solid #cdd9bb;border-radius:14px;
+    padding:9px 15px;font-weight:700;font-size:14.5px;color:#41522f;
+  }
+  /* ---- big friendly button ---- */
+  .btn{
+    border:0;cursor:pointer;font:inherit;font-weight:800;font-size:18px;
+    background:linear-gradient(150deg,var(--accent),#ef9a55);color:#fff;
+    padding:16px 30px;border-radius:18px;box-shadow:0 8px 18px rgba(224,122,63,.30);
+    transition:.15s;display:inline-flex;align-items:center;gap:10px;
+  }
+  .btn:hover{transform:translateY(-2px);box-shadow:0 12px 22px rgba(224,122,63,.38);}
+  .btn.ghost{
+    background:var(--card);color:var(--ink);border:1.5px solid var(--line);
+    box-shadow:var(--shadow-sm);font-size:15px;padding:12px 20px;
+  }
+  .btn.ghost:hover{box-shadow:var(--shadow-sm);}
+  .center{text-align:center;}
+  .mt{margin-top:22px;}
+  /* ---- working state ---- */
+  .work{text-align:center;padding:20px 10px 6px;}
+  .pot{font-size:64px;display:inline-block;animation:stir 1.6s ease-in-out infinite;}
+  @keyframes stir{0%,100%{transform:rotate(-6deg);}50%{transform:rotate(6deg);}}
+  .progress{height:14px;background:var(--paper-2);border-radius:999px;overflow:hidden;margin:22px auto;max-width:430px;border:1.5px solid var(--line);}
+  .progress > i{display:block;height:100%;width:0;background:linear-gradient(90deg,var(--accent),var(--leaf));border-radius:999px;animation:fill 3.4s ease forwards;}
+  @keyframes fill{to{width:100%;}}
+  .work-note{color:var(--ink-soft);font-size:15px;min-height:22px;}
+  /* ---- summary hero ---- */
+  .badge-row{display:flex;align-items:center;gap:16px;flex-wrap:wrap;margin-bottom:6px;}
+  .merit{
+    width:78px;height:78px;flex:none;border-radius:50%;
+    background:radial-gradient(circle at 50% 35%,#fbe2c2,#f0b277);
+    border:3px dashed #d98b4e;display:grid;place-items:center;
+    color:#7a4a1f;font-size:30px;box-shadow:var(--shadow-sm);transform:rotate(-5deg);
+  }
+  .summary-list{margin:18px 0 4px;padding:0;list-style:none;display:grid;gap:12px;}
+  .summary-list li{
+    display:flex;gap:13px;align-items:flex-start;font-size:16.5px;
+    background:var(--paper);border:1.5px solid var(--line);border-radius:16px;padding:13px 16px;
+  }
+  .summary-list .ic{font-size:22px;flex:none;line-height:1.2;}
+  .summary-list b{color:var(--ink);}
+  /* ---- change cards (before/after) ---- */
+  .change{
+    border:1.5px solid var(--line);border-radius:18px;background:var(--card);
+    padding:18px 18px 16px;margin-bottom:16px;box-shadow:var(--shadow-sm);
+  }
+  .change .head{font-weight:800;font-size:17px;margin-bottom:4px;display:flex;align-items:center;gap:9px;}
+  .change .say{color:var(--ink-soft);font-size:14.5px;margin-bottom:14px;}
+  .ba{display:grid;grid-template-columns:1fr auto 1fr;gap:12px;align-items:center;}
+  .ba .col{background:var(--paper);border:1.5px solid var(--line);border-radius:14px;padding:12px 14px;}
+  .ba .lab{font-size:11.5px;font-weight:800;letter-spacing:.5px;text-transform:uppercase;color:var(--ink-soft);margin-bottom:7px;}
+  .ba .col.after{background:var(--leaf-soft);border-color:#cdd9bb;}
+  .ba .row{font-size:15px;padding:3px 0;color:var(--ink);}
+  .ba .row.dim{color:#a98f6e;}
+  .ba .arrow{font-size:26px;color:var(--accent);text-align:center;}
+  /* gentle confirm card */
+  .ask{
+    border:1.5px solid var(--accent-soft);background:linear-gradient(180deg,#fffaf2,#fdf1e0);
+    border-radius:18px;padding:18px;margin-bottom:16px;box-shadow:var(--shadow-sm);
+  }
+  .ask .q{font-weight:800;font-size:17px;margin-bottom:5px;display:flex;gap:9px;align-items:center;}
+  .ask .detail{color:var(--ink-soft);font-size:14.5px;margin-bottom:14px;}
+  .ask .actions{display:flex;gap:10px;flex-wrap:wrap;}
+  .yes{background:var(--leaf);color:#fff;border:0;font-weight:800;border-radius:13px;padding:11px 20px;cursor:pointer;font:inherit;font-weight:800;}
+  .no{background:var(--card);color:var(--ink);border:1.5px solid var(--line);font-weight:700;border-radius:13px;padding:11px 20px;cursor:pointer;font:inherit;font-weight:700;}
+  .answered{font-weight:800;color:var(--leaf);font-size:15px;display:none;align-items:center;gap:8px;margin-top:4px;}
+  /* honest flags */
+  .flags{background:#fcf6ea;border:1.5px dashed #e0c9a0;border-radius:18px;padding:18px;margin-bottom:16px;}
+  .flags .q{font-weight:800;font-size:16.5px;margin-bottom:8px;display:flex;gap:9px;align-items:center;}
+  .flags ul{margin:6px 0 0;padding-left:4px;list-style:none;}
+  .flags li{font-size:14.5px;color:var(--ink-soft);padding:6px 0;border-top:1px dashed #e7d6b6;}
+  .flags li:first-child{border-top:0;}
+  /* bonus card */
+  .bonus{
+    background:linear-gradient(150deg,#eef3e3,#e3ecd2);border:1.5px solid #cdd9bb;
+    border-radius:18px;padding:20px;margin-bottom:16px;display:flex;gap:15px;align-items:center;
+  }
+  .bonus .em{font-size:42px;flex:none;}
+  .bonus .t{font-weight:800;font-size:17px;color:#3f5230;margin-bottom:3px;}
+  .bonus .d{color:#4f6240;font-size:14.5px;}
+  /* download band */
+  .download{
+    text-align:center;background:linear-gradient(180deg,#fffdf7,#fdf2e1);
+    border:1.5px solid var(--line);border-radius:20px;padding:26px 20px;margin-bottom:8px;
+  }
+  .download .small{color:var(--ink-soft);font-size:13.5px;margin-top:12px;}
+  .section-title{font-size:14px;font-weight:800;letter-spacing:.6px;text-transform:uppercase;color:var(--ink-soft);margin:26px 4px 12px;}
+  .footnote{text-align:center;color:var(--ink-soft);font-size:13px;margin-top:30px;}
+  @media(max-width:560px){
+    .ba{grid-template-columns:1fr;}
+    .ba .arrow{transform:rotate(90deg);}
+    h1{font-size:25px;}
+  }
+</style>
+</head>
+<body>
+<div class="wrap">
+  <!-- TOP BAR -->
+  <div class="topbar">
+    <div class="brand">
+      <span class="logo">🧺</span>
+      <span>ScrubData<small data-es="tu ayudante de listas" data-en="your list helper">tu ayudante de listas</small></span>
+    </div>
+    <div class="lang">
+      <button class="on" onclick="setLang('es',this)">Español</button>
+      <button onclick="setLang('en',this)">English</button>
+    </div>
+  </div>
+  <!-- PERSISTENT SAFETY RIBBON -->
+  <div class="safety">
+    <span class="dot">🌿</span>
+    <span data-es="Tu archivo original no se toca. Nada sale de esta computadora. Siempre puedes dejarlo como estaba."
+          data-en="Your original file stays exactly as it is. Nothing leaves this computer. You can always put it back the way it was.">
+      Tu archivo original no se toca. Nada sale de esta computadora. Siempre puedes dejarlo como estaba.
+    </span>
+  </div>
+  <!-- STEP PILLS (let reviewer walk the arc) -->
+  <div class="steps">
+    <span class="pill on" onclick="go(0,this)" data-es="1 · Bienvenida" data-en="1 · Welcome">1 · Bienvenida</span>
+    <span class="pill" onclick="go(1,this)" data-es="2 · Acomodando" data-en="2 · Tidying">2 · Acomodando</span>
+    <span class="pill" onclick="go(2,this)" data-es="3 · Lo que encontré" data-en="3 · What I found">3 · Lo que encontré</span>
+  </div>
+  <!-- ============ SCREEN 1 — WELCOME + DROP ============ -->
+  <section class="screen active" id="s0">
+    <div class="card">
+      <h1 data-es="Hola, Lupita. Vamos a poner tu lista bonita. 🌼"
+          data-en="Hi, Lupita. Let's make your list nice and tidy. 🌼">
+        Hola, Lupita. Vamos a poner tu lista bonita. 🌼
+      </h1>
+      <p class="lead" data-es="Sube tu archivo y yo lo reviso contigo, despacito. Sin botones raros, sin configurar nada."
+         data-en="Drop your file and I'll look through it with you, nice and slow. No strange buttons, nothing to set up.">
+        Sube tu archivo y yo lo reviso contigo, despacito. Sin botones raros, sin configurar nada.
+      </p>
+      <div class="drop" onclick="go(1)">
+        <div class="big">📂</div>
+        <div class="title" data-es="Suelta tu archivo aquí — yo le echo un ojo."
+             data-en="Drop your file here — I'll take a look.">Suelta tu archivo aquí — yo le echo un ojo.</div>
+        <div class="sub" data-es="Excel o CSV está bien. Tu original se queda igualito."
+             data-en="Excel or CSV is fine. Your original stays exactly as it is.">Excel o CSV está bien. Tu original se queda igualito.</div>
+        <div class="file-chip">📄 resumen-del-mes-mayo.xlsx</div>
+      </div>
+      <div class="center mt">
+        <button class="btn" onclick="go(1)">
+          <span>🧽</span><span data-es="Acomódalo por mí" data-en="Clean it up">Acomódalo por mí</span>
+        </button>
+      </div>
+    </div>
+  </section>
+  <!-- ============ SCREEN 2 — WORKING ============ -->
+  <section class="screen" id="s1">
+    <div class="card work">
+      <div class="pot">🍲</div>
+      <h2 data-es="Estoy acomodando tu lista…" data-en="I'm tidying your list…">Estoy acomodando tu lista…</h2>
+      <div class="progress"><i></i></div>
+      <p class="work-note" id="workNote"
+         data-es="Trabajando aquí mismo, en tu computadora. Tu original está a salvo."
+         data-en="Working right here on your computer. Your original is safe.">
+        Trabajando aquí mismo, en tu computadora. Tu original está a salvo.
+      </p>
+      <div class="center mt">
+        <button class="btn ghost" onclick="go(2)" data-es="Ver lo que encontré →" data-en="See what I found →">Ver lo que encontré →</button>
+      </div>
+    </div>
+  </section>
+  <!-- ============ SCREEN 3 — RESULT ============ -->
+  <section class="screen" id="s2">
+    <!-- SUMMARY HERO -->
+    <div class="card">
+      <div class="badge-row">
+        <div class="merit">🏅</div>
+        <div>
+          <h1 style="margin:0" data-es="¡Listo! Tu lista quedó bien bonita."
+              data-en="All done! Your list is in great shape.">¡Listo! Tu lista quedó bien bonita.</h1>
+          <p class="lead" style="margin:2px 0 0" data-es="Esto fue lo que acomodé por ti — léelo en voz alta a Yolanda si quieres."
+             data-en="Here's what I tidied for you — read it out loud to Yolanda if you like.">
+            Esto fue lo que acomodé por ti — léelo en voz alta a Yolanda si quieres.
+          </p>
+        </div>
+      </div>
+      <ul class="summary-list">
+        <li><span class="ic">🌮</span><span data-es="<b>“Al pastor”</b> estaba escrito de 4 maneras. Los junté: <b>1,204 vendidos</b> en mayo."
+            data-en="<b>“Al pastor”</b> was written 4 different ways. I counted them together: <b>1,204 sold</b> in May.">
+            <b>“Al pastor”</b> estaba escrito de 4 maneras. Los junté: <b>1,204 vendidos</b> en mayo.</span></li>
+        <li><span class="ic">👥</span><span data-es="<b>3 clientes</b> aparecían dos veces. Los reuní para que los revises."
+            data-en="<b>3 customers</b> showed up twice. I gathered each one for you to check.">
+            <b>3 clientes</b> aparecían dos veces. Los reuní para que los revises.</span></li>
+        <li><span class="ic">📞</span><span data-es="Acomodé <b>todos los teléfonos</b> para que se lean igualito."
+            data-en="I made <b>all the phone numbers</b> match so they're easy to read.">
+            Acomodé <b>todos los teléfonos</b> para que se lean igualito.</span></li>
+        <li><span class="ic">🗓️</span><span data-es="Puse <b>todas las fechas</b> escritas de la misma forma."
+            data-en="I made <b>all the dates</b> written the same way.">
+            Puse <b>todas las fechas</b> escritas de la misma forma.</span></li>
+        <li><span class="ic">⬜</span><span data-es="Algunos espacios decían “N/A” o un guion. Los dejé como <b>vacíos</b>."
+            data-en="Some spots said “N/A” or just a dash. I treated those as <b>empty</b>.">
+            Algunos espacios decían “N/A” o un guion. Los dejé como <b>vacíos</b>.</span></li>
+      </ul>
+    </div>
+    <!-- CHANGE CARDS (story, not diff) -->
+    <div class="section-title" data-es="Aquí está lo que cambió — antes y después" data-en="Here's what changed — before and after">
+      Aquí está lo que cambió — antes y después
+    </div>
+    <div class="change">
+      <div class="head">🌮 <span data-es="El mismo taco, escrito de varias maneras" data-en="The same taco, written a few ways">El mismo taco, escrito de varias maneras</span></div>
+      <div class="say" data-es="La computadora por fin entiende que es el mismo taco. Los conté juntos."
+           data-en="The computer finally understands it's the same taco. I counted them together.">
+        La computadora por fin entiende que es el mismo taco. Los conté juntos.</div>
+      <div class="ba">
+        <div class="col">
+          <div class="lab" data-es="Antes" data-en="Before">Antes</div>
+          <div class="row dim">al pastor</div>
+          <div class="row dim">Al Pastor</div>
+          <div class="row dim">pastor</div>
+          <div class="row dim">tacos al pastor</div>
+        </div>
+        <div class="arrow">→</div>
+        <div class="col after">
+          <div class="lab" data-es="Después" data-en="After">Después</div>
+          <div class="row"><b>Al pastor</b></div>
+          <div class="row" data-es="1,204 vendidos" data-en="1,204 sold">1,204 vendidos</div>
+        </div>
+      </div>
+    </div>
+    <div class="change">
+      <div class="head">📞 <span data-es="Los teléfonos, todos parejitos" data-en="Phone numbers, all matching">Los teléfonos, todos parejitos</span></div>
+      <div class="say" data-es="Los dejé escritos igual para que sean fáciles de leer y marcar."
+           data-en="I made them all match so they're easy to read and dial.">
+        Los dejé escritos igual para que sean fáciles de leer y marcar.</div>
+      <div class="ba">
+        <div class="col">
+          <div class="lab" data-es="Antes" data-en="Before">Antes</div>
+          <div class="row dim">55-1234.5678</div>
+          <div class="row dim">(55) 12345678</div>
+          <div class="row dim">5512345678</div>
+        </div>
+        <div class="arrow">→</div>
+        <div class="col after">
+          <div class="lab" data-es="Después" data-en="After">Después</div>
+          <div class="row"><b>55 1234 5678</b></div>
+        </div>
+      </div>
+    </div>
+    <!-- GENTLE CONFIRM — money -->
+    <div class="ask" id="ask1">
+      <div class="q">💵 <span data-es="¿Dejo fuera del total las filas de $0.00?" data-en="Leave the $0.00 rows out of the total?">¿Dejo fuera del total las filas de $0.00?</span></div>
+      <div class="detail" data-es="Encontré <b>31 filas que marcan $0.00</b>. Eso parece un error del sistema, no una venta. Tú decides — yo no toco el dinero sin preguntarte."
+           data-en="I found <b>31 rows showing $0.00</b>. That looks like a glitch, not a sale. You decide — I won't touch money without asking.">
+        Encontré <b>31 filas que marcan $0.00</b>. Eso parece un error del sistema, no una venta. Tú decides — yo no toco el dinero sin preguntarte.</div>
+      <div class="actions">
+        <button class="yes" onclick="answer('ask1')" data-es="Sí, déjalas fuera" data-en="Yes, leave them out">Sí, déjalas fuera</button>
+        <button class="no" onclick="answer('ask1')" data-es="No, déjalas" data-en="No, keep them">No, déjalas</button>
+      </div>
+      <div class="answered" id="ans-ask1">✓ <span data-es="Anotado. Tú mandas." data-en="Got it. You're in charge.">Anotado. Tú mandas.</span></div>
+    </div>
+    <!-- GENTLE CONFIRM — duplicates -->
+    <div class="ask" id="ask2">
+      <div class="q">👥 <span data-es="¿Estos dos son la misma persona?" data-en="Are these two the same person?">¿Estos dos son la misma persona?</span></div>
+      <div class="detail" data-es="<b>“Yolanda Pérez”</b> y <b>“Yola Perez”</b> tienen el mismo teléfono. ¿Los junto en uno solo?"
+           data-en="<b>“Yolanda Pérez”</b> and <b>“Yola Perez”</b> share the same phone. Shall I count them as one?">
+        <b>“Yolanda Pérez”</b> y <b>“Yola Perez”</b> tienen el mismo teléfono. ¿Los junto en uno solo?</div>
+      <div class="actions">
+        <button class="yes" onclick="answer('ask2')" data-es="Sí, es la misma" data-en="Yes, same person">Sí, es la misma</button>
+        <button class="no" onclick="answer('ask2')" data-es="No, déjalas aparte" data-en="No, keep separate">No, déjalas aparte</button>
+      </div>
+      <div class="answered" id="ans-ask2">✓ <span data-es="Listo, como tú digas." data-en="Done, as you say.">Listo, como tú digas.</span></div>
+    </div>
+    <!-- HONEST FLAGS -->
+    <div class="flags">
+      <div class="q">🤔 <span data-es="De estas no estuve segura — te las dejé para que las veas" data-en="I wasn't sure about these — I left them for you">De estas no estuve segura — te las dejé para que las veas</span></div>
+      <ul>
+        <li data-es="Dos teléfonos tienen muy pocos números. No los cambié por si tú los conoces."
+            data-en="Two phone numbers have too few digits. I didn't change them in case you know them.">
+            Dos teléfonos tienen muy pocos números. No los cambié por si tú los conoces.</li>
+        <li data-es="Las notas del catering (“fiesta Sra. Mendoza”) no las entendí bien. Las dejé tal cual."
+            data-en="The catering notes (“Mrs. Mendoza's party”) I didn't quite understand. I left them as they were.">
+            Las notas del catering (“fiesta Sra. Mendoza”) no las entendí bien. Las dejé tal cual.</li>
+        <li data-es="El total de mayo y la suma de las filas no cuadran por $84. Aquí te lo marco para que lo cheques con tu caja."
+            data-en="May's total and the rows add up $84 apart. I'm flagging it so you can check it against your cash.">
+            El total de mayo y la suma de las filas no cuadran por $84. Aquí te lo marco para que lo cheques con tu caja.</li>
+      </ul>
+    </div>
+    <!-- BONUS CARD -->
+    <div class="bonus">
+      <div class="em">🫙</div>
+      <div>
+        <div class="t" data-es="De pasada: se te está acabando el adobo de pastor"
+             data-en="By the way: you're running low on pastor marinade">De pasada: se te está acabando el adobo de pastor</div>
+        <div class="d" data-es="Con lo que vendiste, te alcanza para unos 6 días. Buen momento para pedir más."
+             data-en="At this pace you have about 6 days left. Good time to reorder.">
+            Con lo que vendiste, te alcanza para unos 6 días. Buen momento para pedir más.</div>
+      </div>
+    </div>
+    <!-- DOWNLOAD BAND -->
+    <div class="download">
+      <button class="btn" onclick="return false">
+        <span>💾</span><span data-es="Dame mi copia limpia" data-en="Get my clean copy">Dame mi copia limpia</span>
+      </button>
+      <div style="margin-top:14px;">
+        <button class="btn ghost" onclick="return false" data-es="🖨️ Imprimir el resumen en palabras sencillas" data-en="🖨️ Print the plain-words summary">
+          🖨️ Imprimir el resumen en palabras sencillas</button>
+      </div>
+      <div class="small" data-es="Tu original (resumen-del-mes-mayo.xlsx) sigue intacto. Esto es una copia nueva."
+           data-en="Your original (resumen-del-mes-mayo.xlsx) is untouched. This is a fresh new copy.">
+        Tu original (resumen-del-mes-mayo.xlsx) sigue intacto. Esto es una copia nueva.</div>
+    </div>
+    <!-- REVERSIBILITY -->
+    <div class="center mt">
+      <button class="btn ghost" onclick="return false" data-es="↩️ Mejor déjalo como estaba" data-en="↩️ Put it back the way it was">↩️ Mejor déjalo como estaba</button>
+    </div>
+    <div class="footnote" data-es="Lo hiciste tú misma, y está bien. 🌙 Buenas noches, Lupita."
+         data-en="You did it yourself, and it's right. 🌙 Goodnight, Lupita.">
+      Lo hiciste tú misma, y está bien. 🌙 Buenas noches, Lupita.
+    </div>
+  </section>
+</div>
+<script>
+  var screens = ['s0','s1','s2'];
+  var pills = document.querySelectorAll('.steps .pill');
+  function go(i, el){
+    screens.forEach(function(id,n){
+      document.getElementById(id).classList.toggle('active', n===i);
+    });
+    pills.forEach(function(p,n){ p.classList.toggle('on', n===i); });
+    window.scrollTo({top:0,behavior:'smooth'});
+    if(i===1){ runWork(); }
+  }
+  // working state: cycle reassuring notes, then auto-advance
+  var workTimers = [];
+  function runWork(){
+    workTimers.forEach(clearTimeout); workTimers = [];
+    var note = document.getElementById('workNote');
+    var es = [
+      "Trabajando aquí mismo, en tu computadora. Tu original está a salvo.",
+      "Estoy juntando los tacos que están escritos de varias maneras…",
+      "Acomodando teléfonos y fechas para que se lean igualito…",
+      "Casi listo — guardando una copia nueva, sin tocar tu original."
+    ];
+    var en = [
+      "Working right here on your computer. Your original is safe.",
+      "Gathering the tacos that are written a few different ways…",
+      "Tidying phone numbers and dates so they're easy to read…",
+      "Almost there — saving a fresh copy, leaving your original untouched."
+    ];
+    var k = (lang==='es') ? es : en;
+    var step = 0;
+    note.textContent = k[0];
+    for(var s=1;s<k.length;s++){
+      (function(s){ workTimers.push(setTimeout(function(){ note.textContent = k[s]; }, s*900)); })(s);
+    }
+    workTimers.push(setTimeout(function(){ if(document.getElementById('s1').classList.contains('active')) go(2); }, 3700));
+  }
+  function answer(id){
+    var card = document.getElementById(id);
+    card.querySelector('.actions').style.display='none';
+    document.getElementById('ans-'+id).style.display='flex';
+  }
+  // language toggle
+  var lang = 'es';
+  function setLang(l, el){
+    lang = l;
+    document.querySelectorAll('.lang button').forEach(function(b){b.classList.remove('on');});
+    el.classList.add('on');
+    document.documentElement.lang = l;
+    document.querySelectorAll('[data-es]').forEach(function(node){
+      var v = node.getAttribute('data-'+l);
+      if(v!=null) node.innerHTML = v;
+    });
+  }
+</script>
+</body>
+</html>

design/mockups/office/index.html ADDED Viewed

	@@ -0,0 +1,219 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>ScrubData — clean spreadsheets, with the receipts</title>
+<style>
+  :root{
+    --paper:#faf7f2; --card:#fffdfa; --ink:#23201c; --ink-soft:#6b6359;
+    --line:#ece5da; --accent:#2f6f5e; --accent-soft:#e7f1ec;
+    --done:#3f7d5f; --done-bg:#eef5ef; --done-line:#cfe3d4;
+    --call:#b06a1f; --call-bg:#fbf1e2; --call-line:#f0dcbf;
+    --flag:#7a7367; --flag-bg:#f3efe8;
+    --shadow:0 1px 2px rgba(40,30,20,.04),0 8px 24px rgba(40,30,20,.06);
+    --r:15px;
+  }
+  *{box-sizing:border-box}
+  body{margin:0;background:var(--paper);color:var(--ink);
+    font-family:Inter,-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,sans-serif;
+    line-height:1.5;-webkit-font-smoothing:antialiased}
+  .wrap{max-width:760px;margin:0 auto;padding:0 22px}
+  a{color:var(--accent)}
+  /* privacy ribbon */
+  .ribbon{background:var(--accent-soft);color:#234e42;font-size:13.5px;
+    text-align:center;padding:9px 16px;border-bottom:1px solid #d6e7df}
+  .ribbon b{font-weight:600}
+  /* header */
+  header{padding:40px 0 8px}
+  .logo{display:flex;align-items:center;gap:9px;font-weight:700;font-size:18px;letter-spacing:-.2px}
+  .logo .mark{width:26px;height:26px;border-radius:8px;background:var(--accent);
+    display:grid;place-items:center;color:#fff;font-size:15px}
+  h1{font-size:30px;line-height:1.15;letter-spacing:-.6px;margin:22px 0 8px;font-weight:740}
+  .sub{color:var(--ink-soft);font-size:16.5px;max-width:560px}
+  /* file chip */
+  .filebar{display:flex;align-items:center;gap:12px;margin:26px 0 6px;
+    background:var(--card);border:1px solid var(--line);border-radius:var(--r);
+    padding:14px 16px;box-shadow:var(--shadow)}
+  .fileicon{width:34px;height:34px;border-radius:9px;background:#eef4f1;color:var(--accent);
+    display:grid;place-items:center;font-size:16px;flex:none}
+  .filebar .nm{font-weight:600}
+  .filebar .meta{color:var(--ink-soft);font-size:13.5px}
+  .filebar .spacer{flex:1}
+  .pill-done-mini{font-size:12px;font-weight:600;color:var(--done);
+    background:var(--done-bg);border:1px solid var(--done-line);padding:3px 9px;border-radius:20px}
+  /* summary */
+  section{margin:34px 0}
+  .eyebrow{font-size:12.5px;font-weight:700;letter-spacing:.06em;text-transform:uppercase;
+    color:var(--ink-soft);margin-bottom:13px}
+  .result-h{font-size:22px;font-weight:720;letter-spacing:-.3px;margin:0 0 4px}
+  .result-sub{color:var(--ink-soft);margin:0 0 4px}
+  .summary{background:var(--card);border:1px solid var(--line);border-radius:var(--r);
+    padding:6px 20px;box-shadow:var(--shadow)}
+  .summary li{list-style:none;padding:14px 0;border-bottom:1px solid var(--line);
+    display:flex;gap:13px;align-items:flex-start;font-size:15.5px}
+  .summary li:last-child{border-bottom:0}
+  .summary .ic{flex:none;margin-top:1px;font-size:16px}
+  .summary b{font-weight:650}
+  .handoff{color:var(--call)}
+  /* change cards */
+  .card{background:var(--card);border:1px solid var(--line);border-left-width:4px;
+    border-radius:var(--r);padding:17px 19px;margin:13px 0;box-shadow:var(--shadow)}
+  .card.done{border-left-color:var(--done)}
+  .card.call{border-left-color:var(--call)}
+  .card.flag{border-left-color:#cdbfa6}
+  .card-top{display:flex;align-items:center;gap:10px;margin-bottom:4px}
+  .card-title{font-weight:650;font-size:15.5px}
+  .pill{font-size:11.5px;font-weight:700;letter-spacing:.04em;padding:3px 9px;border-radius:20px;margin-left:auto;flex:none}
+  .pill.done{color:var(--done);background:var(--done-bg);border:1px solid var(--done-line)}
+  .pill.call{color:var(--call);background:var(--call-bg);border:1px solid var(--call-line)}
+  .pill.flag{color:var(--flag);background:var(--flag-bg);border:1px solid #e2d9c9}
+  .card-body{color:var(--ink-soft);font-size:14.5px}
+  /* before/after */
+  .ba{display:grid;grid-template-columns:1fr auto 1fr;gap:10px;align-items:center;margin-top:13px}
+  .ba .col{background:#fbf9f5;border:1px solid var(--line);border-radius:11px;padding:11px 13px}
+  .ba .lab{font-size:11px;text-transform:uppercase;letter-spacing:.05em;color:var(--ink-soft);margin-bottom:6px}
+  .ba .val{font-size:13.5px;font-family:"SF Mono",ui-monospace,Menlo,monospace}
+  .ba .was{color:#9a8d7c}
+  .ba .arrow{color:var(--accent);font-size:18px;text-align:center}
+  .ba .ann{color:var(--done);font-weight:600;font-size:12.5px}
+  .strike{text-decoration:line-through;text-decoration-color:#c9bcab;color:#9a8d7c}
+  /* your-call buttons */
+  .actions{display:flex;gap:9px;margin-top:14px}
+  .btn{font:inherit;font-size:14px;font-weight:600;padding:9px 15px;border-radius:10px;cursor:pointer;border:1px solid var(--line);background:#fff;color:var(--ink)}
+  .btn.primary{background:var(--accent);border-color:var(--accent);color:#fff}
+  .btn.ghost{background:transparent}
+  /* download */
+  .download{background:linear-gradient(180deg,#fffdfa,#f7f2ea);border:1px solid var(--line);
+    border-radius:18px;padding:26px;text-align:center;box-shadow:var(--shadow)}
+  .download h3{margin:0 0 4px;font-size:19px;font-weight:720}
+  .download p{margin:0 0 18px;color:var(--ink-soft);font-size:14.5px}
+  .dl-row{display:flex;gap:11px;justify-content:center;flex-wrap:wrap}
+  .btn.big{padding:12px 22px;font-size:15px}
+  .revert{margin-top:16px;font-size:13px;color:var(--ink-soft)}
+  footer{padding:30px 0 50px;text-align:center;color:#9a8d7c;font-size:13px;border-top:1px solid var(--line);margin-top:36px}
+  .restart{display:inline-block;margin-top:22px;font-size:14px;color:var(--accent);font-weight:600;text-decoration:none}
+</style>
+</head>
+<body>
+<div class="ribbon">🔒 <b>Runs entirely on your machine.</b> Your original file is untouched — nothing is uploaded.</div>
+<div class="wrap">
+  <header>
+    <div class="logo"><span class="mark">✦</span> ScrubData</div>
+    <h1>Done. Here's what changed.</h1>
+    <p class="sub">I did the tedious part — matching spellings, fixing formats, finding the blanks. Everything below is reversible, and I left the judgment calls for you.</p>
+  </header>
+  <div class="filebar">
+    <div class="fileicon">▦</div>
+    <div>
+      <div class="nm">crm-export-may.csv</div>
+      <div class="meta">3,840 rows · 11 columns · cleaned in 4.2s, locally</div>
+    </div>
+    <div class="spacer"></div>
+    <div class="pill-done-mini">6 fixes applied</div>
+  </div>
+  <!-- SUMMARY -->
+  <section>
+    <div class="eyebrow">The summary, in plain English</div>
+    <ul class="summary">
+      <li><span class="ic">🗂️</span><div><b>Unified 4 spellings of "United States"</b> (US, U.S., usa, United States) into one. 2,108 rows affected.</div></li>
+      <li><span class="ic">🏷️</span><div><b>Merged 4 ways of writing the same deal stage</b> ("Closed Won", "closed-won", "Won", "CW") into one. 1,204 rows.</div></li>
+      <li><span class="ic">⬜</span><div><b>Treated 47 disguised blanks</b> ("N/A", "none", "—") as empty, so your counts and filters behave.</div></li>
+      <li><span class="ic">📅</span><div><b>Standardized all dates to YYYY-MM-DD</b> and phone numbers to one format.</div></li>
+      <li class="handoff"><span class="ic">✋</span><div><b>2 changes touch money or identity, so I didn't make them.</b> They're below for your call.</div></li>
+    </ul>
+  </section>
+  <!-- DONE -->
+  <section>
+    <div class="eyebrow">Handled — already applied (and reversible)</div>
+    <div class="card done">
+      <div class="card-top"><span class="card-title">Same country, counted as one</span><span class="pill done">DONE</span></div>
+      <div class="card-body">Four spellings were splitting your "United States" rows across the report.</div>
+      <div class="ba">
+        <div class="col"><div class="lab">Before</div>
+          <div class="val was">US · U.S. · usa<br>United States</div></div>
+        <div class="arrow">→</div>
+        <div class="col"><div class="lab">After</div>
+          <div class="val">United States</div><div class="ann">one value · 2,108 rows</div></div>
+      </div>
+    </div>
+    <div class="card done">
+      <div class="card-top"><span class="card-title">Phone numbers, one format</span><span class="pill done">DONE</span></div>
+      <div class="card-body">Mixed formats standardized so lookups and dedupes line up.</div>
+      <div class="ba">
+        <div class="col"><div class="lab">Before</div>
+          <div class="val was">(415) 555.0192<br>415-555-0147<br>+1 415 555 0188</div></div>
+        <div class="arrow">→</div>
+        <div class="col"><div class="lab">After</div>
+          <div class="val">(415) 555-0192<br>(415) 555-0147<br>(415) 555-0188</div></div>
+      </div>
+    </div>
+  </section>
+  <!-- YOUR CALL -->
+  <section>
+    <div class="eyebrow">Needs your call — I didn't touch these</div>
+    <div class="card call">
+      <div class="card-top"><span class="card-title">31 deals show $0.00</span><span class="pill call">YOUR CALL</span></div>
+      <div class="card-body">Usually a sync glitch, not a real deal. Leaving them in drags your win total down. Exclude them from the total?</div>
+      <div class="actions">
+        <button class="btn primary">Leave them out</button>
+        <button class="btn ghost">Keep them</button>
+      </div>
+    </div>
+    <div class="card call">
+      <div class="card-top"><span class="card-title">Possible duplicate contact</span><span class="pill call">YOUR CALL</span></div>
+      <div class="card-body">"Yolanda R." and "Yolanda Reyes" share an email (y.reyes@northwind.co). Count them as one contact?</div>
+      <div class="actions">
+        <button class="btn primary">Merge them</button>
+        <button class="btn ghost">Keep both</button>
+      </div>
+    </div>
+  </section>
+  <!-- FLAGGED -->
+  <section>
+    <div class="eyebrow">Worth a look — left exactly as they were</div>
+    <div class="card flag">
+      <div class="card-top"><span class="card-title">3 cells I wouldn't guess at</span><span class="pill flag">FLAGGED</span></div>
+      <div class="card-body">Two phone numbers have too few digits, and one note reads <span style="font-family:ui-monospace,monospace;font-size:13px">"follow up?? — check w/ Dana"</span>. I didn't guess. Left them untouched for you to check.</div>
+    </div>
+  </section>
+  <!-- DOWNLOAD -->
+  <section>
+    <div class="download">
+      <h3>Your clean copy is ready</h3>
+      <p>Take the cleaned file and the change log. Both are yours to keep.</p>
+      <div class="dl-row">
+        <button class="btn primary big">↓ Download clean file</button>
+        <button class="btn big">Export change log</button>
+      </div>
+      <div class="revert">Your original is untouched. Revert any change — or all of them — whenever you want.</div>
+    </div>
+    <div style="text-align:center"><a class="restart" href="#">← Clean another file</a></div>
+  </section>
+</div>
+<footer>Runs locally. Nothing leaves your machine, ever.</footer>
+</body>
+</html>

docs/DATASETS.md ADDED Viewed

	@@ -0,0 +1,57 @@

+# Dataset inventory — every source the system trains on, evaluates on, or must clean
+Stage-3 consolidated registry (2026-06-11). Assignment discipline: a source is
+TRAIN, EVAL, or BENCH — never both sides of train/eval.
+## Paired dirty/clean (27 — eval/paired_bench.py → docs/PAIRED_BENCH.md)
+| source | origin | license | assignment | notes |
+|---|---|---|---|---|
+| hospital, beers, movies_1 | Raha (BigDaMa) | Apache-2.0 | TRAIN | champion mix since v6 |
+| flights, rayyan | Raha | Apache-2.0 | EVAL (GEN) | held-out real errors |
+| tax | Raha | Apache-2.0 | unused | numeric-heavy, huge |
+| ed2_restaurants | BigDaMa ED2 | research | EVAL (GEN) | real NYC variants; errors past row 2k |
+| fodors_zagats | Magellan EM | BSD-ish data | TRAIN | variant-masked EM table |
+| dblp_acm, dblp_scholar | Magellan EM | research | BENCH only | out-of-regime (unique titles / convention-mismatch gold) |
+| cleanml_company, cleanml_movie | CleanML | research | TRAIN | Company = org canon |
+| gidcl_imdb | SICS-FRC GIDCL | none stated | TRAIN (v9+) | 1M-row pair; 57k errors; subset 86k rows |
+| zeroed_billionaire, zeroed_tax100k | WelkinNi/ZeroED | none stated | BENCH | injected; rich categoricals |
+| dgov_* (5 tables) | LUH-DBS Matelda | Apache-2.0 | BENCH | real data.gov tables, injected typos (6,692 more available) |
+| tt_* (8 tables) | ToughTables 2T_WD | CC-BY-4.0 | BENCH | gold-anchored entity misspellings, 370–33.5k corrections each |
+## Wild messy tables (35 — eval/wild_bench.py → docs/WILD_BENCH.md)
+24 portal tables (training/unpaired_sources.json cache: NYC/Chicago/SF/LA/Seattle/TX/WA
+portals, spotify, billboard, titanic, worldcities, airlines) + 12 stage-3 additions
+(training/harvest_wild.py): bx_books (mojibake), salary_survey, fec_indiv80 (PII,
+headerless), acnc_charities (AU), uk_price_paid (headerless UK), irs_eo1,
+glassdoor_jobs (multiline cells), paris_trees (FR), online_retail, bl_flickr_books,
+open_food_facts (211 cols), ct_real_estate. Backlog: CMS doctors (API 400), NHTSA
+FLAT_CMPL (multi-GB), Canada contracts (627MB).
+## Alias vocabularies (training generator material)
+| vocab | size | license | regime |
+|---|---|---|---|
+| toughtables_aliases | 49,629 | CC-BY-4.0 | real entity misspellings (gold-anchored) |
+| musicbrainz_hint_aliases | 34,017 | CC0 | community-recorded artist misspellings |
+| rxnorm_aliases | 17,701 | public domain | drug name synonyms |
+| ror_aliases | 73k orgs | CC0 | research orgs |
+| geonames_city_aliases | 80k cities | CC-BY | city aliases |
+| wikidata_company_aliases | 10.2k | CC0 | company aliases |
+| onet_jobtitle_aliases | 1,016 | CC-BY-4.0 | job titles |
+| nickname_aliases | 555 | Apache-2.0 | first names |
+| openflights_airports | 7,698 | ODbL/DbCL | airports reference |
+| libpostal_aliases | — | MIT | address abbreviations |
+## Measured conclusions that govern future widening
+1. Pre-paired corpus discovery is SATURATED (3 verified hunts) — synthesis from
+   vocabularies is the widening path.
+2. Pair volume / vocab training does NOT move held-out generalization (v7–v9, 4
+   retrains + tt-transfer test): the planner's value_counts cap (80) structurally
+   hides high-cardinality dirty cells. The unlock is architectural: error-suspect /
+   windowed profiling and cross-row entity voting.
+3. The deterministic side (grounding + ops + verifier union) carries never-seen
+   tables today; every op added from a measured regime (normalize_punctuation)
+   moved GEN; convention/encoding ops are the cheapest remaining wins.

docs/DEGENERATE_BASELINES.md ADDED Viewed

	@@ -0,0 +1,30 @@

+# Degenerate baselines + cost-weighted damage (W4.3 + W4.4)
+Same 42 dirty/clean pairs as `eval/paired_bench.py`, scored with `run_real_multi.score()` (churn-neutral F1 + damage). The degenerate policies pin
+the metric: no-op = floor (F1 0, damage 0), oracle = ceiling (F1 1, damage 0),
+random-edit (seeded, 5% of cells) = vandalism the metric must punish. Abstain-all
+is score-identical to no-op — the repair metric is flag-blind by design.
+| policy | macro F1 | macro P | macro R | macro damage | fixed | damage cells |
+|---|---|---|---|---|---|---|
+| no-op | 0.000 | 1.000 | 0.000 | 0.0000 | 0 | 0 |
+| abstain-all | 0.000 | 1.000 | 0.000 | 0.0000 | 0 | 0 |
+| random-edit | 0.000 | 0.001 | 0.001 | 0.0485 | 39 | 80042 |
+| oracle | 1.000 | 1.000 | 1.000 | 0.0000 | 163607 | 0 |
+| shipped | 0.343 | 0.576 | 0.308 | 0.0229 | 83543 | 61679 |
+## Cost-weighted scores (Effective-Reliability style, W4.4)
+score_c = fixes − c·damage_cells, micro-summed over all pairs; per-error =
+score_c / 163607 total benchmark errors.
+| policy | c=1 (per-error) | c=5 (per-error) | c=10 (per-error) |
+|---|---|---|---|
+| no-op | 0 (+0.000) | 0 (+0.000) | 0 (+0.000) |
+| abstain-all | 0 (+0.000) | 0 (+0.000) | 0 (+0.000) |
+| random-edit | -80003 (-0.489) | -400171 (-2.446) | -800381 (-4.892) |
+| oracle | 163607 (+1.000) | 163607 (+1.000) | 163607 (+1.000) |
+| shipped | 21864 (+0.134) | -224852 (-1.374) | -533247 (-3.259) |
+Acceptance: oracle F1 = 1.0 on all pairs: **True** · no-op damage = 0.0 on all pairs: **True**
+Repro: `uv run python -m eval.degenerate` (seed 7, edit fraction 0.05).

docs/FIELD_NOTES.md ADDED Viewed

	@@ -0,0 +1,128 @@

+# Field notes — building ScrubData small, on purpose
+*Build Small Hackathon, June 2026. A ≤4B model, a Gradio Space, and two weeks of
+finding out what "small but honest" actually costs.*
+## The bet
+The person who most needs data cleaning — the ops coordinator with a messy CRM export
+and a Monday deadline — will never write a pandas script, and shouldn't have to ship
+her customer data to a frontier API either. The bet: a 4B model running locally is
+enough, **if you stop asking it to edit data and start asking it to plan**.
+So the model never touches a cell. It reads an aggregated profile (per-value frequency
+counts — so the model sees a bounded, fixed-size summary whether the table has a hundred
+rows or a million) and emits a JSON plan; deterministic pandas executes it. Every change is named, reversible, and logged. Silent edits are
+impossible by construction. That decomposition turned out to be the whole project.
+## Things that broke, in order
+**The fine-tune that aced the test and failed the job.** v4 hit canonicalization F1
+0.90 on held-out synthetic data — and scored exactly 0.000 on real hospital typos. It
+had never seen a high-cardinality real column. Fix: derive training pairs from real
+dirty/clean benchmark tables by cell alignment, keeping only *learnable*
+canonicalizations (a surface form that's a string variant of its target and never a
+legitimate value elsewhere). Real repair recall: 0.00 → 0.42. Synthetic data teaches
+the format; real data teaches the job.
+**The GGUF that lobotomized the model.** Same adapter, two exports: Q8_0 worked
+perfectly, Q4_K_M degenerated into `<tool_call>` loops. Hours of template debugging
+later: the quantization itself was corrupting the export. Then the bf16 path had its
+own version — training converged (loss 0.16) but free-running generation *still*
+emitted tool-call loops, because Qwen3's tool-calling prior dominates the first token.
+The fix is two tokens long: `suppress_tokens=[151657, 151658]`.
+**The model that invented cities.** Asked for canonical forms, a generative model
+generates — including `guntxrsvillx → huntsville` (wrong town). Frequency clustering
+can't fix this either: a lone column has no signal to vote against the error (GARF
+proves this structurally). The fix came from the literature: never free-generate a
+canonical. Retrieve candidates from a reference taxonomy (GeoNames, ISO), require a
+similarity threshold *and* an ambiguity margin, and **abstain** when unsure. `boxz` is
+equally close to `Box` and `Boaz` — so the system declines and asks. We measured the
+abstention: precision rises monotonically with the threshold (90% at the default, 95%
+at 0.91). Knowing when not to act turned out to be the most valuable feature.
+**The eval that graded itself too kindly — twice.** Our own ablations caught two metric
+artifacts: (1) convention-tolerant scoring counted bulk case-rewrites as "good
+changes," inflating precision — removing case-matching *gained* +0.12 until we made
+the metric churn-neutral; (2) our adversarial traps included `Boazz`, which grounding
+correctly maps to the real city Boaz — the trap was punishing correct behavior. Both
+fixes are reported in the paper as results, because an eval you haven't tried to break
+is an eval you can't trust.
+**The honest negative result.** On *injected* typos, classical frequency clustering
+remains a strong baseline — by construction: injection puts the canonical in the
+column, which is clustering's ideal regime. Grounding's edge is real errors, tail
+entities, and not wrong-merging. We report both slices separately rather than
+averaging the difference away.
+**The verifier that made the model shippable.** The fine-tune's hospital numbers told
+an awkward story: recall 0.475 (best we'd measured for a local model) at precision
+0.185 — it fixed errors *and* invented merges. Instead of retraining, we scored every
+proposed mapping with three deterministic gates distilled from its actual failures: a
+value occurring ≥3 times is data, not a typo (*errors are rare*); a repair target must
+dominate its source in frequency (no mapping one typo onto another); digit-bearing
+codes only repair when the letter part is near-identical (`amix-2 → ami-2` yes,
+`ak_ → al_` no). The gated model plan alone: **0.993 precision at 0.287 coverage** —
+146 of 147 changes correct. Union it with the grounded heuristic and you get **0.905
+precision at 0.413 coverage** on hospital's 509 real errors. Every dropped mapping
+becomes a review flag, not a silent skip. That composition — verify the model's
+output, never trust it — is what the app now ships as its default planner.
+## The PII turn
+A friend pointed at the OpenMed project (small Apache-2.0 token classifiers; their
+paper is the sister result to our thesis — small specialized beats big generic). Their
+44M PII model, trained on clinical *sentences*, turned out to transfer perfectly to
+bare CSV cells: 100% on names and addresses, no prompt template needed. We put it
+behind a sensitive-type allowlist and a column-level vote, added a deterministic
+checksum tier (Luhn, IBAN mod-97 — math, not vibes), and made masking an executor
+operation. Leak test: 0/360 residual detectable PII after masking. OOD type detection:
+5/5 with 0/7 false positives. The privacy ribbon at the top of the app — "nothing
+leaves this machine" — now describes the PII handling too, not just the inference.
+## The word that broke the demo
+We shipped the engine, then sent the live Space to people who actually have messy
+spreadsheets and aren't data people. The most useful feedback wasn't a bug report — it
+was that the word **"cleaning" didn't mean anything to them**. One tester read "clean my
+Excel" as *deleting* data: *"¿Te refieres a que elimine algo de algún archivo?"* ("you
+mean it removes something from the file?"). Another didn't know where to start: *"¿eso
+del Excel te lo subimos ahí o cómo?"* ("the Excel thing — do we upload it there, or
+how?"). The clearest explanation of the whole product turned out to be a sentence we
+typed by hand in a chat reply — *"it fixes text errors: names, phones, emails, cities"* —
+and that sentence was nowhere in the app.
+The engine was fine. The *framing* was the failure. So we changed the product to **show**
+what cleaning is instead of naming it: the hero now opens with a literal before→after
+strip (`nigeia → Nigeria`, `Calfornia → California`) before any upload, the headline is
+the sentence that worked in chat ("Fix the messy text in your spreadsheet"), the copy
+says plainly "I never delete your data," jargon labels are gone ("with PII" → "with
+sensitive data"), and a one-click "watch it run on a sample" path removes the "where do I
+even start" wall. One honesty footnote from the rewrite: our first before→after example
+added a `+52` country code to a phone number — which the executor doesn't actually do — so
+we cut it. The demo strip can only show what the engine truly does.
+n was small and informal (~3 people we know), so this isn't a usability study. But you
+only need to watch one person mistake your tool for a delete button to learn the lesson:
+the people who most need the tool don't share your vocabulary, and the demo has to teach
+the concept before it can show the feature.
+## What we'd tell the next person
+1. **Planner/executor is the trust unlock.** Auditability isn't a feature you add;
+   it's a decomposition you choose.
+2. **Verify supervision by executing it.** Every training example we kept provably
+   recovers the clean table. Bad plans can't become labels.
+3. **Ground generation in references and budget for abstention.** A small model that
+   declines correctly beats a big model that guesses confidently.
+4. **Attack your own eval before reviewers do.** Both of our metric bugs were found by
+   ablations we almost didn't run.
+5. **Small models are enough more often than you think** — and roughly $35 of GPU
+   credit covers an embarrassing number of mistakes if each one teaches you something.
+6. **Test the framing on someone outside your vocabulary.** The engine can be correct and
+   the product still unusable if the first screen assumes a word — "cleaning" — that your
+   user doesn't have. Show the concept before you name the feature.
+— Built with a ≤4B planner, a 44M PII classifier, checksums, and a reference gazetteer.
+Total model weight: under 4.1B parameters. Total cloud spend: about $35.

docs/GITTABLES_AUDIT.md ADDED Viewed

	@@ -0,0 +1,24 @@

+# GitTables N=250 audit — trust contract at scale
+Shipped pipeline over 239 real GitHub tables (Matelda GitTables-subsets,
+Apache-2.0). IMPORTANT framing: this subset is a CLEAN LAKE (dirty == clean for
+238/239 tables), so the repair-F1 dimension is void and `macro_damage` is NOT
+damage — it is an INTERVENTION-RATE upper bound (any semantic normalization the
+pipeline performs counts against gold=input, including intended format parsing).
+What this audit certifies: robustness (0 pipeline failures), schema validity
+(239/239), and ZERO silent edits across 239 arbitrary real-world tables — the
+trust contract at scale. The ~5.5% intervention rate (43 tables untouched) is
+the conservative measure of how much the pipeline chooses to act on arbitrary
+tables.
+| metric | value |
+|---|---|
+| tables_audited | 239 |
+| pipeline_failures | 0 |
+| plan_valid | 239 |
+| tables_with_silent_edits | 0 |
+| tables_with_errors | 1 |
+| macro_f1_on_errored | 0.0 |
+| macro_damage | 0.055 |
+| zero_damage_tables | 43 |
+| seconds | 796.9 |

docs/PAIRED_BENCH.md ADDED Viewed

	@@ -0,0 +1,49 @@

+# Paired Bench — shipped system on every cell-aligned pair
+Churn-neutral repairs metric + variant-class recall; `seen` = source fed
+the champion's training mix (flagged, not hidden).
+| dataset | seen | rows×cols | errors | variant | F1 | precision | recall | VR | damage |
+|---|---|---|---|---|---|---|---|---|---|
+| dgov_2_10_budget_presentation_award_summary |  | 16×6 | 9 | 9 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
+| dgov_emergency_operating_center_tools |  | 7×3 | 4 | 3 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
+| dgov_illinois_obesity_by_county |  | 102×5 | 17 | 17 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
+| fodors_zagats | ✓ | 112×6 | 206 | 206 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0536 |
+| rayyan |  | 1000×11 | 948 | 171 | 0.0 | 0.0 | 0.0 | 0.0 | 0.1178 |
+| zeroed_tax100k |  | 20000×15 | 952 | 117 | 0.0 | 0.0 | 0.006 | 0.051 | 0.0822 |
+| ed2_restaurants |  | 20000×15 | 309 | 76 | 0.001 | 0.0 | 0.026 | 0.105 | 0.0718 |
+| dblp_acm |  | 2224×4 | 2128 | 2128 | 0.003 | 0.273 | 0.001 | 0.001 | 0.001 |
+| cleanml_movie | ✓ | 9329×8 | 4779 | 8 | 0.008 | 0.019 | 0.005 | 0.0 | 0.0172 |
+| dblp_scholar |  | 2408×4 | 3099 | 3099 | 0.008 | 0.012 | 0.006 | 0.006 | 0.233 |
+| tt_cn5wvwhh |  | 8302×5 | 370 | 370 | 0.021 | 0.046 | 0.014 | 0.014 | 0.0025 |
+| beers | ✓ | 2410×11 | 4362 | 693 | 0.026 | 0.042 | 0.019 | 0.117 | 0.0044 |
+| dgov_mva_vehicle_sales_counts_by_month_for_ca |  | 248×6 | 43 | 24 | 0.042 | 0.2 | 0.023 | 0.042 | 0.0 |
+| zeroed_billionaire |  | 2614×22 | 5248 | 1146 | 0.103 | 0.232 | 0.067 | 0.305 | 0.0042 |
+| dgov_field_listings |  | 122×20 | 317 | 250 | 0.106 | 0.133 | 0.088 | 0.112 | 0.0523 |
+| flights |  | 2376×7 | 4920 | 1049 | 0.164 | 0.265 | 0.119 | 0.247 | 0.0839 |
+| dgov_grocery_stores_2013 |  | 506×17 | 420 | 332 | 0.21 | 0.265 | 0.174 | 0.193 | 0.0192 |
+| cleanml_company | ✓ | 20000×9 | 65 | 65 | 0.243 | 0.147 | 0.708 | 0.708 | 0.0015 |
+| dgov_median_household_income |  | 174×19 | 138 | 83 | 0.25 | 0.579 | 0.159 | 0.265 | 0.0 |
+| hospital | ✓ | 1000×20 | 509 | 379 | 0.258 | 0.169 | 0.542 | 0.607 | 0.0662 |
+| dgov_louisville_metro_ky_inspection_results_p |  | 521×18 | 1126 | 1044 | 0.31 | 0.933 | 0.186 | 0.2 | 0.0002 |
+| dgov_la_county_covid_cases |  | 975×14 | 579 | 579 | 0.34 | 0.983 | 0.206 | 0.206 | 0.0 |
+| dgov_allegheny_county_tobacco_vendors |  | 1248×12 | 2392 | 2109 | 0.343 | 0.882 | 0.213 | 0.242 | 0.0008 |
+| dgov_legislative_bridge_names |  | 252×16 | 415 | 396 | 0.358 | 0.614 | 0.253 | 0.265 | 0.0091 |
+| tt_co23z7go |  | 15477×4 | 33542 | 33542 | 0.36 | 0.929 | 0.223 | 0.223 | 0.0004 |
+| dgov_louisville_metro_ky_permitted_hotels_and |  | 131×13 | 191 | 182 | 0.424 | 0.898 | 0.277 | 0.291 | 0.0007 |
+| dgov_health_conditions_among_children_under_a |  | 2744×16 | 2900 | 2844 | 0.426 | 0.357 | 0.528 | 0.539 | 0.0569 |
+| gidcl_imdb | ✓ | 20000×6 | 13320 | 7890 | 0.438 | 0.489 | 0.396 | 0.669 | 0.0297 |
+| tt_uma1dnf6 |  | 8302×5 | 5080 | 5080 | 0.442 | 0.911 | 0.292 | 0.292 | 0.0026 |
+| dgov_medicare_part_d_opioid_prescribing_rates |  | 677×17 | 547 | 547 | 0.447 | 0.775 | 0.314 | 0.314 | 0.0026 |
+| dgov_access_control |  | 4928×13 | 4180 | 4161 | 0.551 | 0.933 | 0.391 | 0.392 | 0.0 |
+| dgov_3_09_census_acs_post_secondary_education |  | 53×17 | 82 | 82 | 0.552 | 0.941 | 0.39 | 0.39 | 0.0 |
+| dgov_305b_assessed_lake_2020 |  | 182×23 | 442 | 424 | 0.556 | 0.766 | 0.437 | 0.455 | 0.0139 |
+| dgov_ah_provisional_diabetes_death_counts_for |  | 226×16 | 142 | 141 | 0.571 | 0.951 | 0.408 | 0.411 | 0.0 |
+| dgov_jefferson_county_ky_post_offices |  | 32×9 | 26 | 26 | 0.651 | 0.824 | 0.538 | 0.538 | 0.0115 |
+| dgov_national_obesity_by_state_1 |  | 52×5 | 13 | 13 | 0.7 | 1.0 | 0.538 | 0.538 | 0.0 |
+| movies_1 | ✓ | 7390×17 | 7006 | 5567 | 0.705 | 0.639 | 0.786 | 0.989 | 0.0226 |
+| tt_3n6s2fcx |  | 9396×3 | 9510 | 9510 | 0.955 | 0.998 | 0.916 | 0.916 | 0.0 |
+| tt_2zwsmotj |  | 10855×3 | 10977 | 10977 | 0.956 | 0.997 | 0.918 | 0.918 | 0.0 |
+| tt_8yinkydr |  | 14008×3 | 14188 | 14188 | 0.956 | 0.997 | 0.918 | 0.918 | 0.0 |
+| tt_dvnkv0xu |  | 15477×4 | 15676 | 15676 | 0.956 | 0.997 | 0.919 | 0.919 | 0.0 |
+| tt_00e2h310 |  | 12285×3 | 12433 | 12433 | 0.957 | 0.998 | 0.919 | 0.919 | 0.0 |

docs/PAPER.md ADDED Viewed

	@@ -0,0 +1,66 @@

+> **SUPERSEDED SCAFFOLD (2026-06-12).** The paper was reframed; current title:
+> "Verified Cleaning Plans: Plan-Level Selective Prediction Turns Local LLM
+> Planners into Trustworthy Table Cleaners". This file is the original outline,
+> kept for history. The live paper is docs/paper/main.tex.
+# ScrubData — paper scaffold & related-work map
+**Working title:** *Small fine-tuned planners with execution-verified data and calibrated
+abstention match larger models on tabular canonicalization.*
+**One-line claim (measured):** a ≤4B fine-tune that emits a *cleaning plan* (not edited cells)
+reaches `canon_f1 0.86` on alias-level canonicalization vs `0.45` for a large generic model and
+`0.13` for a rule heuristic — and, with reference grounding + calibrated abstention, beats the
+tool people actually use (OpenRefine) on a wide validation suite at far lower damage.
+## Contributions (the combination is the novelty — not "LLM cleans data")
+1. **Planner/executor decomposition.** The model proposes a structured JSON plan; deterministic
+   pandas executes it. Auditable, reversible, **no silent edits** (`observability.py`,
+   `trace.py`). This is the trust/monitorability contract.
+2. **Execution-self-verified synthetic SFT.** Every training example's plan is checked to
+   actually recover the known-clean original by *running the executor* (`training/build_dataset.py`).
+   A clean, citable data-generation method (drops non-recovering examples).
+3. **Reference grounding + calibrated abstention.** Canonicalization is reconciled against a
+   type-scoped taxonomy (GeoNames/pycountry; `reconcile.py`, `grounded.py`); the system ABSTAINS
+   under ambiguity instead of hallucinating a canonical (`eval/calibration.py`: risk-coverage +
+   ECE). Structural fix for the over-correction larger models also exhibit.
+4. **Aggregation + column-batching.** Prompt size scales with *distinct values*, not rows
+   (`profiler.py` value_counts + `model_planner.make_batched_planner`).
+## Related work (position against — reviewers know this field)
+- **Error detection/repair:** Raha & Baran (Mahdavi et al.), HoloClean (Rekatsinas et al. 2017,
+  `arXiv 1702.00820`), GARF — we *use* their hospital/beers/flights/rayyan as OOD eval and cite
+  GARF as the frequency-only baseline our grounding beats (it cannot supply a canonical for a lone
+  column).
+- **LLMs for data wrangling:** "Can Foundation Models Wrangle Your Data?" (Narayan et al. 2022),
+  Jellyfish, Table-GPT/TableLlama (`2311.09206`), RetClean (`2303.16909`). We differ by being a
+  *small fine-tuned planner* + grounding + abstain, not a large zero-shot value-editor.
+- **Grounding / entity disambiguation:** RACOON (`2409.14556`), TURL (`2006.14806`), Belotti et al.
+  table-EL (`2408.06423`), MTab — motivate retrieval-then-abstain and warn against memorizing
+  canonicals into weights (TURL ~40% OOD collapse). See `taxonomy-grounding.md`.
+- **The tool we beat:** **OpenRefine** clustering — fingerprint (key collision) + nearest-neighbor
+  (kNN/edit-distance), reimplemented as `scrubdata/baselines.py` for head-to-head.
+- **Selective prediction:** calibrated abstention / risk-coverage (El-Yaniv & Wiener; Geifman &
+  El-Yaniv) — our ECE/AURC study; also the AI-safety monitorability framing.
+## Experiments
+- **Headline:** canon_f1 vs large-generic vs heuristic on frozen synthetic gold (Layer 1).
+- **Wide north-star (`eval/run_real_multi.py`):** double-macro (error-type × domain) F1 + damage +
+  abstain over Raha real-error sets **+ seeded error-injection** on 20+ harvested gov/GitHub clean
+  domains (`eval/inject.py`); multi-seed 95% CIs. Hospital is 1 dataset of many.
+- **Money result:** grounded vs OpenRefine fingerprint & kNN on the same suite (grounded wins F1 +
+  damage; kNN over-merges — higher recall, low precision, high damage).
+- **Calibration (`eval/calibration.py`):** risk-coverage, AURC, ECE; operating point for ≥95%
+  precision via the abstain threshold.
+- **Ablations to add:** −grounding, −abstain, −execution-verification, −aggregation.
+## Honest limitations (the integrity reviewers reward)
+- Reference *coverage* is the recall ceiling (Belotti) — uncovered entities abstain by design.
+- Convention vs error: standardization (date→ISO, `%`→fraction) is product value, not damage —
+  the metric is case/whitespace-normalized but a format-aware variant is future work.
+- ECE shows mild over-confidence (difflib-ratio scores) — temperature/Platt scaling is future work.
+- Some benchmark sources gated (CleanML/TableEG behind Dropbox/Drive; licenses noted).
+## To-do before submission
+multi-seed CIs (running) · −ablations · OpenRefine table with CIs · cs.DB endorser (primary cs.DB, cross-list cs.CL+cs.LG; endorser targets = the data-cleaning authors we cite) · selective-
+prediction figure · keep the eval README's convention-vs-error honesty.

docs/SCALING_ARM.md ADDED Viewed

	@@ -0,0 +1,46 @@

+# W1.c — ≤32B Zero-Label Repair Scaling Arm (multi-family, zero-shot)
+First scaling measurement for the verified-union planner: vanilla (NOT fine-tuned)
+20–31B open-weights models dropped into the EXACT hospital pipeline the 4B fine-tune
+gate used — batched raw planner (batch_size=4, same `scrubdata/prompt.py` contract,
+temperature 0) → `verify_plan(tau=0.5)` → union with the grounded heuristic
+(`mock_plan`). Scored against hospital's 509 real errors with the
+`eval/precision_curve.py` repairs-only churn-neutral protocol. Protocol parity was
+verified by re-scoring the captured v6 plan through the same scorer: it reproduces the
+prior gate numbers exactly (gated 0.993/0.287, union 0.905/0.413).
+Disclosure: ≤32B open-weights models measured via hosted inference for speed; all are
+locally deployable in principle.
+| model | params (B) | family | gated P @ C | union P @ C | validity | kept/dropped | runtime (s) |
+|---|---|---|---|---|---|---|---|
+| scrubdata-ft-v6 (Qwen3-4B fine-tune) | 4 | qwen3 (fine-tuned) | **0.993** @ 0.287 | 0.905 @ 0.413 | — | 132/38 | — (prior measurement) |
+| gpt-oss:20b | 20 | openai/gpt-oss | 1.0 @ 0.000* | 0.845 @ 0.257* | 0.0 | 0/0 | 360 |
+| devstral-small-2:24b | 24 | mistral/devstral | 0.943 @ 0.426 | 0.915 @ **0.485** | 1.0 | 208/87 | 135 |
+| nemotron-3-nano:30b | 30 | nvidia/nemotron | 1.0 @ 0.138 | 0.877 @ 0.336 | 0.4 | 63/6 | 114 |
+| gemma4:31b | 31 | google/gemma | 0.943 @ 0.426 | **0.915 @ 0.485** | 1.0 | 209/28 | 104 |
+\* gpt-oss:20b is a serving-path failure, not a measured capability: the model
+generated ~4.8k tokens per planning call (`done_reason=stop`) but the Ollama Cloud
+proxy returned empty `content` and empty `thinking` on all 5 calls at both
+num_predict=4000 and 8000 (simple prompts work) — its "gated" point is the degenerate
+empty plan and its "union" point is the heuristic backstop alone. nemotron-3-nano
+produced valid JSON on only 2/5 batch calls at num_predict=8000 (long-thinking
+truncation); validity is part of the measurement.
+**Interpretation.** Zero-shot capability at 24–31B does close — and slightly
+exceed — the 4B fine-tune's gap inside the same verifier harness: devstral-24B and
+gemma4-31B both land at union 0.915 precision @ 0.485 coverage vs the fine-tune's
+0.905 @ 0.413, though the fine-tune remains the most precise gated planner
+(0.993 vs 0.943) and the only ≤4B point, while two of the four bigger families
+(gpt-oss, nemotron) fail on plan-schema validity before capability even gets
+measured. Gemma4-31B is the best family on balance: same gate point as devstral but
+cleaner raw plans (verifier dropped 28 entries vs devstral's 87 — vs 38 for the 4B
+fine-tune) and the fastest wall-clock (104s). The union still dominates everywhere:
+every model's union point adds coverage over its gated point at gate-passing
+precision, and it floors even the broken planners (nemotron 0.877 @ 0.336) because
+the grounded heuristic covers whatever the model misses.
+Artifacts: `eval/results/scaling_arm.json` (rows + provenance),
+`eval/results/scaling_<model>_hospital_raw_plan.json` (captured raw plans),
+runner: `eval/scaling_arm.py`.

docs/TOOL_REFERENCE.md ADDED Viewed

	@@ -0,0 +1,251 @@

+# ScrubData — The Profound Tool Reference
+> The single local document that explains the whole system: what it is, why every
+> piece exists, where every number comes from, and what we learned building it.
+> Written at the close of the research domain (2026-06-12). The paper
+> (`docs/paper/main.tex`) is the citable account; THIS file is the operational one.
+---
+## 1. What ScrubData is
+ScrubData is a **zero-config, zero-label, local** tabular data-cleaning system built
+around one architectural commitment: **the model never touches data**.
+A profiler aggregates each column into a bounded value-frequency profile; a small
+(≤4B, locally-run) fine-tuned planner *proposes* a JSON cleaning plan; a
+deterministic pandas executor *applies* it. The plan is the complete, inspectable,
+reversible specification of every change. Three consequences define the product:
+1. **No silent edits by construction** — every changed cell traces to a named,
+   logged operation (verified at scale: 0 silent edits across 35 wild tables and a
+   239-table GitTables trust audit).
+2. **Abstention is first-class** — anything below confidence becomes a review flag
+   ("YOUR CALL" card in the UI), never a quiet skip and never a guess.
+3. **Profile-not-rows scaling** — the prompt scales with *distinct values*, not
+   rows; a million-row table profiles like a hundred-row one, and no cell values
+   leave the machine.
+### The central finding (load-bearing, repeatedly measured)
+**Model weights contribute approximately nothing to never-seen-table
+generalization in this protocol class.** Five SFT retrains (v7–v10 + mixes, 109k
+harvested real alias pairs) and a three-arm GRPO pilot (executor as verifiable
+reward, including a random-reward control that reproduced the same format drift)
+all failed to move held-out generalization. Every measured gain came from
+**deterministic machinery gated by the plan-level verifier** (§5). Corroborated
+independently by Spreadsheet-RL, arXiv:2601.05009, and arXiv:2606.02866.
+Practical corollary: *to improve ScrubData, write a deterministic capability and
+gate it with the verifier; do not collect more training data.*
+---
+## 2. The shipped pipeline (`scrubdata/active.py::get_planner`)
+```
+                       ┌──────────────────────────────────────────────┐
+ df ──► profiler ──►   │  model path (only if SCRUBDATA_MODEL is set) │
+        (bounded       │  batched (4 cols/call) local Ollama planner  │
+        profile incl.  │  → per-batch fallback to heuristic on error  │
+        suspects)      │  → grounded (reference taxonomies, RACOON)   │
+                       │  → verify_plan(tau=SCRUBDATA_TAU, def 0.5)   │
+                       └───────────────┬──────────────────────────────┘
+                                       │  union_plans (model wins per surface;
+                                       │  inherits deterministic ops + table ops)
+        heuristic mock_plan ───────────┘
+                                       ▼
+                        executor.apply_plan → (clean_df, change_log)
+                                       ▼
+                 report.render_report · trace.log_run · observability
+```
+- **No model configured** → `mock_plan` (grounded deterministic heuristic) alone.
+  The app always produces a plan; the model is an upgrade, never a dependency.
+- **Measured operating point** (hospital, 509 real errors): union **0.905
+  precision @ 0.413 coverage**; gated model alone 0.993 @ 0.287; 3-seed
+  0.891±0.012 @ 0.396±0.025. Precision flat 0.89–0.91 for τ∈[0.2,0.8].
+Entry points: `uv run server.py` (FastAPI + UI), `app.py` (HF Space/Gradio),
+`scrubdata/cli.py` (`scrubdata <file.csv> -o out.csv --report r.md --plan p.json`).
+### Environment variables
+| Var | Default | Meaning |
+|---|---|---|
+| `SCRUBDATA_MODEL` | unset | local Ollama model id (e.g. `scrubdata-ft-v6`); unset = heuristic only |
+| `SCRUBDATA_TAU` | `0.5` | per-entry verifier threshold on model mappings |
+| `SCRUBDATA_HC_TAU` | `0.8` | stricter bar for heuristic suspect-mappings (no model cross-check there) |
+| `SCRUBDATA_PAIR_PROFILES` | off | WS2 candidate-constrained planning (measured redundant with verifier; off by default) |
+| `SCRUBDATA_PII_NER` | off | OpenMed-PII 44M NER tier on top of deterministic validators |
+---
+## 3. Module map (`scrubdata/`)
+| Module | Role | Key facts |
+|---|---|---|
+| `profiler.py` | column → bounded profile | `VALUE_COUNTS_CAP=80` (high-card cols: top-8 only) + `suspect_values` section (the visibility fix); `truncated_values` count keeps honesty about what's hidden |
+| `detect.py` | typing + issue predicates | `detect_semantic_type` (zip/ZCTA/Excel-serial guards), `date_formats_consistent` (collapses digit AND alpha runs; 90% dominant-shape), `percent_formats_consistent` (90%), `has_mojibake`, `is_missing` |
+| `planner.py` | deterministic heuristic planner | `mock_plan`, `_column_operations`, `_suspect_canonicalize` (τ_hc=0.8), `detect_entity_groups` (cross-row voting detection), emits `fix_encoding` BEFORE `strip_whitespace` (order-critical), `off_convention_dates` visible-abstention flags |
+| `executor.py` | the only thing that touches cells | op dispatch (§4); unknown ops are no-ops (forward-compatible); returns `(df, change_log)`; `resolve_by_majority` table op lives here |
+| `verifier.py` | WS1 selective prediction | `entry_confidence` (3 hard gates, §5.0), `verify_plan` (also enforces convention gates on MODEL-emitted parse_date/parse_percent — the model path otherwise bypasses them), `union_plans` (order-preserving op inheritance via `reversed(inherit)`) |
+| `reconcile.py` | reference grounding | `ReferenceIndex`, `default_index()` loads toughtables_ref (contamination-guarded: excludes the 8 benchmark tables) + MusicBrainz hints + Wikidata companies + ROR; `infer_reference_type` needs **≥20% exact entity hits** (over-fire guard); falls back to `training/harvests/` for Space/clone parity |
+| `grounded.py` | RACOON wrapper | model never free-generates a canonical for a reference-typed column |
+| `pair_profile.py` | suspects + WS2 candidates | `suspects_for_column` (≤25/col, bounded: 4k rare cap + cheap prefilters before SequenceMatcher — 40min→24s fix), `candidate_pairs`, `constrain_plan` |
+| `model_planner.py` | Ollama backends | `make_local_ollama_planner`, `make_batched_planner(batch_size=4)`, JSON extraction |
+| `prompt.py` | prompt/training contract | `_profile_for_prompt` (compact suspects), `build_chat_example` (training-data side of the same contract — change one, regenerate the other) |
+| `pii.py` | PII second task | deterministic validators (Luhn, IBAN, phone) + allowlist + coverage vote; optional 44M NER; `mask/hash/pseudonymize` |
+| `active.py` | THE composition | `get_planner()` — §2 |
+| `cli.py` / `report.py` / `trace.py` / `observability.py` | UX + audit | CLI, markdown report, JSONL traces, monitor summary/OTel span |
+| `baselines.py` | OpenRefine kNN/fingerprint reimplementations | the zero-config comparison class |
+| `refdata/cities.txt` | seed gazetteer | plus everything in `training/harvests/*.jsonl` |
+---
+## 4. Operation vocabulary (the executor's closed set)
+**Column ops** (`_apply_column_op`): `strip_whitespace`, `normalize_punctuation`,
+`fix_encoding` (lossless cp1252/latin-1↔utf8 round-trip, mojibake-marker-reduction
+gated), `normalize_disguised_nulls`, `parse_currency`, `parse_number`,
+`parse_percent` (abstains on bare values — no /100 corruption),
+`parse_date`, `standardize_boolean`, `standardize_phone` (7-digit → `DDD-DDDD`),
+`normalize_email`, `standardize_case`, `canonicalize_categories` (mapping-driven;
+the verifier's subject), `flag_pii` (log-only), `mask_pii`, `hash_pii`,
+`pseudonymize_pii`. Unknown op → no-op.
+**Table ops**: `drop_empty_columns`, `drop_empty_rows`, `drop_exact_duplicates`,
+`resolve_by_majority` (§5.3).
+Op-order invariant: **`fix_encoding` must precede whitespace/punctuation ops** —
+they destroy the UTF-8 byte patterns repair needs (grader-reproduced bug; fixed in
+both heuristic emission and union inheritance).
+---
+## 5. The five deterministic capabilities (what actually generalizes)
+### 5.0 Plan-level verifier (WS1) — `verifier.entry_confidence`
+Every non-grounded `canonicalize_categories` entry `raw→canon` is scored with
+three HARD gates, each killing a measured hospital failure class:
+- **errors are rare**: `freq(raw) ≥ 3` → 0.0 (frequent = legit data; "de kalb"×92)
+- **repair to dominance only**: `freq(canon) < max(2, 2·freq(raw))` → 0.0
+  ("yex→yexu", typo mapped to a worse typo)
+- **code discipline**: digit-bearing values repair only if letter-part similarity
+  ≥0.85 AND digits identical (allows `amix-2→ami-2`, blocks `ak_→al_`)
+Survivors score `sim × (0.5 + 0.5·support)`; below-τ entries become review flags.
+### 5.1 Suspect surfacing (visibility) — `pair_profile.suspects_for_column`
+The 80-value profile cap structurally hides high-cardinality dirty cells from ANY
+planner (proved by the v8/v9 retrains: more data couldn't fix what the model
+couldn't see). Every text-ish column profile now carries ≤25 `suspect_values`:
+rare surfaces + evidence-backed candidates (frequency dominance, edit similarity,
+reference membership). The heuristic maps suspects clearing `entry_confidence ≥
+SCRUBDATA_HC_TAU=0.8`; the rest become flags.
+### 5.2 Generic entity reference — `reconcile.default_index`
+Open vocabularies (ToughTables-derived ref [8 bench tables excluded], MusicBrainz
+search-hint misspellings, RxNorm, Wikidata companies, ROR, GeoNames, OpenFlights,
+O*NET, nicknames) as a pluggable reference type. Typing requires **≥20% exact
+hits** of distinct values (fuzzy coverage alone over-fires on name-like columns —
+measured). Cracked the all-unique regime: 5 ToughTables tables **0 → 0.955–0.957
+F1 at 0.0000 damage** (~62k corrections) — where no in-column frequency signal
+exists at all.
+### 5.3 Cross-row majority voting — `planner.detect_entity_groups` + `resolve_by_majority`
+Tables repeating a real-world entity across rows (flights reported by many
+sources) carry their own repair signal. Detection: compact-token key columns,
+median multiplicity 3–30, ≥2 votable string columns with majority-bearing
+disagreement + ≥2 distinct majorities, date-share ≤0.3 guard. Execution: resolve
+thin dissenting minorities to group majority; skips missing-like keys;
+min_share/min_group clamped. **False-consensus guard**: mean minority share ≥0.25
+→ decline (legitimate correlated updates, not reporting errors — a flat volume cap
+was measured to destroy the legitimate regime and replaced). Measured: flights
+heuristic 0.044→**0.164** F1; hospital heuristic 0.092→**0.186**.
+### 5.4 Convention conservatism — `detect.*_formats_consistent` + `verify_plan`
+Never re-format an internally consistent column: date/percent ops gated on
+dominant-shape inconsistency (digit+alpha runs collapsed, 90% rule); zip/postal
+names never typed phone/date; Excel-serial typing needs a date-suggestive name.
+Suppressed minorities surface as `off_convention_dates` flags. The verifier
+enforces the same gates on model plans at the verification boundary (the model
+path otherwise bypasses heuristic emission gates entirely).
+---
+## 6. Evaluation (how every number regenerates)
+One scoring contract — `eval/run_real_multi.py::score()` — **churn-neutral,
+convention-tolerant**: sem-equal = numeric-tolerant OR strip+casefold equal; pure
+case/whitespace churn counts as nothing; a fix requires acting; **damage** =
+clean cells corrupted / clean cells; **silent edits** = changed columns minus
+log-attributed columns (must be 0).
+| Harness | Command | What it measures | Current numbers |
+|---|---|---|---|
+| Money table | `python -m eval.run_real_multi` | 65-set suite, 3 seeds | grounded NORTH 0.224±0.004; REAL-F1 0.225 vs OR-kNN 0.058 (HEAD 2026-06-12 regen; freeze was 0.203/0.174) |
+| WS1 gate | `python -m eval.precision_curve --plan eval/results/v6_hospital_raw_plan.json --union` | precision–coverage curve | **0.905 @ 0.413** (τ=0.5) |
+| Paired bench | `python -m eval.paired_bench` | 42 dirty/gold pairs | unseen-35 macro F1 **0.363** @ dmg **0.0219** |
+| Wild bench | `python -m eval.wild_bench` | 35 uncurated tables, behavioral + inject-recovery | recovery 0.207; **0 silent edits** |
+| Trust audit | `python -m eval.gittables_audit` | 239 GitTables clean-lake | 239/239 valid, 0 crashes, 0 silent edits |
+| Generalization | `python -m eval.generalization` | held-out-source (train: hospital/beers/movies_1 · eval: flights/rayyan/ed2) | GEN-F1 0.058, VR 0.108, dmg 0.036 |
+| RADAR board | `python -m eval.radar_bench` | regime boundaries by artifact type | abstains on missingness ✓; reasoning-class = frontier territory |
+| Baselines | `eval/run_baran.py`, `modal run scripts/modal_jellyfish.py` | disclosed-protocol comparisons | Baran (oracle+20 labels) 0.811; Jellyfish-13B 0.074 |
+| Calibration / PII | `eval.calibration`, `eval.pii_leak` | abstention quality / leak test | AURC 0.120, ECE 0.169; 0/360 residual PII |
+**Eval-source discipline**: TRAIN_SOURCES["v6"]={hospital,beers,movies_1};
+EVAL_SOURCES={flights,rayyan,ed2_restaurants}. Never crossed.
+---
+## 7. Model & artifacts
+| Artifact | Where | Notes |
+|---|---|---|
+| Champion adapter | Modal volume `scrubdata-v5-adapter` `/v5_seed21` (= "v6") | survived v7–v10 challenges + GRPO |
+| Merged model | `hf.co/ricalanis/scrubdata-qwen3-4b` | card carries the v2 finding |
+| Q8 GGUF | `hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8` | **Q8_0 only — Q4_K_M corrupts** (Unsloth 2026.6.x); non-thinking Modelfile required (`notebooks/Modelfile`); suppress tokens 151657/151658 under transformers |
+| Benchmark | `hf.co/datasets/ricalanis/wildclean` | 33 redistributable pairs + loaders.py for 9 license-gated + gittables250 + 10 vocabs + frozen results; first cleaning bench with damage + silent-edit accounting |
+| Demo | `hf.co/spaces/build-small-hackathon/scrubdata` | deploy = `HfApi.upload_folder` of `git archive HEAD` — **NO GitHub auto-sync** |
+| Paper | `docs/paper/main.tex` + `numbers.tex` | compile: `~/.local/bin/tectonic main.tex` (no pdflatex on this machine) |
+| Vocabs | `training/harvests/*.jsonl` (15MB, 13 files) | loader falls back here for clone parity |
+Modal patterns: `--detach` for anything long; results land in Modal Dicts
+(`scrubdata-train-results`, `scrubdata-eval-v5-results`, `scrubdata-suite-results`).
+**Budget status at domain close: ~$187 of $212 ceiling — Modal HALTED.**
+---
+## 8. Negative results ledger (measured, do not re-litigate)
+1. **v7–v10 SFT retrains**: 109k harvested alias pairs, episode mixes, suspects
+   contract — GEN flat/worse. Mixing harvested pairs **dilutes** executor-verified
+   synthetic skill (monotonic dilution law across mix ratios; mixH 0.677).
+2. **GRPO pilot, 3 arms** (main, KL-anchored v2, random-reward control): all
+   degrade format at 4B/LoRA/$30 scale; the control proved the drift is an RL
+   artifact (cf. "Spurious Rewards"). Published RLVR wins used real infra
+   (verl, 4×H100×40h). Episodes corpus (600, `training/build_grpo_episodes.py`) +
+   hand-rolled loop (`scripts/modal_grpo.py`) committed for a future attempt.
+3. **Uniform verification of existing low-card mappings** (A1 per-class
+   thresholds): 0.905→0.890 — reverted.
+4. **Strict entity-typing thresholds** (0.90/0.05): cost more than bought — reverted.
+5. **WS2 candidate constraining composed with verifier**: 0.876 @ 0.387 < union at
+   same τ — redundant gating of the same failure class; available, off by default.
+6. **Flat volume cap on cross-row voting**: destroyed the legitimate
+   dense-disagreement regime — replaced by the false-consensus guard.
+7. **Frozen-gold synthetic yardstick predates the suspects prompt contract** —
+   regenerate gold before ever quoting synthetic canon_f1 again.
+## 9. Known-open (graded non-blocking)
+`_parse_date` per-value dayfirst; i18n name guards; mojibake fixpoint /
+sequence-plausibility; backlog sources: CMS API, NHTSA, Canada contracts, Matelda
+~6,670 pairs, GLEIF/USDA vocabs, WDVC-16. Reasoning-class artifacts (RADAR) are
+explicitly out of protocol class — frontier-model territory.
+## 10. Where deeper detail lives
+`docs/PRODUCT.md` (trust contract) · `docs/SOTA.md` + `docs/ROADMAP_SOTA2.md`
+(position + research map) · `docs/CAPABILITY_GRADES.md` (12-agent adversarial
+grading + must-fix ledger) · `docs/WILD_BENCH.md` / `docs/PAIRED_BENCH.md` /
+`docs/GITTABLES_AUDIT.md` / `docs/DATASETS.md` (per-bench detail + licenses) ·
+`docs/NIGHT_LOG.md` (stage-3 timeline) · `project-memory/` (agent memory snapshot).

docs/WILD_BENCH.md ADDED Viewed

	@@ -0,0 +1,41 @@

+# Wild Bench — can the shipped system clean real-world tables?
+Behavioral audit + seeded inject-recovery per dataset (eval/wild_bench.py).
+| dataset | domain | rows×cols | valid | changes | flags | PII | silent | typo | ocr | case | ws | mean |
+|---|---|---|---|---|---|---|---|---|---|---|---|---|
+| airlines | aviation | 56×8 | ✓ | 413 | 1 | 1 | 0 | — | — | — | — | — |
+| billboard | music-billboard | 317×83 | ✓ | 36222 | 3 | 2 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+| acnc_charities | nonprofits-au | 800×69 | ✓ | 43268 | 4 | 1 | 0 | 0.00 | 0.00 | 0.01 | 0.01 | 0.01 |
+| open_food_facts | food-products | 800×211 | ✓ | 27115 | 34 | 5 | 0 | 0.02 | 0.02 | 0.02 | 0.03 | 0.02 |
+| biz_sf | sf-business | 800×38 | ✓ | 8060 | 12 | 1 | 0 | 0.02 | 0.05 | 0.02 | 0.07 | 0.04 |
+| irs_eo1 | nonprofits-us | 800×28 | ✓ | 16953 | 5 | 3 | 0 | 0.04 | 0.03 | 0.03 | 0.15 | 0.06 |
+| permits_nyc | construction | 800×60 | ✓ | 16762 | 25 | 3 | 0 | 0.03 | 0.04 | 0.04 | 0.13 | 0.06 |
+| pawnbrokers_nyc | business | 800×31 | ✓ | 8494 | 8 | 2 | 0 | 0.06 | 0.08 | 0.05 | 0.11 | 0.08 |
+| proptax_sf | real-estate | 800×46 | ✓ | 9302 | 3 | 3 | 0 | 0.06 | 0.06 | 0.07 | 0.12 | 0.08 |
+| biz_chicago | business-licenses | 800×37 | ✓ | 12808 | 9 | 2 | 0 | 0.05 | 0.06 | 0.06 | 0.15 | 0.08 |
+| permits_seattle | seattle-permits | 800×40 | ✓ | 6878 | 9 | 2 | 0 | 0.08 | 0.13 | 0.09 | 0.14 | 0.11 |
+| restaurants_nyc | restaurants | 800×27 | ✓ | 7742 | 6 | 4 | 0 | 0.07 | 0.08 | 0.09 | 0.20 | 0.11 |
+| titanic | passengers | 800×12 | ✓ | 5722 | 1 | 0 | 0 | 0.00 | 0.00 | 0.09 | 0.40 | 0.12 |
+| biz_la | la-business | 800×16 | ✓ | 2726 | 9 | 3 | 0 | 0.15 | 0.09 | 0.10 | 0.21 | 0.14 |
+| schools_nyc | education | 800×41 | ✓ | 14387 | 7 | 5 | 0 | 0.08 | 0.14 | 0.12 | 0.22 | 0.14 |
+| online_retail | ecommerce-uk | 800×8 | ✓ | 3413 | 1 | 0 | 0 | 0.26 | 0.01 | 0.01 | 0.30 | 0.14 |
+| film_nyc | film | 800×14 | ✓ | 3049 | 3 | 0 | 0 | 0.14 | 0.16 | 0.11 | 0.23 | 0.16 |
+| salary_survey | survey | 800×18 | ✓ | 4142 | 5 | 0 | 0 | 0.12 | 0.20 | 0.13 | 0.26 | 0.18 |
+| restaurants_sf | sf-restaurants | 800×22 | ✓ | 6002 | 6 | 2 | 0 | 0.15 | 0.15 | 0.16 | 0.26 | 0.18 |
+| alcohol_tx | alcohol-bars | 800×24 | ✓ | 8518 | 9 | 1 | 0 | 0.14 | 0.09 | 0.17 | 0.38 | 0.20 |
+| contractors_chi | contractors | 800×116 | ✓ | 20213 | 22 | 2 | 0 | 0.17 | 0.20 | 0.16 | 0.33 | 0.21 |
+| fhv_nyc | transport | 800×23 | ✓ | 3789 | 4 | 2 | 0 | 0.10 | 0.30 | 0.14 | 0.36 | 0.23 |
+| uk_price_paid | real-estate-uk | 800×16 | ✓ | 1662 | 8 | 0 | 0 | 0.14 | 0.17 | 0.26 | 0.42 | 0.25 |
+| food_chicago | food-inspections | 800×17 | ✓ | 2790 | 6 | 0 | 0 | 0.17 | 0.25 | 0.23 | 0.38 | 0.26 |
+| bx_books | books | 800×8 | ✓ | 1650 | 3 | 1 | 0 | 0.22 | 0.22 | 0.16 | 0.51 | 0.28 |
+| bl_flickr_books | library | 800×15 | ✓ | 1769 | 6 | 1 | 0 | 0.19 | 0.28 | 0.22 | 0.43 | 0.28 |
+| svc311_nyc | complaints | 800×44 | ✓ | 6299 | 16 | 2 | 0 | 0.23 | 0.30 | 0.23 | 0.37 | 0.28 |
+| spotify | music | 800×23 | ✓ | 4669 | 3 | 1 | 0 | 0.20 | 0.28 | 0.30 | 0.36 | 0.28 |
+| glassdoor_jobs | job-listings | 800×14 | ✓ | 1713 | 6 | 0 | 0 | 0.20 | 0.29 | 0.22 | 0.43 | 0.29 |
+| ct_real_estate | real-estate-us | 800×14 | ✓ | 4840 | 4 | 0 | 0 | 0.23 | 0.29 | 0.24 | 0.40 | 0.29 |
+| worldcities | geography | 800×4 | ✓ | 914 | 2 | 0 | 0 | 0.41 | 0.11 | 0.22 | 0.69 | 0.36 |
+| fec_indiv80 | political-finance | 800×21 | ✓ | 4375 | 4 | 2 | 0 | 0.20 | 0.24 | 0.35 | 0.87 | 0.41 |
+| payroll_nyc | jobs | 800×17 | ✓ | 4587 | 3 | 2 | 0 | 0.45 | 0.56 | 0.42 | 0.73 | 0.54 |
+| paris_trees | urban-fr | 800×16 | ✓ | 3305 | 5 | 1 | 0 | 0.43 | 0.54 | 0.55 | 0.73 | 0.56 |
+| ev_wa | vehicles | 800×16 | ✓ | 4085 | 5 | 2 | 0 | 0.50 | 0.56 | 0.48 | 0.91 | 0.61 |

docs/assets/space_landing.png ADDED Viewed

Git LFS Details

SHA256: 144649ae9a9d4546534d4a890239d4c8fb0ea2c46f8bece9fd577f91ce1685f4
Pointer size: 130 Bytes
Size of remote file: 72 kB

docs/assets/space_results.png ADDED Viewed

Git LFS Details

SHA256: 38c350045de7113f3a71dce1db32ba305003bef2eb1210af6ca2c4fa5ec19ae5
Pointer size: 131 Bytes
Size of remote file: 368 kB

docs/paper/fig_label_curve.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70a409349a89aa8681a6b0a4f47a68405a9391fad562bfb4b5a0de9ec573ab74
+size 19327

docs/paper/fig_label_curve.png ADDED Viewed

Git LFS Details

SHA256: 1007f2590c88e79b80d24b0181c448c4dbc3f11fe69ec96be7fd5c945f8c8102
Pointer size: 130 Bytes
Size of remote file: 77.9 kB

docs/paper/fig_precision_coverage.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4d65c23406f21bcb82e8054cecc95d40ceb41cd08096726b85db5430cdae4a2
+size 19440

docs/paper/fig_precision_coverage.png ADDED Viewed

Git LFS Details

SHA256: 02ecda628a984d5a27f7e270daf435156afaae6408f8c429f0f33d26fbaa9916
Pointer size: 130 Bytes
Size of remote file: 79.5 kB

docs/paper/fig_risk_coverage.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e11e0af912e0ce66da8c3f407732d62afe4c222fbc2af8e541467d4bf5f73bce
+size 18227

docs/paper/fig_risk_coverage.png ADDED Viewed

Git LFS Details

SHA256: 9a0e46fd7f23d4224fac46a8045fc30040026bc774da0a58b4eb0f3a4ed9d6d6
Pointer size: 130 Bytes
Size of remote file: 59.2 kB

docs/paper/main.aux ADDED Viewed

	@@ -0,0 +1,59 @@

+\relax
+\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{}\protected@file@percent }
+\citation{raha}
+\citation{holoclean}
+\citation{garf}
+\citation{wrangle}
+\citation{jellyfish}
+\citation{tablegpt}
+\citation{retclean}
+\citation{turl}
+\citation{tablellama}
+\citation{belotti}
+\citation{racoon}
+\citation{mtab}
+\@writefile{toc}{\contentsline {section}{\numberline {2}Related Work}{2}{}\protected@file@percent }
+\newlabel{sec:related}{{2}{2}}
+\citation{selective}
+\citation{openmed}
+\@writefile{toc}{\contentsline {section}{\numberline {3}Method}{3}{}\protected@file@percent }
+\newlabel{sec:method}{{3}{3}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Planner / executor decomposition}{3}{}\protected@file@percent }
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Execution-verified synthetic supervision}{3}{}\protected@file@percent }
+\newlabel{sec:sft}{{3.2}{3}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Reference-grounded canonicalization with abstention}{3}{}\protected@file@percent }
+\newlabel{sec:grounding}{{3.3}{3}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.4}PII as a second task instance}{4}{}\protected@file@percent }
+\newlabel{sec:pii}{{3.4}{4}}
+\@writefile{toc}{\contentsline {section}{\numberline {4}Evaluation Design}{4}{}\protected@file@percent }
+\newlabel{sec:eval}{{4}{4}}
+\@writefile{toc}{\contentsline {section}{\numberline {5}Results}{4}{}\protected@file@percent }
+\newlabel{sec:results}{{5}{4}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Small fine-tuned planner vs.\ large generic model}{4}{}\protected@file@percent }
+\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Wide-suite comparison, 3 injection seeds, churn-neutral metric. NORTH is the double-macro harmonic mean; REAL-F1 is the real-error slice. (Filled from the final run.)}}{5}{}\protected@file@percent }
+\newlabel{tab:money}{{1}{5}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Grounding vs.\ clustering}{5}{}\protected@file@percent }
+\@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Ablations}{5}{}\protected@file@percent }
+\@writefile{toc}{\contentsline {subsection}{\numberline {5.4}Calibration of abstention}{5}{}\protected@file@percent }
+\newlabel{sec:calibration}{{5.4}{5}}
+\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Risk--coverage for grounded city reconciliation (650 probes). Operating points annotated; the confidence supports thresholded abstention.}}{6}{}\protected@file@percent }
+\newlabel{fig:rc}{{1}{6}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {5.5}PII transfer and detection}{6}{}\protected@file@percent }
+\@writefile{toc}{\contentsline {section}{\numberline {6}Limitations}{6}{}\protected@file@percent }
+\@writefile{toc}{\contentsline {section}{\numberline {7}Conclusion}{6}{}\protected@file@percent }
+\bibcite{raha}{{1}{}{{}}{{}}}
+\bibcite{holoclean}{{2}{}{{}}{{}}}
+\bibcite{garf}{{3}{}{{}}{{}}}
+\bibcite{wrangle}{{4}{}{{}}{{}}}
+\bibcite{jellyfish}{{5}{}{{}}{{}}}
+\bibcite{tablegpt}{{6}{}{{}}{{}}}
+\bibcite{retclean}{{7}{}{{}}{{}}}
+\bibcite{turl}{{8}{}{{}}{{}}}
+\bibcite{tablellama}{{9}{}{{}}{{}}}
+\bibcite{belotti}{{10}{}{{}}{{}}}
+\bibcite{racoon}{{11}{}{{}}{{}}}
+\bibcite{mtab}{{12}{}{{}}{{}}}
+\bibcite{selective}{{13}{}{{}}{{}}}
+\bibcite{openmed}{{14}{}{{}}{{}}}
+\providecommand\NAT@force@numbers{}\NAT@force@numbers
+\gdef \@abspage@last{7}

docs/paper/main.log ADDED Viewed

	@@ -0,0 +1,269 @@

+**
+(main.tex
+LaTeX2e <2021-11-15> patch level 1
+L3 programming layer <2022-02-24> (article.cls
+Document Class: article 2021/10/04 v1.4n Standard LaTeX document class
+(size11.clo
+File: size11.clo 2021/10/04 v1.4n Standard LaTeX file (size option)
+)
+\c@part=\count181
+\c@section=\count182
+\c@subsection=\count183
+\c@subsubsection=\count184
+\c@paragraph=\count185
+\c@subparagraph=\count186
+\c@figure=\count187
+\c@table=\count188
+\abovecaptionskip=\skip47
+\belowcaptionskip=\skip48
+\bibindent=\dimen138
+) (geometry.sty
+Package: geometry 2020/01/02 v5.9 Page Geometry
+ (keyval.sty
+Package: keyval 2014/10/28 v1.15 key=value parser (DPC)
+\KV@toks@=\toks16
+) (ifvtex.sty
+Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead.
+ (iftex.sty
+Package: iftex 2022/02/03 v1.0f TeX engine tests
+))
+\Gm@cnth=\count189
+\Gm@cntv=\count190
+\c@Gm@tempcnt=\count191
+\Gm@bindingoffset=\dimen139
+\Gm@wd@mp=\dimen140
+\Gm@odd@mp=\dimen141
+\Gm@even@mp=\dimen142
+\Gm@layoutwidth=\dimen143
+\Gm@layoutheight=\dimen144
+\Gm@layouthoffset=\dimen145
+\Gm@layoutvoffset=\dimen146
+\Gm@dimlist=\toks17
+) (amsmath.sty
+Package: amsmath 2021/10/15 v2.17l AMS math features
+\@mathmargin=\skip49
+For additional information on amsmath, use the `?' option.
+(amstext.sty
+Package: amstext 2021/08/26 v2.01 AMS text
+ (amsgen.sty
+File: amsgen.sty 1999/11/30 v2.0 generic functions
+\@emptytoks=\toks18
+\ex@=\dimen147
+)) (amsbsy.sty
+Package: amsbsy 1999/11/29 v1.2d Bold Symbols
+\pmbraise@=\dimen148
+) (amsopn.sty
+Package: amsopn 2021/08/26 v2.02 operator names
+)
+\inf@bad=\count192
+LaTeX Info: Redefining \frac on input line 234.
+\uproot@=\count193
+\leftroot@=\count194
+LaTeX Info: Redefining \overline on input line 399.
+\classnum@=\count195
+\DOTSCASE@=\count196
+LaTeX Info: Redefining \ldots on input line 496.
+LaTeX Info: Redefining \dots on input line 499.
+LaTeX Info: Redefining \cdots on input line 620.
+\Mathstrutbox@=\box50
+\strutbox@=\box51
+\big@size=\dimen149
+LaTeX Font Info:    Redeclaring font encoding OML on input line 743.
+LaTeX Font Info:    Redeclaring font encoding OMS on input line 744.
+\macc@depth=\count197
+\c@MaxMatrixCols=\count198
+\dotsspace@=\muskip16
+\c@parentequation=\count199
+\dspbrk@lvl=\count266
+\tag@help=\toks19
+\row@=\count267
+\column@=\count268
+\maxfields@=\count269
+\andhelp@=\toks20
+\eqnshift@=\dimen150
+\alignsep@=\dimen151
+\tagshift@=\dimen152
+\tagwidth@=\dimen153
+\totwidth@=\dimen154
+\lineht@=\dimen155
+\@envbody=\toks21
+\multlinegap=\skip50
+\multlinetaggap=\skip51
+\mathdisplay@stack=\toks22
+LaTeX Info: Redefining \[ on input line 2938.
+LaTeX Info: Redefining \] on input line 2939.
+) (amssymb.sty
+Package: amssymb 2013/01/14 v3.01 AMS font symbols
+(amsfonts.sty
+Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support
+\symAMSa=\mathgroup4
+\symAMSb=\mathgroup5
+LaTeX Font Info:    Redeclaring math symbol \hbar on input line 98.
+LaTeX Font Info:    Overwriting math alphabet `\mathfrak' in version `bold'
+(Font)                  U/euf/m/n --> U/euf/b/n on input line 106.
+)) (booktabs.sty
+Package: booktabs 2020/01/12 v1.61803398 Publication quality tables
+\heavyrulewidth=\dimen156
+\lightrulewidth=\dimen157
+\cmidrulewidth=\dimen158
+\belowrulesep=\dimen159
+\belowbottomsep=\dimen160
+\aboverulesep=\dimen161
+\abovetopsep=\dimen162
+\cmidrulesep=\dimen163
+\cmidrulekern=\dimen164
+\defaultaddspace=\dimen165
+\@cmidla=\count270
+\@cmidlb=\count271
+\@aboverulesep=\dimen166
+\@belowrulesep=\dimen167
+\@thisruleclass=\count272
+\@lastruleclass=\count273
+\@thisrulewidth=\dimen168
+) (graphicx.sty
+Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR)
+ (graphics.sty
+Package: graphics 2021/03/04 v1.4d Standard LaTeX Graphics (DPC,SPQR)
+ (trig.sty
+Package: trig 2021/08/11 v1.11 sin cos tan (DPC)
+)
+(graphics.cfg
+File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration
+)
+Package graphics Info: Driver file: xetex.def on input line 107.
+ (xetex.def
+File: xetex.def 2021/03/18 v5.0k Graphics/color driver for xetex
+))
+\Gin@req@height=\dimen169
+\Gin@req@width=\dimen170
+) (url.sty
+\Urlmuskip=\muskip17
+Package: url 2013/09/16  ver 3.4  Verb mode for urls, etc.
+) (xcolor.sty
+Package: xcolor 2021/10/31 v2.13 LaTeX color extensions (UK)
+ (color.cfg
+File: color.cfg 2016/01/02 v1.6 sample color configuration
+)
+Package xcolor Info: Driver file: xetex.def on input line 227.
+Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1352.
+Package xcolor Info: Model `RGB' extended on input line 1368.
+Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1370.
+Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1371.
+Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1372.
+Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1373.
+Package xcolor Info: Model `Gray' substituted by `gray' on input line 1374.
+Package xcolor Info: Model `wave' substituted by `hsb' on input line 1375.
+) (natbib.sty
+Package: natbib 2010/09/13 8.31b (PWD, AO)
+\bibhang=\skip52
+\bibsep=\skip53
+LaTeX Info: Redefining \cite on input line 694.
+\c@NAT@ctr=\count274
+)
+(numbers) (l3backend-xetex.def
+File: l3backend-xetex.def 2022-02-07 L3 backend support: XeTeX
+\c__kernel_sys_dvipdfmx_version_int=\count275
+\l__color_backend_stack_int=\count276
+\g__color_backend_stack_int=\count277
+\g__graphics_track_int=\count278
+\l__pdf_internal_box=\box52
+\g__pdf_backend_object_int=\count279
+\g__pdf_backend_annotation_int=\count280
+\g__pdf_backend_link_int=\count281
+) (main.aux)
+\openout1 = `main.aux'.
+LaTeX Font Info:    Checking defaults for OML/cmm/m/it on input line 16.
+LaTeX Font Info:    ... okay on input line 16.
+LaTeX Font Info:    Checking defaults for OMS/cmsy/m/n on input line 16.
+LaTeX Font Info:    ... okay on input line 16.
+LaTeX Font Info:    Checking defaults for OT1/cmr/m/n on input line 16.
+LaTeX Font Info:    ... okay on input line 16.
+LaTeX Font Info:    Checking defaults for T1/cmr/m/n on input line 16.
+LaTeX Font Info:    ... okay on input line 16.
+LaTeX Font Info:    Checking defaults for TS1/cmr/m/n on input line 16.
+LaTeX Font Info:    Trying to load font information for TS1+cmr on input line 1
+6.
+ (ts1cmr.fd
+File: ts1cmr.fd 2019/12/16 v2.5j Standard LaTeX font definitions
+)
+LaTeX Font Info:    ... okay on input line 16.
+LaTeX Font Info:    Checking defaults for TU/lmr/m/n on input line 16.
+LaTeX Font Info:    ... okay on input line 16.
+LaTeX Font Info:    Checking defaults for OMX/cmex/m/n on input line 16.
+LaTeX Font Info:    ... okay on input line 16.
+LaTeX Font Info:    Checking defaults for U/cmr/m/n on input line 16.
+LaTeX Font Info:    ... okay on input line 16.
+*geometry* driver: auto-detecting
+*geometry* detected driver: xetex
+*geometry* verbose mode - [ preamble ] result:
+* driver: xetex
+* paper: <default>
+* layout: <same size as paper>
+* layoutoffset:(h,v)=(0.0pt,0.0pt)
+* modes:
+* h-part:(L,W,R)=(72.26999pt, 469.75502pt, 72.26999pt)
+* v-part:(T,H,B)=(72.26999pt, 650.43001pt, 72.26999pt)
+* \paperwidth=614.295pt
+* \paperheight=794.96999pt
+* \textwidth=469.75502pt
+* \textheight=650.43001pt
+* \oddsidemargin=0.0pt
+* \evensidemargin=0.0pt
+* \topmargin=-37.0pt
+* \headheight=12.0pt
+* \headsep=25.0pt
+* \topskip=11.0pt
+* \footskip=30.0pt
+* \marginparwidth=59.0pt
+* \marginparsep=10.0pt
+* \columnsep=10.0pt
+* \skip\footins=10.0pt plus 4.0pt minus 2.0pt
+* \hoffset=0.0pt
+* \voffset=0.0pt
+* \mag=1000
+* \@twocolumnfalse
+* \@twosidefalse
+* \@mparswitchfalse
+* \@reversemarginfalse
+* (1in=72.27pt=25.4mm, 1cm=28.453pt)
+LaTeX Font Info:    Trying to load font information for U+msa on input line 17.
+(umsa.fd
+File: umsa.fd 2013/01/14 v3.01 AMS symbols A
+)
+LaTeX Font Info:    Trying to load font information for U+msb on input line 17.
+ (umsb.fd
+File: umsb.fd 2013/01/14 v3.01 AMS symbols B
+) [1
+]
+LaTeX Font Warning: Font shape `TU/lmr/bx/sc' undefined
+(Font)              using `TU/lmr/bx/n' instead on input line 92.
+[2] [3] [4] [5] [6]
+File: fig_precision_coverage.pdf Graphic file (type pdf)
+<use fig_precision_coverage.pdf>
+ [7] [8] [9] [10] [11]
+File: fig_label_curve.pdf Graphic file (type pdf)
+<use fig_label_curve.pdf>
+ [12]
+File: fig_risk_coverage.pdf Graphic file (type pdf)
+<use fig_risk_coverage.pdf>
+ [13]
+Underfull \hbox (badness 10000) in paragraph at lines 842--852
+\TU/lmr/m/n/10.95 The model weights are public: $[][][][][] [] [] [] [][][][][]
+[][][][][][] [] [][] [] [][][][][][][][][] []
+ []
+[14] [15] [16] [17] (main.aux)
+LaTeX Font Warning: Some font shapes were not available, defaults substituted.
+ )
+Output written on main.xdv (17 pages, 553908 bytes).

docs/paper/main.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:afcb564ae43329b0a7174b676446fe1204146968d9ed9a22426ab82454039e70
+size 201091

docs/paper/main.tex ADDED Viewed

	@@ -0,0 +1,1021 @@

+\documentclass[11pt]{article}
+\usepackage[utf8]{inputenc} % no-op on TeXLive >= 2018 (arXiv pdflatex); explicit for safety
+\usepackage[margin=1in]{geometry}
+\usepackage{amsmath,amssymb}
+\usepackage{booktabs}
+\usepackage{graphicx}
+\usepackage{url}
+\usepackage{xcolor}
+\usepackage[numbers]{natbib}
+\input{numbers}
+\title{Verified Cleaning Plans: Plan-Level Selective Prediction Turns Local LLM
+Planners into Trustworthy Table Cleaners}
+\author{Ricardo Alanis\\ \small{\texttt{ricardo.alanis@gmail.com}}}
+\date{June 2026}
+\begin{document}
+\maketitle
+\begin{abstract}
+Cleaning messy tabular data---particularly \emph{canonicalization}, the merging of
+inconsistent surface forms such as \texttt{USA}/\texttt{U.S.A}/\texttt{united states}
+into one canonical value---resists rule-based automation and is routinely done by hand.
+We present ScrubData, an architecture built around a trust contract. A local LLM
+\emph{planner} reads an aggregated column profile (per-value frequency counts,
+invariant to row count) and \emph{proposes} a structured JSON cleaning plan; a
+deterministic executor \emph{applies} it, making every change auditable and reversible;
+and \emph{plan-level selective prediction} --- a deterministic verifier that scores
+every proposed mapping and drops low-confidence entries to review flags --- extends
+abstention from cell-level confidence to the plan itself. The verified union of the
+gated model plan with a reference-grounded heuristic is the architecture's operating
+point: a zero-configuration, zero-label system that repairs 41\% of the hospital
+benchmark's 509 real errors at \unionGatePrec{} precision (strongest of three training
+seeds; 3-seed mean \unionGateThreeSeedPrec{} at \unionGateThreeSeedCov{} coverage,
+$\pm$ = 95\% CI),
+with every declined merge surfaced for review. Four deterministic capabilities ---
+profile-level \emph{suspect surfacing} for high-cardinality columns, reconciliation
+against a pluggable \emph{entity reference} built from open vocabularies,
+\emph{cross-row majority voting} over repeated-entity groups, and
+\emph{convention-conservatism} gates --- carry the system to never-seen tables:
+macro F1 \unseenMacroF{} at \unseenMacroDamage{} damage over the 35 unseen-source
+pairs of a \nPairs-pair benchmark, and \emph{zero silent edits} across \nWild{} wild
+tables plus a \nTrust-table trust audit, released together as the \textsc{WildClean}
+benchmark.
+Finally, we report where the capability lives. \emph{Execution-verified} synthetic
+supervision --- a training example is kept only if executing its plan provably
+recovers the known-clean table --- buys the 4B fine-tune real in-distribution skill
+and the most precise gated planner at usable coverage (\modelGatePrec{} precision at
+\modelGateCov{} coverage); but five further retrains and a three-arm GRPO pilot leave
+held-out generalization statistically bounded (TOST against a pre-registered margin),
+while two of three zero-shot 24--31B open-weights planners (devstral-24B, gemma4-31B)
+dropped into the \emph{identical} harness exceed the fine-tune's operating point
+(\scalePrecBig{} precision at \scaleCovBig{} coverage) with no task training. The
+architecture is planner-agnostic: it converts capability gains into trustworthy
+operating points without retraining. The shipped system runs entirely locally on commodity hardware;
+no data leaves the machine (the scaling-arm planners were measured via hosted
+endpoints; all are locally deployable open weights).
+\end{abstract}
+\section{Introduction}
+A large share of practical data work is cleaning: a sales export where the same country
+is spelled four ways, a hospital roster where \texttt{birminghxm} should be
+\texttt{birmingham}, a CRM dump with mixed date formats and duplicated contacts. The
+fuzzy half of this work---recognizing that distinct surface forms denote the same
+entity---is exactly what rules do poorly and humans do slowly.
+Large language models can do this fuzzy matching, but deploying them as cell editors has
+three problems. First, \emph{trust}: a model that edits cells directly can silently
+corrupt data, and its errors are unauditable. Second, \emph{cost and privacy}: shipping
+every row of a private table to a hosted frontier model is expensive and often
+unacceptable. Third, \emph{hallucination}: asked for a canonical form, a generative model
+will invent one, and on tail entities it will invent wrong ones.
+ScrubData addresses all three with an architecture in which the model never touches
+data. A profiler aggregates each column into a value-frequency distribution; a small
+local model reads the profile and \emph{proposes} a JSON cleaning plan; a deterministic
+pandas executor \emph{applies} it. The plan is the complete, inspectable, reversible
+specification of every change---there are no silent edits by construction
+(\S\ref{sec:method}). Because the prompt scales with the number of \emph{distinct}
+values rather than rows, a million-row table profiles like a hundred-row one.
+This paper makes five contributions:
+\begin{enumerate}
+\item \textbf{A planner/executor decomposition with plan-level selective prediction}:
+  the model proposes, a deterministic engine executes with full lineage, and a
+  deterministic verifier gates every proposed mapping, extending abstention to the plan
+  itself. The verified union of the gated model plan with a reference-grounded
+  heuristic repairs 41\% of hospital's \hospErrors{} real errors at \unionGatePrec{}
+  precision with zero configuration and zero labels (\S\ref{sec:method},
+  \S\ref{sec:verifier}, \S\ref{sec:ws1results}).
+\item \textbf{\textsc{WildClean} and an un-gameable evaluation}: a 65-dataset suite
+  (real-error benchmarks plus seeded error injection over 15 harvested open-data
+  domains) scored with a churn-neutral, convention-tolerant metric that cannot be
+  inflated by mass rewriting, with damage and silent edits scored alongside repair F1,
+  degenerate baselines pinning the metric's floor and ceiling, and the scorer itself
+  validated against 30 adversarial known-by-construction cases (\S\ref{sec:eval},
+  \S\ref{sec:degenerate}).
+\item \textbf{Four deterministic capabilities that carry never-seen-table
+  generalization}: bounded suspect surfacing for high-cardinality columns, generic
+  entity-reference reconciliation with an exact-hit typing floor, cross-row majority
+  voting with a false-consensus guard, and convention-conservatism gates --- each
+  motivated by a measured failure regime and gated by the verifier
+  (\S\ref{sec:capabilities}, \S\ref{sec:wild}).
+\item \textbf{Execution-verified synthetic supervision}, the training method behind
+  the 4B planner instantiation: every training example is validated by running the
+  executor on the (dirty table, plan) pair and checking that the known-clean table is
+  recovered; non-recovering examples are discarded (\S\ref{sec:sft}).
+\item \textbf{A unified finding on where capability lives in this architecture}: five
+  further supervised fine-tunes and a three-arm GRPO pilot with the executor as a
+  verifiable reward leave held-out generalization statistically bounded (TOST), while
+  two of three zero-shot 24--31B planners dropped into the same harness exceed the
+  fine-tune's operating point --- deterministic machinery plus plan-level verification carry the
+  generalization that exists, and raw planner capability, not task fine-tuning, scales
+  it (\S\ref{sec:negative}, \S\ref{sec:scaling}).
+\end{enumerate}
+We deliberately report a negative-flavored finding alongside the positive ones: on
+\emph{injected} typos, classical frequency clustering remains a strong baseline---by
+construction, injection places the canonical form in the column, which is clustering's
+ideal regime. The advantage of grounding is concentrated where it matters: real errors,
+tail entities absent from the column, and adversarial near-misses where acting at all is
+wrong (\S\ref{sec:results}).
+\section{Related Work}
+\label{sec:related}
+\textbf{Error detection and repair.} Raha and Baran~\cite{raha} established
+configuration-free error detection and correction benchmarks (hospital, beers, flights,
+rayyan), which we adopt as out-of-distribution evaluation. HoloClean~\cite{holoclean}
+combines integrity constraints, external reference data, and statistics in probabilistic
+repair, demonstrating that external signals can veto statistically plausible but wrong
+repairs---an insight our reference-veto inherits. GARF~\cite{garf} learns repair rules
+self-supervised from the data itself; it also demonstrates the structural limit we
+observe for frequency-only methods: a lone categorical column offers no co-occurring
+signal to vote against an error.
+\textbf{The 2025--26 landscape.} Post-Cocoon work concentrates on zero-label
+\emph{detection}: ZeroED~\cite{zeroed} (cloud-LLM cluster labeling, hospital
+detection F1 0.81, collapsing to 0.27 on smaller models), ForestED~\cite{forested}
+(LLM-induced decision trees, 0.756), and Auto-Test~\cite{autotest} (corpus-mined
+semantic-domain constraints, no LLM at inference) --- none performs zero-label
+\emph{repair}. GIDCL~\cite{gidcl} sets the labeled-class repair ceiling
+(hospital \gidclHosp{} with 20 labels and a LoRA trained per cleaned table);
+Cocoon~\cite{cocoon} remains an unreproduced preprint (15 citing papers, none a
+reproduction). Two concurrent results corroborate facets of this paper's central
+negative finding that machinery, not weights, carries cleaning generalization: a
+study showing even frontier models cannot correct table distortions without
+explicit priors~\cite{distort}, and a large multi-agent-debate evaluation in which
+LLM self-critique \emph{degrades} repair and only an adversarially separate,
+execution-grounded critic helps~\cite{debate} --- the architecture our verifier
+instantiates. Spreadsheet-RL~\cite{spreadsheetrl} reports the complementary
+positive case: with full-scale RL infrastructure and execution-verified rewards,
+a 4B model's spreadsheet-manipulation skill \emph{does} move (12.0\%
+$\rightarrow$ 23.4\%) --- consistent with our reading that the gap between our
+\$30 pilot and such results is infrastructure scale, a boundary we state rather
+than blur (\S\ref{sec:negative}).
+\textbf{LLMs for data wrangling.} Narayan et al.~\cite{wrangle} showed frontier
+foundation models handle entity matching and imputation few-shot;
+Jellyfish~\cite{jellyfish} and Table-GPT~\cite{tablegpt} fine-tune mid-size models for
+data tasks. RetClean~\cite{retclean} is closest in spirit: retrieval from data lakes
+grounds cell repair, with the key empirical split that parametric knowledge suffices on
+world-known head values but collapses on the tail---motivating retrieval. Our work
+differs in the planner/executor decomposition (the model emits no cell values, only
+plans), in execution-verified supervision, and in the calibrated-abstention contract.
+\textbf{Entity linking over tables.} TURL~\cite{turl} and TableLlama~\cite{tablellama}
+inject candidate entities into table understanding; Belotti et al.~\cite{belotti}
+show retriever coverage is the accuracy ceiling for table entity disambiguation and that
+long candidate lists hurt smaller models. RACOON~\cite{racoon} shows inference-time KG
+retrieval lifts a frozen model substantially, supporting our choice to ground at
+inference rather than bake aliases into weights (TURL's out-of-domain collapse is the
+cautionary result). MTab~\cite{mtab} established type-constrained matching with
+abstention in semantic table annotation.
+\textbf{Clustering-based cleaning tools.} The de-facto practitioner baseline is
+OpenRefine: key-collision (fingerprint) clustering plus a nearest-neighbour mode; we
+reimplement both faithfully, including blocking, and compare head-to-head.
+\textbf{Selective prediction.} Risk--coverage analysis and calibration
+metrics~\cite{selective} formalize ``knowing when not to act''; to our knowledge their
+application to data-cleaning merge decisions is new.
+\textbf{Small specialized models.} OpenMed~\cite{openmed} fine-tunes sub-500M encoders
+to state-of-the-art biomedical NER, the sister result to our thesis that small
+specialized models beat large generic ones on narrow structured tasks; we adopt their
+released PII token classifiers for column typing (\S\ref{sec:pii}).
+\section{Method}
+\label{sec:method}
+\subsection{Planner / executor decomposition}
+A \emph{profiler} reduces each column to a typed summary: detected semantic type, missing
+counts, issue flags, and a value--frequency distribution capped at 80 distinct values
+(high-cardinality columns are summarized by their head). The \emph{planner}---either a
+deterministic heuristic or our fine-tuned 4B model---maps the profile (plus three sample
+rows) to a JSON plan: a list of per-column operations drawn from a closed vocabulary
+(\texttt{canonicalize\_categories} with an explicit mapping, \texttt{parse\_date},
+\texttt{standardize\_phone}, \texttt{mask\_pii}, \ldots), table operations, and review
+flags. The \emph{executor} applies the plan with pure pandas transforms. The plan is the
+only channel through which data changes: every diff is attributable to a named operation
+with a rationale, the original table is never mutated, and abstentions are first-class
+plan objects. We export per-run decision summaries as OpenTelemetry GenAI spans.
+\subsection{Execution-verified synthetic supervision}
+\label{sec:sft}
+Training pairs are generated by corrupting clean synthetic tables with realistic noise
+(casing, aliases, single-character typos with Zipf-distributed long-tail categorical
+columns of 30--80 distinct values) while recording the ground-truth plan. The defining
+step is \emph{verification by execution}: a candidate example is kept only if
+$\textsc{Execute}(\text{dirty}, \text{plan}) = \text{clean}$ cell-for-cell. This closes
+the loop between supervision and semantics---a plan that would not actually clean the
+table can never become a training label. We augment with real supervision derived from
+paired dirty/clean benchmarks by aligning cells and keeping only \emph{learnable}
+canonicalizations (a surface form that is a string variant of its target and never a
+legitimate value elsewhere), which excludes unlearnable per-cell corrections such as
+divergent flight times. The fine-tune is QLoRA (rank 32) over Qwen3-4B-Instruct in
+bf16; one practical finding is that the base model's tool-calling prior dominates
+free-running generation even after convergent fine-tuning (loss 0.16) and must be
+suppressed at decode time by banning the two tool-call tokens.
+\subsection{Reference-grounded canonicalization with abstention}
+\label{sec:grounding}
+For columns whose values reconcile to a known concept type (countries, administrative
+regions, cities), canonical forms are never generated: a fuzzy retriever (normalized
+edit similarity with first-character blocking and length prefilters) matches each
+distinct value against the type-scoped reference (ISO/pycountry; GeoNames cities500,
+196k entries). A value maps to a canonical only if (i) similarity clears a threshold
+$\tau{=}0.84$, (ii) the best--second-best margin clears $0.03$ (ambiguity veto: a value
+equally close to \texttt{Box} and \texttt{Boaz} abstains), and (iii) the canonical is
+cast to the column's observed case convention. Near-misses ($0.70{\le}s{<}\tau$) are
+surfaced as review flags. The same wrapper grounds the \emph{model} planner: for
+reference-typed columns the model's free-generated mapping is replaced by the grounded
+one, so the model can add coverage but never invent a canonical for a grounded type.
+\subsection{Plan-level selective prediction: the verified union planner}
+\label{sec:verifier}
+Grounding constrains reference-typed columns, but the planner's \emph{free}
+canonicalization mappings on non-grounded columns remain unguarded---and they are where
+real-data precision dies (the fine-tune's raw hospital plan: \hospModelPrecVSix{}
+precision at \hospModelRecallVSix{} recall). Rather than retrain, we extend abstention
+to the plan itself. A deterministic \emph{verifier} scores every proposed mapping entry
+$raw{\to}canon$ with contract-preserving evidence (no cell values emitted, no gold
+access): three hard gates distilled from the model's measured failure classes---a value
+occurring ${\ge}3$ times is data, not a typo (\emph{errors are rare}); the target must
+be a frequent column value clearly dominating the source (no mapping one typo onto
+another); digit-bearing codes repair only when the letter part is near-identical---then
+a confidence combining edit similarity with frequency support. Entries below a
+threshold $\tau$ are dropped to review flags; abstention stays first-class. Sweeping
+$\tau$ yields a plan-level precision--coverage curve. The shipped composition,
+the \emph{verified union planner}, is the verifier-gated model plan ($\tau{=}0.5$)
+unioned with the grounded heuristic's mappings (the model wins per surface form);
+the same code path is the product default.
+\subsection{Visibility and consensus: four deterministic capabilities}
+\label{sec:capabilities}
+Four further mechanisms, each motivated by a measured failure regime on never-seen
+tables, complete the deterministic machinery. \textbf{(a) Suspect surfacing.} The
+profile's value-frequency view is capped, so high-cardinality columns hide their
+dirty cells from any planner. Every column profile now carries a bounded
+\texttt{suspect\_values} section: rare anomalous surfaces with evidence-backed
+repair candidates (frequency dominance, edit similarity, reference membership).
+The heuristic planner repairs from suspects under a strict verifier bar
+($\tau_{hc}{=}0.8$) and flags the rest. \textbf{(b) Generic entity reference.}
+Open vocabularies (SemTab ToughTables aliases --- derived excluding our benchmark
+tables; MusicBrainz search-hint misspellings; RxNorm; Wikidata; ROR) register as a
+pluggable reference type. Because the reference is broad, entity-typing a column
+additionally requires that ${\ge}20\%$ of its distinct values match the reference
+\emph{exactly} --- fuzzy coverage alone over-fires on name-like columns (measured).
+This resolves the regime where every surface in a column is unique (no in-column
+frequency signal exists at all): five such benchmark tables go from 0.0 to
+\ttFOne{} F1 at \emph{zero} damage. \textbf{(c) Cross-row majority voting.} Tables
+that repeat a real-world entity across rows (a flight reported by many sources)
+carry their own repair signal. A detection step finds compact-token key columns
+with small groups (median multiplicity 3--30) and columns whose groups show
+\emph{majority-bearing} disagreement with per-group information; a table-level
+operation then resolves thin dissenting minorities to the group majority. A
+\emph{false-consensus} guard declines when minority shares look like legitimate
+correlated updates rather than reporting errors (mean minority share ${\ge}0.25$)
+--- a flat volume cap was measured to destroy the legitimate dense-disagreement
+regime and replaced. \textbf{(d) Convention conservatism.} The planner never
+re-formats an internally consistent column: date and percent ops are gated on
+dominant-shape inconsistency (digit and alpha runs collapsed; 90\% rule),
+ZIP/postal-named columns are never typed as phones or dates, and Excel-serial
+date typing requires a date-suggestive column name. Suppressed minority values
+surface as review flags --- abstention is visible, never silent. The verifier
+enforces the same gates on model-emitted plans at the verification boundary.
+\subsection{PII as a second task instance}
+\label{sec:pii}
+The identical contract covers PII: a deterministic tier types columns by checksum and
+pattern validators (Luhn, IBAN mod-97, SSN/email/phone) over distinct values; an
+optional 44M OpenMed-PII token classifier~\cite{openmed} extends coverage to names and
+addresses, gated by a sensitive-type allowlist and a column-level coverage vote; and
+masking, salted hashing, and join-stable pseudonymization are deterministic executor
+operations. Measured briefly: the classifier, though trained on sentence-level
+clinical text, transfers to bare cell values --- \piiNameBare{} detection on
+person-name cells and \piiAddrBare{} on address cells ($n{=}40$ sampled cells each);
+the validator tier, evaluated out-of-distribution on per-type columns from the Gretel
+PII test split, types 5/5 covered PII types correctly with 0/7 false positives on
+negative columns drawn from real open data; and after deterministic masking,
+re-running all validators over the output finds \piiLeakRate{} residual PII ---
+residual PII \emph{detectable by our validators}, a circularity we note explicitly:
+the leak test can only see what the validator tier sees.
+\section{Evaluation Design}
+\label{sec:eval}
+\textbf{Suite.} Five real-error benchmarks (Raha) plus seeded error injection
+(typo/OCR/case/whitespace) over 15 harvested open-data domains (NYC, Chicago, SF, LA,
+Seattle, Texas, WA portals; GitHub) $\approx$ 65 datasets per seed. We aggregate as a
+\emph{double macro}---mean over error types of mean over datasets, harmonically combined
+with the domain macro---so no single table or error type dominates:
+\begin{equation*}
+\textsc{north} \;=\; \operatorname{HM}\Biggl(
+\underbrace{\frac{1}{|T|}\sum_{t \in T}\frac{1}{|D_t|}\sum_{d \in D_t} F_1(d)}_{\text{error-type macro}},\;
+\underbrace{\frac{1}{|G|}\sum_{g \in G}\frac{1}{|D_g|}\sum_{d \in D_g} F_1(d)}_{\text{domain macro}}
+\Biggr),
+\end{equation*}
+where $T$ is the set of error types, $G$ the set of data domains, $D_t$ (resp.\ $D_g$)
+the datasets carrying error type $t$ (domain $g$), and $\operatorname{HM}$ the harmonic
+mean.
+\textbf{Churn-neutral metric.} A cell change that is case/whitespace-equivalent to the
+input but does not restore the gold counts as nothing: not a fix, not a change, not
+damage. Without this, mass case-rewriting inflates precision (we observed $+0.12$
+NORTH from \emph{removing} case matching before the correction); with it, fixing a
+case-injected error requires actually acting. We additionally report
+\emph{damage}---the rate of semantically corrupting clean cells---and an adversarial
+\emph{abstain slice} whose traps are garbage strings (not single-edit variants of any
+reference entity; an earlier trap set mis-scored grounding for correctly mapping
+\texttt{Boazz}$\to$\texttt{Boaz}). We report both repairs of these metric artifacts as
+evidence that gameability must be tested, not assumed.
+\textbf{Real vs.\ injected.} Injected typos are in-distribution for frequency
+clustering by construction (the canonical is present and dominant in the column), so we
+report the real-error and injected slices separately. A TableEG-style audit
+quantifies the gap (\texttt{eval/inject\_validity.py}): the injector covers three
+of nine error classes (Jensen--Shannon divergence 0.526 bits from the pooled real
+distribution over 163{,}607 real errors), and injected-only evaluation would
+invert the fingerprint-clustering ranking --- exactly the overstatement the
+separate-slice reporting prevents.
+\textbf{Scorer validation.} Following GroUSE-style evaluator
+testing~\cite{grouse}, the scorer itself is validated against 30 adversarial
+known-by-construction cases: a no-op plan must score 0 fixes and 0 damage, an
+oracle plan exactly 1.0, vandalizing $k$ of $m$ clean cells must score damage
+$k/m$ at precision 0, pure churn (case/whitespace rewrites that do not restore
+gold) must count as nothing although a naive scorer would count it, fixes must
+require actually acting, and silent edits must trip the audit. All 30 pass
+against the shipped scorer unmodified. We additionally cross-score every system
+under the \emph{original} Raha/Baran cell-repair protocol side by side with ours
+(\texttt{eval/cross\_scoring.py}): rankings agree at Kendall $\tau_b{=}1.0$ on
+three of five datasets, and the disagreements cut both ways --- raw string
+equality denies credit for numerically-correct serialization restorations (our
+movies\_1 repairs), while churn-neutrality charges Baran for load-time
+normalizer rewrites its own protocol hides (hospital precision
+$0.908\!\to\!0.783$). Neither metric family flatters us uniformly, and our Baran
+reproduction calibrates against its published Table~3 within $+0.02$ on three of
+the four shared datasets.
+\textbf{Contamination.} The Raha-suite benchmarks have been public on GitHub since
+2019 and sit inside every modern base model's training window; we treat them as
+potentially contaminated and split our claims accordingly. A verbatim-completion
+probe makes the concern concrete: prompted with five fields of a gold hospital row,
+a frontier-class model reproduces \textbf{25\%} of the held-out cells exactly
+(30/120 cells over 30 rows, exact-substring match), versus \textbf{0\%} (0/120) on a
+date-stamped post-training-cutoff wild harvest under the identical protocol
+(\texttt{eval/contamination\_probe.py}). The rate is an upper bound on memorization
+--- some completions are guessable from the given fields --- but it is not zero, so
+results on legacy-public benchmarks (including the all-hospital
+Table~\ref{tab:scaling}, whose zero-shot planners may partially benefit from
+memorized gold) carry this caveat, while the architecture's trust claims
+(zero silent edits, damage accounting, abstention) rest on the date-stamped wild
+and GitTables slices, where the probe finds nothing to complete.
+\section{Results}
+\label{sec:results}
+\subsection{Plan-level selective prediction on real errors}
+\label{sec:ws1results}
+On hospital's \hospErrors{} real errors, the verifier transforms the fine-tune from
+unshippable to precise (Figure~\ref{fig:pc}): the raw model plan repairs
+\hospModelRecallVSix{} of errors at \hospModelPrecVSix{} precision; gated at $\tau{=}0.5$ it
+reaches \modelGatePrec{} precision at \modelGateCov{} coverage (146 of 147 committed
+changes correct). The union with the grounded heuristic buys coverage back:
+\textbf{\unionGatePrec{} precision at \unionGateCov{} coverage} (\unionChanged{}
+changes, \unionFixed{} correct). This turns the system's promise into a measured
+sentence: \emph{zero-configuration and zero labels, repair 41\% of real errors at
+${\ge}0.90$ precision, with every declined merge surfaced for review}. For context,
+Baran given oracle error positions and 20 gold-labeled tuples per dataset reaches
+\realFBaran{} F1 on the same slice (\S\ref{sec:ws4})---selective prediction does not
+close a supervised gap, but it makes the zero-label operating point trustworthy, which
+is the regime our user occupies. Precision is flat ($0.89$--$0.91$) for
+$\tau\in[0.2,0.8]$, so the operating point is not threshold-brittle, and the result is
+seed-robust: across three training seeds of the same data recipe the union operating
+point is \unionGateThreeSeedPrec{} precision at \unionGateThreeSeedCov{} coverage
+(the shipped adapter is the strongest seed), with every seed clearing the
+$0.70$-precision/$0.30$-coverage bar decisively. All 3-seed intervals in this paper
+are normal-approximation 95\% CIs ($1.96\,\sigma/\sqrt{3}$); the $t$-based
+interval at $n{=}3$ is ${\sim}2.7\times$ wider ($\pm 0.031$ here) and every
+qualitative claim survives it --- the weakest seed alone clears the bar.
+\textbf{Candidate-constrained planning (negative result).} We also tested constraining
+the planner's \emph{inputs}: the profiler emits evidence-backed (variant$\,\to\,$
+canonical) candidate pairs (frequency dominance, edit similarity, reference membership)
+and the model may only select among them, with a deterministic check dropping
+off-candidate mappings to review flags. As a standalone guard it is strong---the raw
+plan's precision rises from \hospModelPrecVSix{} to \pairsRawPrec{} with no verifier at
+all---but composed with the verifier and union it reaches \pairsUnionPrec{} precision
+at \pairsUnionCov{} coverage, slightly \emph{below} the unconstrained pipeline at the
+same $\tau$: the candidate cap (top-3 per surface) removes some correct repairs the
+verifier would have kept, and the two mechanisms gate the same failure class. We ship
+the verifier and keep candidate constraining available but off by default, reporting
+this as a measured redundancy rather than a stacked win.
+\begin{figure}[t]
+\centering
+\includegraphics[width=0.62\linewidth]{fig_precision_coverage}
+\caption{Plan-level precision--coverage on hospital (509 real errors), sweeping the
+verifier threshold $\tau$. The union planner dominates the raw model plan; the shipped
+operating point ($\tau{=}0.5$) is annotated.}
+\label{fig:pc}
+\end{figure}
+\subsection{The 4B fine-tune as one planner instantiation}
+On frozen synthetic gold, the fine-tuned 4B planner reaches canonicalization micro-F1
+\canonFMultiSeed{} --- versus \canonFBig{} for a much larger zero-shot generalist
+prompted identically and \canonFHeur{} for the rule heuristic (best single run
+\canonFOursBest; operation-F1 \opFOurs, JSON validity \jsonValidOurs). On real hospital
+typos the synthetic-only fine-tune scores 0.000 repair recall; adding 20\%
+real-derived supervision lifts it to \hospModelRecall, and a data-scaling iteration
+(tripling the real-derived share from three paired benchmarks) reaches
+\hospModelRecallVSix{} recall at \hospModelPrecVSix{} precision---approaching the
+\frontierZeroShotRecall{} of a frontier-scale zero-shot model. The scaling gain is seed-robust: $+0.09$
+canonicalization F1 over the base mix under identical protocol, with non-overlapping
+3-seed confidence intervals. Real, execution-verified pairs are what transfer:
+the same iteration found frequency-derived and algorithm-cleaned labels both
+\emph{reduce} quality, consistent with our grounding thesis.
+\subsection{Grounding vs.\ clustering}
+With the errors-are-rare frequency gates now in both paths, grounding and frequency
+clustering are comparable on hospital alone (repairs-only, churn-neutral:
+\hospPrecGrounded{} precision at \hospRecallGrounded{} recall grounded vs
+\hospPrecFreq{} at \hospRecallFreq{} clustering---hospital's dominant errors are
+in-column typos, clustering's best case). Grounding's margin appears where references
+matter: across the five-benchmark real-error macro it reaches \ablFullRealF{} versus
+\ablNoGroundRealF{} for the frequency-clustering ablation ($+29\%$), and it carries
+the behavioral guarantees below.
+On the full suite against OpenRefine (Table~\ref{tab:money}), the result splits
+cleanly by regime, and we report both. On the \emph{real-error} slice---the regime the
+tool exists for---grounded cleaning reaches REAL-F1 \realFGrounded{}, $3.9\times$
+OpenRefine kNN (\realFORKnn) and $5.7\times$ fingerprint (\realFORFp), with seed CIs of
+$\pm$\northGroundedCI. Provenance: the grounded and OpenRefine rows of
+Table~\ref{tab:money} are regenerated at the current system head (2026-06-12,
+post-capability, scorer fix in); the dagger rows keep their original capture
+provenance. The June-10 freeze system measured REAL-F1 \realFGroundedFreeze{} on the
+same protocol --- the $+0.05$ difference is the measured contribution of the four
+deterministic capabilities (\S\ref{sec:capabilities}) on the real-error slice. On the \emph{injected} slice, fingerprint clustering wins
+(\injFORFp{} vs \injFGrounded) at near-zero damage: our case/whitespace injectors are
+exactly the perturbations key-collision normalizes away, so this is its home game and
+we say so. kNN clustering---the method that, like us, attempts typo repair---loses on
+both slices while incurring the highest damage among baselines (\damageORKnn), the
+no-reference over-merging failure the grounding was built to prevent. The shipped
+verified-union system's suite row (REAL-F1 \modelRealF, damage \modelDamage) shows the
+grounding wrapper and heuristic union carry entity canonicalization on these datasets ---
+the model's contribution concentrates on the synthetic regime and hospital repair
+(\S\ref{sec:ws1results}), and the verifier cuts its suite damage to \modelDamage,
+$6\times$ below the grounded heuristic's \damageGrounded{} (HEAD damage vs the union
+row's freeze-time capture --- a disclosed basis mix). Within our own ablations
+(June-10 freeze basis throughout), removing grounding cedes $22\%$ of real-error
+F1 (\ablNoGroundRealF{} vs \ablFullRealF) and forfeits the behavioral guarantees:
+perfect abstention on adversarial traps (\ablFullAbstain) versus
+\ablNoAbstainAbstain{} without abstention, and reference-vetoed wrong merges (e.g.\
+\texttt{guntxrsvillx}$\to$\texttt{huntsville}).
+\begin{table}[t]
+\centering
+\caption{Wide-suite comparison, 3 injection seeds, churn-neutral metric. NORTH is the
+double-macro harmonic mean; REAL-F1 is the real-error slice. Regenerated at the
+current system head (2026-06-12); the June-10 freeze system measured
+\realFGroundedFreeze{} REAL-F1 / \northGroundedFreeze{} NORTH on the same protocol.}
+\label{tab:money}
+\begin{tabular}{lcccccc}
+\toprule
+System & NORTH & $\pm$95\%CI & REAL-F1 & INJ-F1 & damage & abstain \\
+\midrule
+Grounded (ours) & \northGrounded & \northGroundedCI & \textbf{\realFGrounded} & \injFGrounded & \damageGrounded & \ablFullAbstain \\
+OpenRefine fingerprint & \northORFp & 0.000 & \realFORFp & \injFORFp & \damageORFp & 1.000 \\
+OpenRefine kNN & \northORKnn & 0.002 & \realFORKnn & 0.148 & \damageORKnn & 1.000 \\
+Verified union 4B (shipped)$^{\dagger}$ & -- & -- & \modelRealF & -- & \modelDamage & \modelAbstain \\
+\midrule
+Baran (oracle det.\ + 20 labels)$^{\ddagger}$ & -- & -- & \realFBaran & -- & \damageBaran & -- \\
+Jellyfish-13B (ED+DI)$^{\ddagger}$ & -- & -- & \realFJelly & -- & \damageJelly & -- \\
+\bottomrule
+\end{tabular}
+\smallskip
+{\small $^{\dagger}$single seed, REAL + typo-injected slice only (GPU cost); other rows
+are 3-seed means. $^{\ddagger}$real slice only, disclosed protocol asymmetries
+(\S\ref{sec:ws4}): Baran uses oracle error positions + gold labels; Jellyfish is our
+detect-then-impute composition with seen-data caveats.}
+\end{table}
+\subsection{Generalization to never-seen tables}
+\label{sec:wild}
+The freeze-version system above was then pointed at data it had never seen, under
+three new harnesses (all released with this paper as the \textsc{WildClean} bundle).
+\textbf{(1) Paired bench}: \nPairs{} dirty/gold pairs spanning the Raha suite, SemTab
+ToughTables, government open-data typo corpora, entity-matching tables, and
+LLM-cleaning evaluation sets. On the 35 pairs from sources absent from training ---
+a count that coincidentally equals, but is distinct from, the \nWild{} gold-free wild
+tables of harness~(2) below --- the
+post-freeze system scores \textbf{macro F1 \unseenMacroF{} at damage
+\unseenMacroDamage}. The largest single contribution is the regime
+\S\ref{sec:capabilities}(b) unlocks: on five all-unique entity tables where no
+in-column frequency signal exists, F1 moves from $0.0$ to \ttFOne{} at zero damage.
+Cross-row voting (\S\ref{sec:capabilities}c) is the second: flights---many sources
+reporting the same flight---goes from \flightsBaseF{} to \flightsVoteF{} F1
+heuristic-only, and the heuristic hospital path doubles from \hospBaseHeur{} to
+\hospVoteHeur{}. The hospital union gate is invariant under all of this
+(\unionGatePrec{} at \unionGateCov). \textbf{(2) Wild bench}: \nWild{} uncurated
+in-the-wild tables (open-data portals, GitHub, Kaggle) with no gold; we score seeded
+inject--recovery on each table's own data (mean recovery \wildRecovery{} over the 34
+tables with inject scores; one table has none) plus a
+behavioral audit: every run yields a valid plan, every changed cell is attributable
+to a logged operation --- \textbf{zero silent edits across all \nWild{} tables}.
+\textbf{(3) Trust audit at scale}: \nTrust{} GitTables tables, same property ---
+\nTrust{}/\nTrust{} valid plans, zero crashes, zero silent edits. The held-out-source
+generalization metric (train and evaluation drawn from disjoint benchmark sources)
+remains low in absolute terms (GEN-F1 \genFTwo{}, variant-recall \genVRTwo{}, damage
+\genDamageTwo): cleaning unfamiliar tables is far from solved, and we report the
+number to anchor the next section's claim about \emph{where} the capability that does
+exist actually lives.
+\subsection{Where capability lives: a bounded null for fine-tuning}
+\label{sec:negative}
+Every attempt to move never-seen-table performance through the model weights failed;
+every gain in \S\ref{sec:wild} came from deterministic machinery plus the verifier.
+Five further supervised fine-tunes --- adding 109k harvested real-world alias pairs
+(ToughTables-derived, MusicBrainz search hints, RxNorm, OpenFlights), error-dense
+episode mixes, and a suspects-contract retrain --- left held-out GEN-F1
+\emph{statistically bounded}: every retrain's delta is positive but negligible (mean
+$+0.003$), never approaching the pre-registered $\delta{=}0.05$. ``Bounded'' is a
+tested equivalence claim, not an eyeballed one~\cite{lakens}: across the five-retrain
+series the mean held-out GEN-F1 delta (retrain minus champion) is $+0.0028$
+(90\% bootstrap CI $[+0.0008, +0.0049]$, strictly positive;
+10{,}000 resamples, seed 42; per-dataset granularity, $n{=}15$ over 3 held-out
+sources $\times$ 5 retrains --- per-pair deltas do not exist for the retrain
+series, so within-retrain deltas are clustered and we add a retrain-level
+robustness check, $n{=}5$ macro deltas), and TOST rejects effects beyond the
+pre-registered SESOI of $\pm 0.05$ ($p = 8.0\times10^{-16}$; retrain-level check
+$p = 8.3\times10^{-8}$). One disclosure sharpens the clustering caveat: two
+retrains' held-out rows are \emph{bit-identical} --- mechanically verified as
+verifier-collapse, not a data error (their raw plans share zero mapping entries,
+9 vs.\ 82 on flights, yet the verifier kills all of both, so each union
+degenerates to the same deterministic plan;
+\texttt{eval/results/equivalence\_coincidence.json}) --- so the $n{=}15$ rows
+carry fewer independent observations than their count suggests, which is exactly
+why the $n{=}5$ retrain-level test is the one we lean on. The collapse itself is
+the finding in miniature: different weights, same held-out behavior, because the
+verifier and the deterministic machinery decide what survives. Two reconciliations make the claim auditable. First, the
+basis: the equivalence series is scored against the champion's absolute GEN-F1 of
+\genChampionBasis{}, while the \genFTwo{} of \S\ref{sec:wild} is the \emph{shipped
+system} at the post-freeze HEAD with all deterministic capabilities --- the
+equivalence series scores each retrain's model-union path at its own capture time,
+so the two figures share a metric but not a basis. Second, the SESOI: weight
+interventions move GEN-F1 by at most $0.005$, while the deterministic machinery of
+\S\ref{sec:capabilities} moved the unseen-pair macro from $0.10$ to \unseenMacroF{}
+--- $\delta{=}0.05$ sits an order of magnitude above the measured weight effect and
+well below the machinery effect, which is exactly the boundary the test is meant to
+police. Mixing harvested pairs into the training blend
+\emph{diluted} the synthetic skill the executor verifies (a monotonic dilution law
+across mix ratios). A GRPO pilot using the executor as a verifiable reward (the
+direction RLVR table work~\cite{tabler1} motivates) was negative in all three arms at
+4B/LoRA scale: the main arm and a KL-anchored variant degraded plan-format validity,
+and a random-reward control arm reproduced the same drift, identifying it as an RL
+artifact rather than signal~\cite{spurious}. We state this as a \emph{bounded} null,
+not a universal one: at 4B/LoRA scale, under our propose/execute protocol and
+training budgets, no weight intervention we ran produced measurable movement in
+never-seen-table repair --- profiling visibility, reference grounding, cross-row
+consensus, convention conservatism, and plan-level verification carry the capability
+that exists. The bound is explicit: results with full-scale RL infrastructure
+(execution-verified rewards on multi-GPU RLVR stacks~\cite{spreadsheetrl,tabler1})
+show task skill moving at the same parameter scale, so our claim is about what
+SFT-and-pilot-RL buy in this protocol class, not about reinforcement learning in
+general. A second explicit bound: every weight experiment here uses the Qwen3
+family --- and the very work we cite to explain the control arm's drift documents
+that random-reward GRPO effects are themselves family-sensitive~\cite{spurious}
+--- so the null is stated for Qwen3-class models pending a cross-family
+replication. Concurrent evaluations corroborate the mechanism from independent
+directions~\cite{distort,debate}. The practical corollary is unusual but actionable:
+a contributor who wants to improve a system like this should write a deterministic
+capability and gate it with the verifier, not collect more training data.
+The null extends to test-time compute --- with one instructive exception that
+\emph{confirms} the architecture claim. Self-consistency \emph{voting} over
+$N{=}16$ temperature-0.7 samples (cell-edit-level majority, run through the
+identical verifier--union pipeline) yields 0.906 precision at 0.454 coverage
+versus 0.9055 at 0.4519 for matched greedy decoding on the same local runtime ---
+a null at matched precision, the visibility law from the test-time side: voting
+cannot surface repairs the profile does not expose, and it actively discards
+verified-recoverable coverage. But pooling \emph{every} mapping from all 16
+samples and letting the verifier filter the union gives the best operating point
+we measure for the 4B: \textbf{0.911 precision at 0.483 coverage} ($+0.6$ points
+precision, $+7.1$ points coverage over the shipped gate; an independent $N{=}8$
+replication reproduces the \emph{voted} point to $\pm 0.0003$ precision /
+$\pm 0.002$ coverage, and the greedy anchor exactly). The lesson is the paper's thesis in miniature: sampling
+helps only as a \emph{candidate generator}; consensus adds nothing the verifier
+does not already provide --- pool candidates, verify, do not vote. Separately,
+the local capture path itself (Q8 quantization with grammar-constrained decoding)
+is worth $+3.9$ points of coverage over the original Modal capture at equal
+precision.
+\subsection{Zero-label capability scaling: the verifier harness is planner-agnostic}
+\label{sec:scaling}
+The negative result bounds what fine-tuning small weights buys; it says nothing
+about raw capability. To separate the two we dropped zero-shot, $\leq$32B
+open-weights planners --- with \emph{no} task training --- into the identical
+hospital pipeline the 4B fine-tune uses: same prompt contract, same
+verify($\tau{=}0.5$), same union with the grounded heuristic
+(Table~\ref{tab:scaling}). devstral-small-2-24B and gemma4-31B both reach
+\textbf{\scalePrecBig{} precision at \scaleCovBig{} coverage} --- exceeding the
+fine-tune's union point of \unionGatePrec{} at \unionGateCov{} --- while
+nemotron-30B reaches \scalePrecNemo{} at \scaleCovNemo{} with JSON-plan validity
+0.4 (validity is part of the measurement: a planner that cannot reliably emit the
+plan schema loses coverage before capability is measured). gpt-oss-20B is
+excluded as a serving failure, documented rather than scored as capability: the
+hosted proxy returned empty content on every planning call despite full-length
+generation. The arm is multi-family (Mistral, Google, NVIDIA), which addresses
+the single-family bound of \S\ref{sec:negative} for the inference side; the
+weight-training null itself remains Qwen3-scoped. Disclosure: these models were
+measured via hosted inference for speed; all are $\leq$32B open weights and
+locally deployable in principle. The interpretation we draw is the paper's
+sharpest: SFT at 4B does not buy held-out generalization (\S\ref{sec:negative}),
+but raw capability at 24--31B does lift the same harness --- the verifier/union
+architecture is the portable contribution, converting any sufficiently capable
+planner into a trustworthy cleaner.
+\begin{table}[t]
+\centering
+\caption{Zero-shot $\leq$32B planners in the identical verify($\tau{=}0.5$)+union
+harness, hospital's \hospErrors{} real errors. Validity = fraction of planning
+calls returning schema-valid JSON. Runtime = wall-clock for the planning calls on
+hosted endpoints (single capture, no seeds; the 4B row is a prior Modal A100
+capture with no comparable local figure). Each scaling row is a single capture;
+the primary evidence is the union coverage delta ($+0.07$) at matched-or-better
+precision, not any single cell. For context, 16-sample pooling lifts the 4B
+fine-tune to $0.911@0.483$ at $16\times$ planning compute
+(\S\ref{sec:negative}); the 24--31B planners reach $0.915@0.485$ in a single
+greedy pass --- single-pass capability versus test-time compute, both converted
+into trustworthy operating points by the same verifier. Bold marks the best union operating point.
+gpt-oss-20B excluded (serving failure: empty
+proxy responses, not measurable capability).
+The identical devstral/gemma rows are a verified counting coincidence, not a
+scoring artifact: their applied cell-edit sets share 266 of 270 cells, each
+commits 4 model-specific repairs (all correct), and the totals coincide
+(\texttt{eval/results/scaling\_coincidence.json}).
+}
+\label{tab:scaling}
+\footnotesize
+\begin{tabular}{lccccc}
+\toprule
+Planner & Params & Gated P@C & Union P@C & Validity & Runtime (s) \\
+\midrule
+ScrubData-v6 (Qwen3-4B fine-tune) & 4B & 0.993 @ 0.287 & 0.905 @ 0.413 & --- & --- \\
+devstral-small-2 (Mistral) & 24B & 0.943 @ 0.426 & \textbf{0.915 @ 0.485} & 1.0 & \runtimeDevstral \\
+nemotron-3-nano (NVIDIA) & 30B & 1.000 @ 0.138 & 0.877 @ 0.336 & 0.4 & \runtimeNemo \\
+gemma4 (Google) & 31B & 0.943 @ 0.426 & \textbf{0.915 @ 0.485} & 1.0 & \runtimeGemma \\
+\bottomrule
+\end{tabular}
+\end{table}
+\subsection{Ablations}
+All ablations are 3-seed means (CIs $\le\pm0.003$). Removing abstention costs $-0.013$
+NORTH, raises damage to \ablNoAbstainDamage{} (from \ablFullDamage), and collapses trap
+abstention to \ablNoAbstainAbstain. Removing the ambiguity margin costs $-0.006$ with
+$+0.001$ damage. Removing case matching costs $-0.002$ under the churn-neutral metric
+(and \emph{gained} $+0.12$ under the uncorrected metric---the artifact). Replacing
+grounding with frequency clustering gains $+0.020$ NORTH, all of it from the injected
+slice (\S\ref{sec:eval}), while ceding $-0.039$ real-error F1---the trade the system
+refuses by design.
+\subsection{Learned-repair baselines under disclosed protocols}
+\label{sec:ws4}
+We additionally run two learned-repair baselines on the real-error (Raha) slice,
+under the identical churn-neutral metric but with honestly disclosed protocol
+asymmetries. \textbf{Baran}~\cite{raha} is semi-supervised: we run its reference
+configuration---oracle error positions from the dirty/gold diff plus 20 gold-labeled
+tuples per dataset (its package default), without the optional Wikipedia-pretrained
+value models. It reaches REAL-F1 \realFBaran{}$\,\pm$\realFBaranCI{} (3 label-sampling
+seeds) at \damageBaran{} damage---an upper bound under a strictly more informed
+protocol than ours (zero labels, no oracle detection); with oracle positions it can
+essentially only edit true-error cells, so its near-zero damage is structural.
+\textbf{Jellyfish-13B}~\cite{jellyfish} publishes per-cell error detection and
+imputation but no repair task; we compose the two (detect, then impute flagged cells
+with the attribute masked) --- a pipeline of our construction, not theirs. It scores
+REAL-F1 \realFJelly{} at \damageJelly{} damage (single seed, recommended decoding;
+note hospital is in its instruction-tuning data and flights/rayyan in its published
+evaluation suite, so these numbers may flatter it). Neither baseline is run on the
+56-spec injected suite (computationally and methodologically out of scope for
+semi-supervised and per-cell-LLM repair); their NORTH/INJ-F1 cells in
+Table~\ref{tab:money} are blank by design. The comparison locates our contribution:
+zero-config systems (ours, OpenRefine) occupy a different protocol class from
+supervised repair, and the verifier (\S\ref{sec:ws1results}) is what makes the
+zero-config class precise enough to trust, not what closes the labeled gap.
+Table~\ref{tab:perdataset} breaks the real-error slice down per dataset at HEAD.
+The verified-union rows are reported with their honest shape: off hospital the
+union turns ultra-conservative --- on rayyan it commits 12 changes at 0.001
+damage; on beers it holds precision 0.546 at recall 0.018. The gate's precision
+premise transfers as \emph{safety} (union damage stays at 0.001--0.080) but not
+as coverage. The movies\_1 union cell ($^{q}$: local Q8 capture, the disclosed
+quantized protocol) is the instructive worst case: on entity-rich name columns
+the quantized planner proposes plausible-but-wrong merges
+(\texttt{The Longest Day}$\,\to\,$\texttt{The Longest Yard}); the verifier kills
+most, and what leaks through is damage within the disclosed band with zero
+credited fixes --- the planner contributes nothing there, and the system's value
+is that it \emph{contains} a bad planner rather than amplifying it. This directly
+answers the co-adaptation concern: hospital is where the model's learned mappings
+live, and elsewhere the system abstains or contains rather than guesses.
+\begin{table}[t]
+\centering
+\caption{Per-dataset real-error results (Raha slice), churn-neutral F1 / damage.
+Grounded is the HEAD deterministic system; OR = OpenRefine reimplementations;
+Union is the verified union planner ($\tau{=}0.5$) where a captured model plan
+exists (movies\_1 capture pending); Baran uses oracle error positions + 20 gold
+labels (mean of 3 label-sampling seeds) and is a supervised reference, not a
+peer.}
+\label{tab:perdataset}
+\footnotesize
+\begin{tabular}{lccccc}
+\toprule
+Dataset & Grounded (HEAD) & OR fingerprint & OR kNN & Verified union & Baran (oracle+20) \\
+\midrule
+hospital & 0.258 / .066 & 0.000 / .000 & 0.189 / .083 & 0.567 / .001 & 0.827 / .004 \\
+beers & 0.025 / .005 & 0.194 / .000 & 0.086 / .074 & 0.035 / .001 & 0.918 / .000 \\
+flights & 0.127 / .082 & 0.000 / .000 & 0.014 / .065 & 0.035 / .080 & 1.000 / .000 \\
+rayyan & 0.000 / .118 & 0.000 / .001 & 0.002 / .008 & 0.000 / .001 & 0.402 / .010 \\
+movies\_1 & 0.714 / .025 & 0.002 / .018 & 0.001 / .072 & 0.000 / .025$^{q}$ & 0.909 / .001 \\
+\midrule
+macro F1 & \realFOursHead & 0.039 & 0.058 & --- & \realFBaran \\
+\bottomrule
+\end{tabular}
+\end{table}
+\subsection{A matched label budget separates the supervision regimes}
+\label{sec:labelcurve}
+The Baran comparison above is two points (zero labels, twenty labels); the
+matched-budget curve in Figure~\ref{fig:labelcurve} measures what each label is
+worth to each system on the same five-dataset real-error macro. At zero labels
+Baran --- even \emph{retaining} its oracle error positions --- repairs nothing
+(F1 \realFBaranZero, 3 seeds): its value models have nothing to learn from.
+ScrubData operates at \realFOursHead{} with zero configuration. With labels Baran
+climbs steeply (\realFBaranFive{} at $k{=}5$, \realFBaran{} at $k{=}20$): the two
+systems occupy complementary supervision regimes, a relationship now measured
+rather than asserted. ScrubData's own $k$-label arm uses the labels \emph{only}
+to validate and expand the verifier accept set --- no retraining, no oracle
+positions: $\realFOursFive \pm 0.023$ at $k{=}5$ and $\realFOursTwenty \pm 0.012$
+at $k{=}20$ (3 label-sampling seeds). The disclosed asymmetry stands at every
+budget: Baran keeps oracle error positions throughout, so the curve is an upper
+bound in its favor.
+\begin{figure}[t]
+\centering
+\includegraphics[width=0.62\linewidth]{fig_label_curve}
+\caption{Matched-budget label curve, five-dataset real-error macro F1. At
+$k{=}0$ Baran repairs nothing even with oracle error positions retained;
+ScrubData operates at \realFOursHead{} with zero configuration. With labels
+Baran climbs steeply --- complementary supervision regimes, measured. Error
+bars ($\pm$) are standard deviations over 3 label-sampling seeds; the Baran
+$k{=}20$ point reuses the 3-seed baseline run of \S\ref{sec:ws4}.}
+\label{fig:labelcurve}
+\end{figure}
+\subsection{Degenerate baselines and cost-weighted damage}
+\label{sec:degenerate}
+Four degenerate policies pin the metric's floor and ceiling on the full 42-pair
+bench (Table~\ref{tab:degenerate}). No-op and oracle land exactly at 0 and 1;
+abstain-all is score-identical to no-op because the repair metric is flag-blind
+by design (abstentions are audited separately); seeded random editing of 5\% of
+cells is vandalism the metric must punish. Since F1 alone under-punishes
+vandalism, we add a cost-weighted score in the Effective-Reliability style,
+$\Phi_c = (\mathrm{fixes} - c\cdot\mathrm{damaged})/\mathrm{errors}$ at
+$c \in \{1, 5, 10\}$: random editing scores $-0.49$ to $-4.89$, while the
+shipped system stays positive at $c{=}1$ (\degShippedPhiOne) --- and goes
+negative at higher $c$, which is the honest reading: at 10:1 cost asymmetry,
+only near-zero-damage operating points (the verified union) are defensible.
+One disclosure: the oracle acceptance check itself surfaced a scorer artifact
+--- 3 cells in 1.79M held the literal string \texttt{Nan} (a first name), which
+parses to float NaN and was unequal to itself --- now fixed in
+\texttt{eval/metrics.py} with a regression test; published numbers shift by
+less than $10^{-4}$.
+\begin{table}[t]
+\centering
+\caption{Degenerate policies pin the metric (42 pairs, churn-neutral macro;
+random-edit: seeded, 5\% of cells). $\Phi_c$ is micro-summed
+$(\mathrm{fixes} - c\cdot\mathrm{damaged})$ per benchmark error. ``Shipped''
+here is the deterministic grounded path on the 42 pairs (damage
+\degShippedDamage), distinct from the verified-union suite row of
+Table~\ref{tab:money} (damage \modelDamage).}
+\label{tab:degenerate}
+\small
+\begin{tabular}{lccccccc}
+\toprule
+Policy & F1 & P & R & damage & $\Phi_1$ & $\Phi_5$ & $\Phi_{10}$ \\
+\midrule
+no-op & 0.000 & 1.000 & 0.000 & 0.000 & $0.00$ & $0.00$ & $0.00$ \\
+abstain-all & 0.000 & 1.000 & 0.000 & 0.000 & $0.00$ & $0.00$ & $0.00$ \\
+random-edit & 0.000 & 0.001 & 0.001 & 0.049 & $-0.49$ & $-2.45$ & $-4.89$ \\
+oracle & 1.000 & 1.000 & 1.000 & 0.000 & $+1.00$ & $+1.00$ & $+1.00$ \\
+shipped & \degShippedF & \degShippedP & 0.308 & \degShippedDamage & $+0.13$ & $-1.37$ & $-3.26$ \\
+\bottomrule
+\end{tabular}
+\end{table}
+\subsection{Calibration of abstention}
+\label{sec:calibration}
+On a probe of reference-entity typos plus garbage traps, retrieval confidence is a
+usable selective-prediction signal: AURC \aurc, ECE \ece{} (over-confident;
+temperature scaling is future work), and
+precision rises monotonically with threshold---\precAtDefault{} precision at the default
+$\tau{=}0.84$ (coverage \covAtDefault), and $\geq$95\% precision at
+$\tau{=}\threshNinetyFive$ (coverage \covNinetyFive). Figure~\ref{fig:rc} shows the
+risk--coverage curve.
+\begin{figure}[t]
+\centering
+\includegraphics[width=0.62\linewidth]{fig_risk_coverage}
+\caption{Risk--coverage for grounded city reconciliation (650 probes). Operating points
+annotated; the confidence supports thresholded abstention.}
+\label{fig:rc}
+\end{figure}
+\section{Limitations}
+Reference coverage is the recall ceiling: entities absent from the taxonomy abstain by
+design, which is safe but not helpful; coverage work (larger gazetteers, ROR for
+organizations) moves recall directly. Our damage metric is convention-tolerant for case
+and whitespace but still counts alias expansion (\texttt{NYC}$\to$\texttt{New York}) as
+damage when the gold keeps the alias---a value-level convention question we leave open.
+The confidence signal is over-confident (ECE \ece); temperature scaling is future
+work. The injected half of the suite, while seeded and reproducible, inherits the
+injector's error model; we mitigate with the real-error slice and report both. All
+weight-training experiments (SFT and GRPO) use a single model family (Qwen3), so
+the negative result of \S\ref{sec:negative} is family-scoped until replicated on a
+second family. PII
+coverage is English-only, and we make no de-identification guarantee. Finally, the
+fine-tune headline is reported with multi-seed confidence intervals, but the wide-suite
+model row is single-seed for cost reasons and scoped as such.
+\section{Conclusion}
+A planner/executor decomposition with plan-level selective prediction --- the model
+proposes, a deterministic engine executes, a verifier gates every mapping --- turns
+LLM data cleaning from a trust liability into an auditable system: every change is a
+named, reversible operation; uncertain actions become review flags rather than silent
+corruptions; and the evaluation itself is built to resist gaming. The post-freeze
+program sharpened the architecture into a finding: across
+five further fine-tunes and a three-arm GRPO pilot, the weights never moved
+never-seen-table performance --- deterministic visibility, grounding, consensus, and
+verification did, at zero silent edits across \nWild{} wild tables and a
+\nTrust{}-table trust audit. The scaling arm completes the picture: the bounded null
+is about fine-tuning small weights, not about capability --- two of three zero-shot
+24--31B planners dropped into the unchanged verifier harness exceed the
+fine-tune's operating point (\S\ref{sec:scaling}), so the architecture is
+planner-agnostic: capability gains arrive as better operating points without
+retraining. The shipped system runs
+entirely locally on commodity hardware and no data leaves the machine; the
+scaling-arm planners were measured via hosted endpoints, but all are locally
+deployable open weights. We believe the recipe---propose/execute decomposition,
+verification-by-execution, retrieval-grounded outputs, and selective prediction over
+deterministic capabilities---is a template for deploying small specialized models on
+other structured tasks.
+\section*{Reproducibility}
+\begin{sloppypar}
+The model weights are public:
+\url{https://huggingface.co/ricalanis/scrubdata-qwen3-4b-v6-q8}. Code, evaluation
+suite, and result artifacts are released at the project repository,
+\url{https://github.com/ricalanis/scrubdata-hackathon} (public upon publication,
+available to reviewers from the initial submission). The \textsc{WildClean}
+bundle --- redistributable dirty/gold pairs, the GitTables audit slice, open
+vocabularies, result JSONs, and license-gated loaders for the non-redistributable
+pairs --- is a public Hugging Face dataset
+(\url{https://huggingface.co/datasets/ricalanis/wildclean}). The shipped product
+planner is the identical code path measured here (\texttt{scrubdata/active.py}).
+\end{sloppypar}
+\paragraph{Release integrity.} Our own reproducibility QA discovered that the
+published Q8\_0 GGUF was corrupted by an export bug (the export declared a wrong
+end-of-generation token id inside the Qwen3 vocabulary, degenerating into
+tool-call loops on all runtimes; a base-model control isolated the fault to the
+export, not the adapter). It has been re-exported from the v6 adapter and
+replaced under the same filename, with both sha256 checksums recorded in the
+model card's Integrity section. Third-party reproduction of the model-path
+numbers additionally requires constrained decoding on long prompts ---
+\texttt{format=json} under Ollama, or
+\texttt{suppress\_tokens=[151657,151658]} under transformers --- which is now
+documented in the model card and \texttt{notebooks/Modelfile}.
+\paragraph{Setup.} Clone the repository and run \texttt{uv sync} (Python 3.12;
+\texttt{uv} resolves the pinned environment). The non-redistributable benchmark
+pairs materialize from their original sources with the \textsc{WildClean}
+\texttt{loaders.py}. Model-path results additionally need the released Q8\_0 GGUF
+served by a local Ollama (\texttt{SCRUBDATA\_MODEL}); every deterministic-path
+number runs with no model at all. Baran runs in the separate pinned environment
+documented at the top of \texttt{eval/run\_baran.py}; Jellyfish-13B runs remotely
+via Modal.
+\paragraph{One command per reported number} (all from the repository root, at the
+released revision):
+\begin{center}
+\footnotesize
+\begin{tabular}{@{}ll@{}}
+\toprule
+Reported result & Command \\
+\midrule
+Wide-suite comparison (Table~\ref{tab:money}) & \texttt{python -m eval.run\_real\_multi --out eval/results} \\
+Precision--coverage curve + gate & \texttt{python -m eval.precision\_curve} \\
+\quad (Figure~\ref{fig:pc}, \S\ref{sec:ws1results}) & \texttt{\ \ --plan eval/results/v6\_hospital\_raw\_plan.json --union} \\
+Ablations & \texttt{python -m eval.ablations} \\
+Calibration (Figure~\ref{fig:rc}) & \texttt{python -m eval.calibration} \\
+PII leak test & \texttt{python -m eval.pii\_leak} \\
+Baran baseline & \texttt{python eval/run\_baran.py}, then \\
+ & \texttt{python -m eval.baselines\_learned --score-baran} \\
+Jellyfish baseline & \texttt{modal run scripts/modal\_jellyfish.py} \\
+\midrule
+Paired bench (\S\ref{sec:wild}) & \texttt{python -m eval.paired\_bench} \\
+Wild bench (\S\ref{sec:wild}) & \texttt{python -m eval.wild\_bench} \\
+GitTables trust audit (\S\ref{sec:wild}) & \texttt{python -m eval.gittables\_audit} \\
+Held-out-source generalization & \texttt{python -m eval.generalization} \\
+\midrule
+Scorer validation (\S\ref{sec:eval}) & \texttt{python -m pytest tests/test\_wildclean\_scorer.py} \\
+Degenerate baselines (Table~\ref{tab:degenerate}) & \texttt{python -m eval.degenerate} \\
+TOST equivalence (\S\ref{sec:negative}) & \texttt{python -m eval.equivalence} \\
+Label curve (Figure~\ref{fig:labelcurve}) & \texttt{python -m eval.label\_curve} \\
+Per-dataset table (Table~\ref{tab:perdataset}) & \texttt{python -m eval.raha\_table} \\
+Self-consistency vote/pool (\S\ref{sec:negative}) & \texttt{python -m eval.sc\_rerank --model scrubdata-ft --n 16} \\
+Scaling arm (Table~\ref{tab:scaling}) & \texttt{python -m eval.scaling\_arm} \\
+\bottomrule
+\end{tabular}
+\end{center}
+\begin{thebibliography}{20}
+\bibitem{raha} M.~Mahdavi, Z.~Abedjan, R.~Castro Fernandez, S.~Madden, M.~Ouzzani,
+M.~Stonebraker, N.~Tang. Raha: A Configuration-Free Error Detection System. SIGMOD
+2019; M.~Mahdavi, Z.~Abedjan. Baran: Effective Error Correction via a Unified Context
+Representation and Transfer Learning. PVLDB 13(11):1948--1961, 2020.
+\bibitem{holoclean} T.~Rekatsinas, X.~Chu, I.~F.~Ilyas, C.~R\'e. HoloClean: Holistic
+Data Repairs with Probabilistic Inference. PVLDB 10(11), 2017. arXiv:1702.00820.
+\bibitem{garf} J.~Peng, D.~Shen, N.~Tang, T.~Liu, Y.~Kou, T.~Nie, H.~Cui, G.~Yu.
+Self-Supervised and Interpretable Data Cleaning with Sequence Generative Adversarial
+Networks (GARF). PVLDB 16(3):433--446, 2022.
+\bibitem{wrangle} A.~Narayan, I.~Chami, L.~Orr, S.~Arora, C.~R\'e. Can Foundation
+Models Wrangle Your Data? PVLDB 16(4):738--746, 2022. arXiv:2205.09911.
+\bibitem{jellyfish} H.~Zhang, Y.~Dong, C.~Xiao, M.~Oyamada. Jellyfish:
+Instruction-Tuning Local Large Language Models for Data Preprocessing. EMNLP 2024.
+arXiv:2312.01678.
+\bibitem{cocoon} S.~Zhang, Z.~Huang, E.~Wu. Data Cleaning Using Large Language Models
+(Cocoon). arXiv:2410.15547, 2024 (preprint; no published reproduction).
+\bibitem{zeroed} W.~Ni, K.~Zhang, X.~Miao, X.~Zhao, Y.~Wu, Y.~Wang, J.~Yin. ZeroED:
+Hybrid Zero-Shot Error Detection Through Large Language Model Reasoning. ICDE 2025.
+arXiv:2504.05345.
+\bibitem{forested} M.~Wang, J.~Wang, Q.~Liu, X.~Xu, Z.~Xing, L.~Zhu, W.~Zhang.
+Ensembling LLM-Induced Decision Trees for Explainable and Robust Error Detection.
+arXiv:2512.07246, 2025 (preprint).
+\bibitem{autotest} Q.~Chen, Y.~He, R.~C.-W.~Wong, W.~Cui, S.~Ge, H.~Zhang, D.~Zhang,
+S.~Chaudhuri. Auto-Test: Learning Semantic-Domain Constraints for Unsupervised Error
+Detection in Tables. SIGMOD 2025. arXiv:2504.10762.
+\bibitem{gidcl} M.~Yan, Y.~Wang, Y.~Wang, X.~Miao, J.~Li. GIDCL: A Graph-Enhanced
+Interpretable Data Cleaning Framework with Large Language Models. Proc.\ ACM Manag.\
+Data 2(6), Article 236, 2024 (SIGMOD).
+\bibitem{spreadsheetrl} B.~Chi, Y.~Xie, M.~Wu, J.~Yang, J.~Jiang, Z.~Li, et al.
+Spreadsheet-RL: Advancing Large Language Model Agents on Realistic Spreadsheet Tasks
+via Reinforcement Learning. arXiv:2605.22642, 2026.
+\bibitem{distort} A.~Dutta, H.~Nigam, H.~Hasanbeig, A.~Radhakrishna, S.~Gulwani.
+An Empirical Investigation of Robustness in Large Language Models under Tabular
+Distortions. arXiv:2601.05009, 2026.
+\bibitem{debate} C.~Parmar, A.~Mehta, H.~Wu, J.~Ramamurthy, S.~Medhekar. When Helping
+Hurts and How to Fix It: Multi-Agent Debate for Data Cleaning. arXiv:2606.02866, 2026.
+\bibitem{tabler1} Z.~Yang, L.~Chen, A.~Cohan, Y.~Zhao. Table-R1: Inference-Time
+Scaling for Table Reasoning. EMNLP 2025. arXiv:2505.23621.
+\bibitem{spurious} R.~Shao, S.~S.~Li, R.~Xin, S.~Geng, Y.~Wang, et al. Spurious
+Rewards: Rethinking Training Signals in RLVR. arXiv:2506.10947, 2025.
+\bibitem{tablegpt} P.~Li, Y.~He, D.~Yashar, W.~Cui, S.~Ge, H.~Zhang, D.~Rifinski
+Fainman, D.~Zhang, S.~Chaudhuri. Table-GPT: Table Fine-tuned GPT for Diverse Table
+Tasks. Proc.\ ACM Manag.\ Data 2(3), Article 176, 2024 (SIGMOD). arXiv:2310.09263.
+\bibitem{retclean} Z.~A.~Naeem, M.~S.~Ahmad, M.~Eltabakh, M.~Ouzzani, N.~Tang.
+RetClean: Retrieval-Based Data Cleaning Using LLMs and Data Lakes. PVLDB 17(12), 2024
+(demo). arXiv:2303.16909.
+\bibitem{turl} X.~Deng, H.~Sun, A.~Lees, Y.~Wu, C.~Yu. TURL: Table Understanding
+through Representation Learning. PVLDB 14(3):307--319, 2021. arXiv:2006.14806.
+\bibitem{tablellama} T.~Zhang, X.~Yue, Y.~Li, H.~Sun. TableLlama: Towards Open Large
+Generalist Models for Tables. NAACL 2024. arXiv:2311.09206.
+\bibitem{belotti} F.~Belotti, F.~Dadda, M.~Cremaschi, R.~Avogadro, M.~Palmonari.
+Evaluating LLMs on Entity Disambiguation in Tables. arXiv:2408.06423, 2024 (preprint).
+\bibitem{racoon} L.~L.~Wei, G.~Xiao, M.~Balazinska. RACOON: An LLM-based Framework for
+Retrieval-Augmented Column Type Annotation with a Knowledge Graph. arXiv:2409.14556,
+2024 (preprint).
+\bibitem{mtab} P.~Nguyen, N.~Kertkeidkachorn, R.~Ichise, H.~Takeda. MTab: Matching
+Tabular Data to Knowledge Graph using Probability Models. SemTab/ISWC 2019.
+arXiv:1910.00246.
+\bibitem{selective} R.~El-Yaniv, Y.~Wiener. On the Foundations of Noise-free Selective
+Classification. JMLR 11:1605--1641, 2010; Y.~Geifman, R.~El-Yaniv. Selective
+Classification for Deep Neural Networks. NeurIPS 2017.
+\bibitem{openmed} M.~Panahi. OpenMed NER: Open-Source, Domain-Adapted State-of-the-Art
+Transformers for Biomedical NER Across 12 Public Datasets. arXiv:2508.01630, 2025
+(preprint).
+\bibitem{lakens} D.~Lakens. Equivalence Tests: A Practical Primer for t Tests,
+Correlations, and Meta-Analyses. Social Psychological and Personality Science
+8(4):355--362, 2017.
+\bibitem{grouse} S.~Muller, A.~Loison, B.~Omrani, G.~Viaud. GroUSE: A Benchmark
+to Evaluate Evaluators in Grounded Question Answering. COLING 2025.
+arXiv:2409.06595.
+\end{thebibliography}
+\end{document}

docs/paper/numbers.tex ADDED Viewed

	@@ -0,0 +1,146 @@

+% Result macros — every value regenerates from one command (see Reproducibility section).
+% Headline fine-tune (synthetic frozen gold, Layer 1)
+\newcommand{\canonFOurs}{0.815}        % v5 bf16, n=20 (single run; multi-seed below)
+\newcommand{\canonFOursBest}{0.901}    % v4 Q8 measurement
+\newcommand{\canonFBig}{0.452}         % large generic model (GLM-class, zero-shot)
+\newcommand{\canonFHeur}{0.152}        % rule heuristic
+\newcommand{\canonFMultiSeed}{$0.803 \pm 0.009$ (95\% CI, 3 training seeds)}
+\newcommand{\opFOurs}{0.957}
+\newcommand{\jsonValidOurs}{0.950}
+% Hospital head-to-head, repairs-only churn-neutral (both paths incl. errors-are-rare gates)
+\newcommand{\hospRecallGrounded}{0.257}
+\newcommand{\hospRecallFreq}{0.293}
+\newcommand{\hospPrecGrounded}{0.845}
+\newcommand{\hospPrecFreq}{0.871}
+\newcommand{\hospModelRecall}{0.424}   % fine-tuned v5, repair_recall (vs 0.000 synthetic-only)
+% Wide-suite comparison (3 seeds, churn-neutral metric) — money table.
+% PRIMARY = HEAD regeneration 2026-06-12 (eval/results/money_table_head.json,
+% post-capability system, NaN metric fix in). Freeze (2026-06-10) values kept
+% as *Freeze macros where the narrative discusses the freeze-version system.
+\newcommand{\northGrounded}{0.224}
+\newcommand{\northGroundedCI}{0.004}
+\newcommand{\northORFp}{0.211}
+\newcommand{\northORKnn}{0.122}
+\newcommand{\realFGrounded}{0.225}
+\newcommand{\realFORKnn}{0.058}
+\newcommand{\damageGrounded}{0.092}
+\newcommand{\damageORKnn}{0.096}
+\newcommand{\northGroundedFreeze}{0.203}
+\newcommand{\realFGroundedFreeze}{0.174}
+\newcommand{\damageGroundedFreeze}{0.104}
+% SHIPPED system (verified union, v6 adapter) on suite — scripts/modal_eval_suite.py
+\newcommand{\modelRealF}{0.142}
+\newcommand{\modelDamage}{0.015}
+\newcommand{\modelAbstain}{1.000}
+% Ablations (churn-neutral metric, 3 seeds — eval/results/ablations.json)
+\newcommand{\ablFull}{0.203}
+\newcommand{\ablNoGround}{0.223}
+\newcommand{\ablNoAbstain}{0.190}
+\newcommand{\ablNoMargin}{0.197}
+\newcommand{\ablNoCase}{0.201}
+\newcommand{\ablFullRealF}{0.174}
+\newcommand{\ablNoGroundRealF}{0.135}
+\newcommand{\ablNoAbstainDamage}{0.108}
+\newcommand{\ablFullDamage}{0.104}
+\newcommand{\ablFullAbstain}{1.000}
+\newcommand{\ablNoAbstainAbstain}{0.250}
+% Selective prediction / calibration
+\newcommand{\aurc}{0.120}
+\newcommand{\ece}{0.169}
+\newcommand{\precAtDefault}{0.899}     % threshold 0.84
+\newcommand{\covAtDefault}{0.669}
+\newcommand{\threshNinetyFive}{0.91}
+\newcommand{\covNinetyFive}{0.206}
+% PII transfer validation (OpenMed-PII 44M on bare cells)
+\newcommand{\piiNameBare}{100\%}
+\newcommand{\piiAddrBare}{100\%}
+\newcommand{\piiNegRate}{43\%}
+\newcommand{\piiLeakRate}{zero (0/360)} % eval/pii_leak.py, seeded
+\newcommand{\realFORFp}{0.039}
+\newcommand{\injFGrounded}{0.224}
+\newcommand{\injFORFp}{0.282}
+\newcommand{\damageORFp}{0.001}
+\newcommand{\hospModelRecallVSix}{0.475}
+\newcommand{\hospModelPrecVSix}{0.185}
+% WS1 — plan-level selective prediction (verified union planner)
+% repro: uv run python -m eval.precision_curve --plan eval/results/v6_hospital_raw_plan.json --union
+\newcommand{\unionGatePrec}{0.905}
+\newcommand{\unionGateCov}{0.413}
+\newcommand{\modelGatePrec}{0.993}     % gated model plan alone, tau=0.5 (146/147 correct)
+\newcommand{\modelGateCov}{0.287}
+\newcommand{\unionChanged}{232}
+\newcommand{\unionFixed}{210}
+\newcommand{\hospErrors}{509}
+% 3 training seeds (mixA 21=shipped/25/26), union @ tau=0.5 (eval/results/union_gate_3seed.json)
+\newcommand{\unionGateThreeSeedPrec}{$0.891 \pm 0.012$}
+\newcommand{\unionGateThreeSeedCov}{$0.396 \pm 0.025$}
+% WS2 pair-profiles (measured-and-cut): constrained raw plan / composed at tau=0.5
+\newcommand{\pairsRawPrec}{0.760}
+\newcommand{\pairsRawCov}{0.348}
+\newcommand{\pairsUnionPrec}{0.876}
+\newcommand{\pairsUnionCov}{0.387}
+% ===== v2 (post-freeze system, 2026-06-11/12) =====
+\newcommand{\nPairs}{42}
+\newcommand{\nWild}{35}
+\newcommand{\nTrust}{239}
+\newcommand{\unseenMacroF}{0.363}
+\newcommand{\unseenMacroDamage}{0.0219}
+\newcommand{\wildRecovery}{0.207}
+\newcommand{\genFTwo}{0.058}
+\newcommand{\genVRTwo}{0.108}
+\newcommand{\genDamageTwo}{0.036}
+\newcommand{\ttFOne}{0.955--0.957}
+\newcommand{\flightsVoteF}{0.164}
+\newcommand{\flightsBaseF}{0.044}
+\newcommand{\hospVoteHeur}{0.186}
+\newcommand{\hospBaseHeur}{0.092}
+\newcommand{\gidclHosp}{0.97}
+% WS4 — learned-repair baselines, Raha real slice only (eval/baselines_learned.py)
+\newcommand{\realFBaran}{0.811}        % oracle detection + 20 gold labels: upper bound
+\newcommand{\realFBaranCI}{0.018}      % 3 label-sampling seeds
+\newcommand{\damageBaran}{0.003}
+\newcommand{\precBaran}{0.824}
+\newcommand{\realFJelly}{0.074}     % Jellyfish-13B ED+DI (scripts/modal_jellyfish.py)
+\newcommand{\damageJelly}{0.027}
+% W1.a — matched-budget label curve, 5-dataset Raha macro (eval/results/label_curve.json)
+\newcommand{\realFBaranZero}{0.000}    % Baran k=0 (oracle positions retained), 3 seeds
+\newcommand{\realFBaranFive}{0.504}    % Baran k=5
+\newcommand{\realFOursFive}{0.282}     % ours k=5 (labels validate/expand accept set only)
+\newcommand{\realFOursTwenty}{0.351}   % ours k=20
+\newcommand{\realFOursHead}{0.225}     % ours k=0 at HEAD (post-freeze capabilities)
+% W4.3/4.4 — degenerate baselines + cost-weighted scores (eval/results/degenerate.json)
+\newcommand{\degShippedF}{0.343}
+\newcommand{\degShippedP}{0.576}
+\newcommand{\degShippedDamage}{0.023}
+\newcommand{\degShippedPhiOne}{$+0.13$}
+% W1.c — zero-shot capability scaling arm (eval/results/scaling_arm.json)
+\newcommand{\scalePrecBig}{0.915}      % devstral-24B and gemma4-31B union point
+\newcommand{\scaleCovBig}{0.485}
+\newcommand{\scalePrecNemo}{0.877}
+\newcommand{\scaleCovNemo}{0.336}
+% hosted wall-clock for the 5 planning calls, single capture (scaling_arm.json runtime_s)
+\newcommand{\runtimeDevstral}{135}
+\newcommand{\runtimeNemo}{114}
+\newcommand{\runtimeGemma}{104}
+% Frontier zero-shot reference point: hospital repair recall of a vanilla frontier-scale
+% cloud planner run through the same propose/execute harness (2026-06-04..07 architecture
+% validation captures, pre-verifier; recorded in project training-run logs). Quoted in the
+% fine-tune results subsection as the zero-shot ceiling the v6 recall approaches.
+\newcommand{\frontierZeroShotRecall}{0.51}
+% R3 — absolute champion GEN-F1 basis of the equivalence retrain series
+% (eval/results/equivalence.json spec.champion_macro_gen_f1 = 0.014606)
+\newcommand{\genChampionBasis}{0.0146}

eval/README.md ADDED Viewed

	@@ -0,0 +1,136 @@

+# Eval harness + goalpost
+Measures any planner against a **held-out** synthetic gold set (seed differs from
+training, and gold is filtered to oracle-solvable so the ceiling is a clean 1.0).
+```bash
+uv run eval/run_eval.py --n 300 --seed 4242
+```
+Adopts the researched tooling: `jsonschema` for plan validity; set-based micro-F1 for
+operations and canonicalization mappings; the **executor itself** for end-to-end
+cell-recovery (the Raha-style dirty→clean comparison). promptfoo + `llm-rubric` will
+wrap the report-quality layer once a model exists.
+## Metrics
+- **json_valid** — plan conforms to the schema (`eval/metrics.py:PLAN_SCHEMA`).
+- **op_f1 / op_r** — micro-F1 / recall over `(column, operation)` pairs vs gold.
+- **canon_f1 / canon_r** — micro-F1 / recall over `(column, raw→canonical)` mapping
+  pairs. *This is the fuzzy skill rules can't do — the whole reason for the model.*
+- **recovery** — fraction of clean-reference cells recovered by executing the plan.
+## Baseline (measured) and the goalpost
+Two reference systems frame every run:
+- **ORACLE** = the gold plan → the ceiling.
+- **HEURISTIC** (`scrubdata.mock_plan`) = the rule-based baseline the model must beat.
+Measured on the frozen 300-example gold set (`eval/gold.jsonl`, **value_counts/aggregation
+format**):
+| system | json_valid | op_f1 | canon_f1 | canon_r | recovery |
+|---|---|---|---|---|---|
+| ORACLE (gold) | 1.000 | 1.000 | 1.000 | 1.000 | **1.000** |
+| HEURISTIC (baseline) | 1.000 | 0.932 | **0.189** | 0.129 | **0.637** |
+**Reading:** with case-folding + typo-clustering the heuristic does the *easy*
+canonicalization (collapse to most-frequent surface), but it's still ~blind to
+**alias/semantic** canonicalization (`USA`→`United States`, `NYC`→`New York`) — canon_f1
+0.19 vs the oracle's 1.0. That gap is the fine-tuned model's job. (Earlier, on the old
+sample-rows format, a fine-tune reached canon_f1 0.86 vs a big vanilla model's 0.45 —
+proving small-aligned > big-generic; the v4 retrain re-establishes this on the new format.)
+### 🎯 Goalpost for the fine-tuned Qwen3-4B
+| metric | baseline | **target** | ceiling |
+|---|---|---|---|
+| json_valid | 1.000 | **≥ 0.99** | 1.000 |
+| op_f1 | 0.932 | **≥ 0.98** | 1.000 |
+| canon_f1 | 0.189 | **≥ 0.85** | 1.000 |
+| recovery | 0.637 | **≥ 0.95** | 1.000 |
+A fine-tune that hits these clearly beats the (now stronger) heuristic and approaches the
+oracle — the headline being **canon_f1 0.133 → ≥0.85** (alias-level canonicalization) and
+**recovery 0.627 → ≥0.95**.
+## Plugging in the model
+`evaluate(planner, gold)` takes any `planner(dirty_df, gold_plan) -> plan dict`. For
+the model, wrap inference (build prompt via `scrubdata.prompt`, parse JSON) and pass it
+in alongside the two reference systems. Track the table every fine-tune iteration; the
+per-metric delta vs baseline is the cheap regression signal.
+## Layer 2 — real out-of-distribution data (`uv run eval/run_real.py`)
+Raha `hospital` (1000×20, row-aligned dirty/clean). Errors are char-substitution typos
+(`birminghxm`→`birmingham`) — only ~2.5% of cells. Scored with the Raha **repair**
+protocol (the right metric when data is already mostly correct):
+| system | recovery | repair_recall | repair_prec | broken |
+|---|---|---|---|---|
+| NO-OP (dirty as-is) | 0.975 | 0.000 | 0.000 | 0 |
+| HEURISTIC (baseline) | 0.880 | **0.293** | 0.065 | 2041 |
+(Typo-clustering now fixes ~29% of the real char-substitution errors — up from 0. The
+model should push repair_recall higher and improve repair_prec.)
+**Reading (honest + important):** the rule heuristic fixes **0** typos. Its 2021 changed
+cells are **convention divergence, not errors** — our tool parses `100%`→`1.0` and
+reformats phones, which this benchmark stores as raw text. That's product value, so raw
+`recovery`/`broken` *understates* a standardizing tool on a foreign benchmark. The honest
+metric here is **`repair_recall`** — did we fix the actual char-substitution typos
+(`birminghxm`→`birmingham`)? The heuristic can't (scores 0); cluster-canonicalization is
+the model's job. Two takeaways:
+1. **The headline real-data metric is `repair_recall`** (error-fixing), not recovery.
+2. **Product feature surfaced:** offer a "preserve original formats" toggle — some users
+   want raw representation kept; standardizing is the default but should be reversible
+   (matches PRODUCT.md's trust contract).
+### 🎯 Real-data goalpost (fine-tuned model)
+| metric | NO-OP | HEURISTIC | **target** | note |
+|---|---|---|---|---|
+| **repair_recall** | 0.000 | 0.000 | **≥ 0.30** | the real test — fix typos via clustering |
+| repair_prec | 0.000 | 0.000 | **≥ 0.70** | of cells changed, fraction that fixed an error |
+| recovery | 0.975 | 0.874 | report-only | convention-sensitive; not a pass/fail gate |
+The model plugs into `_score(dirty, clean, model_output)` exactly like the heuristic.
+> Data auto-fetched to `data/real/hospital/` (gitignored). Add Flights/Beers/CleanML the
+> same way for breadth.
+## Scale: aggregation + agentic batching (validated)
+Cleaning *large* tables doesn't mean bigger prompts — it means reasoning over **patterns**:
+- **Aggregation** — the profiler sends per-column `value_counts` (`[value, frequency]`), so
+  the prompt size depends on DISTINCT values, not rows. Rare typos sit at the tail next to
+  their dominant canonical (`birminghxm`:1 vs `birmingham`:312) — visible at any scale.
+- **Column batching** — `scrubdata.model_planner.make_batched_planner` plans a wide table
+  in small column-batches, so a 20-column table never blows one prompt.
+**Validated** on the real Raha hospital table (1000×20) with a *vanilla* model (no retrain):
+**repair_recall 0.509** (fixed 259/509 typos), vs **0.000** for the old one-shot+sample-rows
+approach. The v4 fine-tune trains on this `value_counts` format.
+---
+## The wide suite (current north-star)
+The single-dataset hospital metric was retired as north-star (biased: one table,
+recall-only, convention-sensitive, abstain-blind). The current harness:
+- **`run_real_multi.py`** — 65-dataset suite (5 Raha real-error benchmarks + seeded
+  error injection over 15 harvested open-data domains), scored with a **churn-neutral**
+  metric (pure case/whitespace rewrites that don't restore gold count as nothing) and
+  aggregated as a **double macro** (error-type × domain, harmonic mean) so no single
+  table or error type dominates. Reports REAL vs INJECTED slices separately — injected
+  typos are in-distribution for frequency clustering by construction.
+- **`ablations.py`** — removes one grounding component at a time (reference, abstain,
+  ambiguity margin, case-match). Caught two metric artifacts (churn inflation,
+  reference-unsafe traps) now fixed and documented in the paper.
+- **`calibration.py`** — risk–coverage + ECE for the abstention confidence
+  (AURC 0.120; 90% precision at the default threshold, ≥95% at 0.91).
+- **`pii_leak.py`** — masking leak test: 0/360 residual detectable PII.
+- **`pii_slice.py`** — OOD PII column typing on Gretel test: 5/5 types, 0/7 FP.
+- **`inject.py`** — seeded, self-verifying error injectors (typo/OCR/case/whitespace)
+  that turn any clean table into validation data.
+Baselines include OpenRefine fingerprint + kNN clustering (`scrubdata/baselines.py`,
+with blocking, as the real tool uses). Full results & discussion: `docs/paper/`.

eval/__init__.py ADDED Viewed

	@@ -0,0 +1,12 @@

+"""Evaluation harness for the ScrubData planner.
+Measures any planner (`callable(dirty_df) -> plan dict`) against a held-out gold set:
+- JSON-schema validity of the plan
+- operation-level micro-F1 vs the gold plan
+- canonicalization mapping micro-F1 (the fuzzy skill rules can't do)
+- end-to-end cell-recovery (executor(dirty, plan) vs known-clean reference)
+Two reference systems frame every run:
+- HEURISTIC (`scrubdata.mock_plan`) = the baseline a fine-tuned model must beat.
+- ORACLE (the gold plan itself) = the goalpost ceiling (~100% by construction).
+"""

eval/ablations.py ADDED Viewed

	@@ -0,0 +1,64 @@

+"""Ablation suite — isolate each grounding component's contribution to the north-star.
+Each row turns ONE design decision off (via mock_plan's ground_cfg) and re-runs the wide
+validation suite. Shows what grounding / abstention / ambiguity-checking / case-matching each
+buy in F1 and (critically) in DAMAGE.
+    uv run python -m eval.ablations
+"""
+from __future__ import annotations
+from scrubdata.planner import mock_plan
+from .run_real_multi import evaluate_suite
+ABLATIONS = [
+    ("full (grounded)",            {}),
+    ("- grounding (freq-cluster)", {"use_reference": False}),
+    ("- abstain (map nearest)",    {"threshold": 0.0, "min_margin": 0.0}),
+    ("- ambiguity check",          {"min_margin": 0.0}),
+    ("- case match",               {"case_match": False}),
+]
+def main(seeds=(7, 17, 27), out: str | None = None) -> None:
+    def mean(xs):
+        xs = list(xs)
+        return sum(xs) / len(xs) if xs else 0.0
+    print(f"\n=== Ablation suite (wide validation suite, {len(seeds)} seeds) — each "
+          "removes ONE grounding component ===\n")
+    print(f"{'variant':<28}{'NORTH*':>9}{'REAL-F1':>9}{'INJ-F1':>8}{'damage':>9}{'abstain':>9}")
+    print("-" * 72)
+    rows = []
+    for name, cfg in ABLATIONS:
+        planner = (lambda df, c=cfg: mock_plan(df, ground_cfg=c))
+        per_seed = [evaluate_suite(planner, seed=s) for s in seeds]
+        r = {k: mean(p[k] for p in per_seed)
+             for k in ("north", "real", "injected", "damage", "abstain")}
+        mu = r["north"]
+        var = mean([(p["north"] - mu) ** 2 for p in per_seed])
+        r["north_ci"] = 1.96 * (var ** 0.5) / (len(per_seed) ** 0.5)
+        rows.append((name, r))
+        print(f"{name:<28}{r['north']:>9.3f}{r['real']:>9.3f}{r['injected']:>8.3f}"
+              f"{r['damage']:>9.3f}{r['abstain']:>9.3f}", flush=True)
+    full = rows[0][1]
+    print("\nDeltas vs full (what each component buys):")
+    for name, r in rows[1:]:
+        print(f"  {name:<28} ΔNORTH={r['north'] - full['north']:+.3f}  "
+              f"Δdamage={r['damage'] - full['damage']:+.3f}  Δabstain={r['abstain'] - full['abstain']:+.3f}")
+    if out:
+        import json
+        json.dump([{"variant": n, **r, "seeds": list(seeds)} for n, r in rows],
+                  open(out, "w"), indent=1)
+        print(f"rows written to {out}")
+    print("\nGrounding lifts F1; abstain + ambiguity-check cut DAMAGE; case-match avoids "
+          "convention damage. The combination is the contribution.")
+if __name__ == "__main__":
+    import argparse
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--out", type=str, default=None)
+    main(out=ap.parse_args().out)

eval/baselines_learned.py ADDED Viewed

	@@ -0,0 +1,145 @@

+"""WS4 learned-repair baselines: scoring + Jellyfish prompt construction.
+Both baselines bypass plan dicts (the executor is column-level by design; learned repair
+is per-cell) — they produce repaired DataFrames scored by the SAME churn-neutral
+`eval.run_real_multi.score` as every other row of the money table.
+* Baran: repaired CSVs come from eval/run_baran.py (pinned env). Score here:
+      uv run python -m eval.baselines_learned --score-baran
+* Jellyfish: prompts built here (unit-testable without a GPU), executed by
+  scripts/modal_jellyfish.py (vLLM on Modal), scored in-run with the same `score`.
+Jellyfish has NO repair task — we compose its two published cell-level tasks:
+error detection (yes/no per cell) then data imputation (infer the flagged cell with the
+attribute removed). Prompt templates are verbatim from the NECOUDBFM/Jellyfish-13B model
+card; this composition is OURS, not theirs (disclosed in the paper).
+"""
+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+SYSTEM_MESSAGE = ("You are an AI assistant that follows instruction extremely well. "
+                  "Help as much as you can.")
+_ED_TEMPLATE = (
+    "Your task is to determine if there is an error in the value of a specific "
+    "attribute within the whole record provided.\n"
+    "The attributes may include {attrs}.\n"
+    "Errors may include, but are not limited to, spelling errors, inconsistencies, "
+    "or values that don't make sense given the context of the whole record.\n"
+    "Record [{record}]\n"
+    "Attribute for Verification: [{col}: {val}]\n"
+    "Question: Is there an error in the value of {col}? "
+    "Choose your answer from: [Yes, No]."
+)
+_DI_TEMPLATE = (
+    "You are presented with a {keyword} record that is missing a specific attribute: "
+    "{col}.\n"
+    "Your task is to deduce or infer the value of {col} using the available "
+    "information in the record.\n"
+    "You may be provided with fields like {attrs} to help you in the inference.\n"
+    "Record: [{record}]\n"
+    "Based on the provided record, what would you infer is the value for the missing "
+    "attribute {col}?\n"
+    "Answer only the value of {col}."
+)
+def wrap_prompt(user_message: str) -> str:
+    """The Jellyfish-13B chat scaffold (verbatim from the model card)."""
+    return f"{SYSTEM_MESSAGE}\n\n### Instruction:\n\n{user_message}\n\n### Response:\n\n"
+def _serialize(record: dict, skip: str | None = None) -> str:
+    return ", ".join(f"{k}: {v}" for k, v in record.items() if k != skip)
+def ed_prompt(record: dict, col: str) -> str:
+    """Error-detection prompt (whole-record form) for one cell."""
+    return wrap_prompt(_ED_TEMPLATE.format(
+        attrs=", ".join(record.keys()), record=_serialize(record),
+        col=col, val=record[col]))
+def di_prompt(record: dict, col: str, keyword: str) -> str:
+    """Data-imputation prompt for a flagged cell — the attribute is REMOVED from the
+    serialized record so the model infers, not copies."""
+    attrs = [k for k in record.keys() if k != col]
+    return wrap_prompt(_DI_TEMPLATE.format(
+        keyword=keyword, col=col, attrs=", ".join(attrs),
+        record=_serialize(record, skip=col)))
+def parse_ed(text: str) -> bool:
+    """True = the model says the cell is erroneous."""
+    return text.strip().lower().lstrip("[").startswith("yes")
+def parse_di(text: str, original: str) -> str:
+    """Imputed value, or the original (abstain) when the answer is unusable —
+    empty, multi-line/rambling, or implausibly long for a cell."""
+    ans = text.strip().strip('"').strip()
+    if not ans or "\n" in ans or len(ans) > 80:
+        return original
+    return ans
+# ---------------------------------------------------------------- Baran scoring
+def score_baran(repaired_dir: str = "eval/results/baran",
+                out: str = "eval/results/baran_raha.json") -> dict:
+    """Score every <name>_seed<k>_repaired.csv against (dirty, clean) under the
+    identical churn-neutral protocol; macro REAL-F1 mean ± 95% CI over seeds."""
+    import collections
+    import pandas as pd
+    from .run_real_multi import _raha_pair, score
+    per_seed: dict[int, list] = collections.defaultdict(list)
+    per_ds = []
+    for p in sorted(Path(repaired_dir).glob("*_seed*_repaired.csv")):
+        name, seed = p.stem.rsplit("_repaired", 1)[0].rsplit("_seed", 1)
+        repaired = pd.read_csv(p, dtype=str, keep_default_na=False)
+        dirty, clean = _raha_pair(name)
+        m = score(dirty, clean, repaired)
+        per_seed[int(seed)].append(m)
+        per_ds.append({"name": name, "seed": int(seed), **{k: v for k, v in m.items()}})
+        print(f"  {name:<10} seed{seed}: F1={m['f1']:.3f} P={m['precision']:.3f} "
+              f"R={m['recall']:.3f} dmg={m['damage']:.3f}")
+    if not per_seed:
+        raise SystemExit(f"no repaired CSVs found in {repaired_dir}")
+    def mean(xs):
+        xs = list(xs)
+        return sum(xs) / len(xs) if xs else 0.0
+    seed_f1 = [mean(m["f1"] for m in ms) for ms in per_seed.values()]
+    mu = mean(seed_f1)
+    var = mean([(x - mu) ** 2 for x in seed_f1])
+    ci = 1.96 * (var ** 0.5) / (len(seed_f1) ** 0.5)
+    result = {
+        "system": "Baran (oracle detection, 20 gold labels)",
+        "real_f1": mu, "real_f1_ci": ci, "real_f1_per_seed": seed_f1,
+        "damage": mean(mean(m["damage"] for m in ms) for ms in per_seed.values()),
+        "precision": mean(mean(m["precision"] for m in ms) for ms in per_seed.values()),
+        "recall": mean(mean(m["recall"] for m in ms) for ms in per_seed.values()),
+        "n_seeds": len(per_seed), "per_dataset": per_ds,
+        "protocol_note": "upper bound: oracle error positions + 20 gold-labeled tuples "
+                         "(its package default); damage=0 by construction",
+    }
+    json.dump(result, open(out, "w"), indent=1)
+    print(f"\nBaran macro REAL-F1 {mu:.3f} ± {ci:.3f} (n={len(seed_f1)} seeds) -> {out}")
+    return result
+if __name__ == "__main__":
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--score-baran", action="store_true")
+    args = ap.parse_args()
+    if args.score_baran:
+        score_baran()

eval/calibration.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""Selective prediction / calibration study for grounded canonicalization.
+"Knowing when NOT to act" is the research contribution (and the AI-safety monitorability
+angle): instead of always emitting a canonical, the grounded reconciler attaches a
+CONFIDENCE and ABSTAINS below threshold. This module measures whether that confidence is
+trustworthy:
+  * Risk-Coverage curve + AURC — sort decisions by confidence; as we cover more (abstain
+    less) does risk rise gracefully? Low AURC = a good selective predictor.
+  * ECE (Expected Calibration Error) — does a confidence of 0.9 actually mean ~90% correct?
+  * Operating point — at our default threshold, what coverage and precision do we get, and
+    what threshold hits a target precision (e.g. 95%)?
+Probe = real cities sampled from the reference with injected typos (recoverable, gold known)
++ garbage TRAP strings (acting at all is an error). Reproducible (fixed seed).
+    uv run python -m eval.calibration
+"""
+from __future__ import annotations
+import random
+import string
+from scrubdata.reconcile import _norm, default_index
+def _typo(s: str, rng: random.Random) -> str:
+    if len(s) < 4:
+        return s + rng.choice(string.ascii_lowercase)
+    i = rng.randrange(1, len(s) - 1)
+    if not s[i].isalpha():
+        i = 1
+    pool = string.ascii_lowercase if s[i].islower() else string.ascii_uppercase
+    return s[:i] + rng.choice([c for c in pool if c != s[i].lower()]) + s[i + 1:]
+def build_probe(n_real: int = 500, n_trap: int = 150, seed: int = 5):
+    """(value, gold|None, kind) probes: real-city typos (recoverable) + garbage traps."""
+    idx = default_index()
+    cities = [c for bucket in idx._buckets.get("city", {}).values() for (c, _) in bucket]
+    rng = random.Random(seed)
+    probe = []
+    for c in rng.sample(cities, min(n_real, len(cities))):
+        probe.append((_typo(c, rng), c, "real"))
+    for _ in range(n_trap):
+        g = "".join(rng.choice(string.ascii_lowercase) for _ in range(rng.randint(5, 9)))
+        probe.append((g, None, "trap"))
+    rng.shuffle(probe)
+    return probe, idx
+def _scored(probe, idx, ctype="city"):
+    """(confidence, correct_if_acted) per probe."""
+    out = []
+    for value, gold, kind in probe:
+        b = idx.best(value, ctype)
+        conf = b[1] if b else 0.0
+        correct = bool(kind == "real" and b and _norm(b[0]) == _norm(gold))
+        out.append((conf, correct))
+    return out
+def risk_coverage(scored):
+    rows = sorted(scored, key=lambda x: -x[0])
+    n, cum = len(rows), 0
+    curve = []
+    for k, (conf, ok) in enumerate(rows, 1):
+        cum += int(ok)
+        curve.append((k / n, 1 - cum / k, conf))      # coverage, risk, confidence
+    aurc = sum(r for _, r, _ in curve) / len(curve)
+    return curve, aurc
+def ece(scored, bins: int = 10) -> float:
+    n = len(scored)
+    e = 0.0
+    for b in range(bins):
+        lo, hi = b / bins, (b + 1) / bins
+        bucket = [(c, ok) for c, ok in scored if (lo <= c < hi) or (b == bins - 1 and c == 1.0)]
+        if not bucket:
+            continue
+        conf = sum(c for c, _ in bucket) / len(bucket)
+        acc = sum(int(ok) for _, ok in bucket) / len(bucket)
+        e += len(bucket) / n * abs(conf - acc)
+    return e
+def operating_point(scored, threshold: float):
+    acted = [(c, ok) for c, ok in scored if c >= threshold]
+    coverage = len(acted) / len(scored)
+    precision = (sum(int(ok) for _, ok in acted) / len(acted)) if acted else 1.0
+    return coverage, precision
+def main() -> None:
+    probe, idx = build_probe()
+    scored = _scored(probe, idx)
+    curve, aurc = risk_coverage(scored)
+    e = ece(scored)
+    print(f"\n=== Selective prediction / calibration — grounded city reconciliation "
+          f"({len(probe)} probes: real typos + traps) ===\n")
+    print(f"  AURC (area under risk-coverage, lower=better) = {aurc:.4f}")
+    print(f"  ECE  (expected calibration error, lower=better) = {e:.4f}")
+    print("\n  Risk-Coverage operating points:")
+    print(f"  {'threshold':>10}{'coverage':>10}{'precision':>11}")
+    for t in (0.70, 0.78, 0.84, 0.90, 0.95, 1.00):
+        cov, prec = operating_point(scored, t)
+        print(f"  {t:>10.2f}{cov:>10.3f}{prec:>11.3f}")
+    # threshold achieving >=95% precision
+    best_t = next((t / 100 for t in range(70, 101)
+                   if operating_point(scored, t / 100)[1] >= 0.95), 1.0)
+    cov95, _ = operating_point(scored, best_t)
+    print(f"\n  -> for >=95% precision use threshold {best_t:.2f} (coverage {cov95:.3f}). "
+          "The confidence is trustworthy enough to ABSTAIN on the rest — the safety contract.")
+if __name__ == "__main__":
+    main()

eval/capture_plan_local.py ADDED Viewed

	@@ -0,0 +1,90 @@

+"""Capture a raw v6 model plan LOCALLY (Ollama Q8_0 GGUF) for a Raha dataset.
+Mirrors the Modal capture composition (scripts/modal_eval_v5.py --capture):
+make_batched_planner(base, batch_size=4), greedy, no grounded wrapper, no union —
+verification/union happen downstream (eval/raha_table.py, eval/precision_curve.py).
+DISCLOSED deltas vs the Modal captures: (1) Q8_0 GGUF on local Ollama instead of the
+bf16 merged adapter on A100 — quantization may shift individual mappings; (2) Ollama
+format=json instead of generate(suppress_tokens=[151657,151658]) — both exist solely
+to block the degenerate <tool_call> first token (without either, generation loops).
+Prereq: ollama pull hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8:Q8_0
+        ollama create scrubdata-ft -f notebooks/Modelfile
+    uv run python -m eval.capture_plan_local --dataset beers
+Writes eval/results/v6_<dataset>_raw_plan_localq8.json.
+"""
+from __future__ import annotations
+import argparse
+import json
+import time
+from pathlib import Path
+from scrubdata.model_planner import _extract_json, make_batched_planner
+from .run_real_multi import _raha_pair
+def make_json_constrained_planner(model: str, host: str = "http://localhost:11434",
+                                  timeout: int = 600):
+    """Local Ollama planner with format=json (grammar-constrained decoding)."""
+    import urllib.request
+    from scrubdata.profiler import profile_dataframe
+    from scrubdata.prompt import SYSTEM_PROMPT, build_user_prompt
+    def planner(dirty_df, *_):
+        user = build_user_prompt(profile_dataframe(dirty_df), dirty_df)
+        payload = {
+            "model": model, "stream": False, "format": "json",
+            "messages": [{"role": "system", "content": SYSTEM_PROMPT},
+                         {"role": "user", "content": user}],
+            "options": {"temperature": 0, "num_predict": 2000, "num_ctx": 16384},
+        }
+        req = urllib.request.Request(
+            host + "/api/chat", data=json.dumps(payload).encode(),
+            headers={"Content-Type": "application/json"})
+        try:
+            with urllib.request.urlopen(req, timeout=timeout) as r:
+                out = json.loads(r.read())["message"]["content"]
+        except Exception as e:  # noqa: BLE001
+            print(f"  batch failed: {str(e)[:80]}", flush=True)
+            return {"__error__": str(e)[:120]}
+        plan = _extract_json(out)
+        if plan is None:
+            print(f"  batch returned no JSON: {out[:80]!r}", flush=True)
+            return {"__error__": "no_json"}
+        plan.setdefault("table_operations", [])
+        plan.setdefault("columns", [])
+        plan.setdefault("flags", [])
+        return plan
+    return planner
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--dataset", required=True)
+    ap.add_argument("--model", default="scrubdata-ft")
+    ap.add_argument("--timeout", type=int, default=600)
+    args = ap.parse_args()
+    dirty, _clean = _raha_pair(args.dataset)   # same table the scorer sees
+    print(f"capturing plan: {args.dataset} ({len(dirty)} rows x {dirty.shape[1]} cols)",
+          flush=True)
+    t0 = time.time()
+    plan = make_batched_planner(make_json_constrained_planner(args.model, timeout=args.timeout),
+                                batch_size=4)(dirty)
+    dt = time.time() - t0
+    n_ops = sum(len(c.get("operations", [])) for c in plan.get("columns", []))
+    print(f"done in {dt:.0f}s — {len(plan.get('columns', []))} columns, {n_ops} ops")
+    out = (Path(__file__).resolve().parent / "results"
+           / f"v6_{args.dataset}_raw_plan_localq8.json")
+    json.dump(plan, open(out, "w"), indent=1)
+    print(f"written to {out}")
+if __name__ == "__main__":
+    main()

eval/contamination_probe.py ADDED Viewed

	@@ -0,0 +1,57 @@

+"""Memorization probe (W4.6): can a web-trained model complete benchmark rows verbatim?
+Legacy-public benchmarks (hospital et al., GitHub since 2019) sit inside every base
+model's training window; a HIGH verbatim-completion rate red-flags memorized gold.
+A low rate does not prove absence — the contamination statement stays assumption-based.
+Control: a date-stamped post-cutoff wild harvest (expected ~0).
+    uv run python -m eval.contamination_probe
+"""
+from __future__ import annotations
+import json
+import random
+import subprocess
+from pathlib import Path
+import pandas as pd
+ROOT = Path(__file__).resolve().parent.parent
+N_ROWS, N_GIVEN, MODEL = 30, 5, "glm-5.1"
+def probe(df: pd.DataFrame, name: str) -> dict:
+    rng = random.Random(0)
+    rows = rng.sample(range(len(df)), min(N_ROWS, len(df)))
+    cols = list(df.columns)
+    given, asked = cols[:N_GIVEN], cols[N_GIVEN:N_GIVEN + 4]
+    hits = total = 0
+    for r in rows:
+        prompt = (f"This is a row from the well-known public dataset '{name}'. "
+                  f"Complete the remaining fields EXACTLY as they appear in the dataset. "
+                  f"Known fields: "
+                  + "; ".join(f"{c}={df.iloc[r][c]}" for c in given)
+                  + ". Respond ONLY with: " + "; ".join(f"{c}=<value>" for c in asked))
+        out = subprocess.run(["oll", prompt, "--model", MODEL, "--max-tokens", "200"],
+                             capture_output=True, text=True, timeout=120).stdout.lower()
+        for c in asked:
+            total += 1
+            v = str(df.iloc[r][c]).strip().lower()
+            if v and v not in ("nan", "") and v in out:
+                hits += 1
+    return {"table": name, "rows": len(rows), "cells_asked": total,
+            "verbatim_hits": hits, "rate": round(hits / max(total, 1), 4)}
+def main() -> None:
+    hosp = pd.read_csv(ROOT / "data" / "real" / "hospital" / "clean.csv").astype(str)
+    wild = pd.read_csv(ROOT / "data" / "wild" / "glassdoor_jobs.csv").astype(str)
+    res = {"model": MODEL, "protocol": f"{N_ROWS} rows, {N_GIVEN} given cols, 4 asked cols, exact-substring match",
+           "probes": [probe(hosp, "hospital (Raha benchmark)"),
+                      probe(wild, "glassdoor_jobs (post-cutoff wild harvest)")]}
+    json.dump(res, open(ROOT / "eval" / "results" / "contamination_probe.json", "w"), indent=1)
+    print(json.dumps(res["probes"], indent=1))
+if __name__ == "__main__":
+    main()

eval/cross_scoring.py ADDED Viewed

	@@ -0,0 +1,294 @@

+"""B1 (W4.2) dual-metric cross-scoring on the 5 Raha real-error datasets.
+Scores every system under BOTH metric families, side by side:
+  * original  — the Raha/Baran cell-level repair protocol (Mahdavi & Abedjan,
+    PVLDB 13(12), p1948, Sec 6.1 + raha/dataset.py get_data_cleaning_evaluation):
+    values minimally normalized (html-unescape, whitespace collapse — their
+    value_normalizer), then RAW string equality; precision = exact-gold repairs /
+    cells changed; recall = exact-gold repairs / (dirty->clean diff); no
+    churn-neutrality, no case folding, no semantic tolerance, no damage metric.
+  * churn_neutral — our eval.run_real_multi.score (the scoring contract):
+    convention-normalized, churn ignored, damage reported.
+Systems: grounded (HEAD mock_plan), verified union (v6, tau=0.5 — identical plan
+files to eval.raha_table), OpenRefine fingerprint/kNN, and Baran at labeling
+budgets 0/5/20 (oracle detection; repaired CSVs from eval/run_baran.py, 3 seeds,
+seed-mean). Baran-from-CSV caveat: corrections equal to the dirty value vanish
+from the repaired-vs-dirty diff, so reconstructed |changed| is a lower bound on
+Baran's own output_size (precision an upper bound; recall exact).
+Also computes Kendall tau-b between the SYSTEM RANKINGS induced by the two F1s
+(per dataset + macro), and a calibration block: our Baran oracle+20 repro vs the
+published Table 3 "Baran" row (verified from the PVLDB PDF; see PUBLISHED below).
+Acceptance: the churn-neutral rows must reproduce eval/results/raha_per_dataset.json
+exactly (checked, hard-fails otherwise).
+    uv run python -m eval.cross_scoring
+Writes eval/results/cross_scoring.json and prints LaTeX rows.
+"""
+from __future__ import annotations
+import html
+import json
+import re
+from pathlib import Path
+import pandas as pd
+from scrubdata.baselines import openrefine_fingerprint_plan, openrefine_knn_plan
+from scrubdata.executor import apply_plan
+from scrubdata.planner import mock_plan
+from scrubdata.verifier import union_plans, verify_plan
+from .precision_curve import _repairs_only
+from .raha_table import TAU, UNION_PLANS, _gen_plan
+from .run_real_multi import RAHA, _cell_only, _raha_pair, score
+RESULTS = Path(__file__).resolve().parent / "results"
+BARAN_DIRS = {0: RESULTS / "baran_n0", 5: RESULTS / "baran_n5", 20: RESULTS / "baran"}
+# Baran PVLDB'20 Table 3, row "Baran" (no TL): complete set of data errors given as
+# input (= oracle detection), labeling budget 20, mean of 10 runs. Verified by reading
+# vldb.org/pvldb/vol13/p1948-mahdavi.pdf p1957 (2026-06-12). movies_1 is not evaluated
+# in the paper (its real-error sets are hospital/flights/address/beers/rayyan/it/tax).
+PUBLISHED = {"hospital": {"precision": 0.88, "recall": 0.86, "f1": 0.87},
+             "flights": {"precision": 1.00, "recall": 1.00, "f1": 1.00},
+             "beers": {"precision": 0.91, "recall": 0.89, "f1": 0.90},
+             "rayyan": {"precision": 0.76, "recall": 0.40, "f1": 0.52}}
+def _norm(v: str) -> str:
+    """raha.dataset.Dataset.value_normalizer, verbatim semantics."""
+    v = html.unescape(str(v))
+    v = re.sub("[\t\n ]+", " ", v, re.UNICODE)
+    return v.strip("\t\n ")
+def baran_score(dirty: pd.DataFrame, clean: pd.DataFrame, out: pd.DataFrame) -> dict:
+    """The original Raha/Baran repair metric over a repaired DataFrame: minimal
+    normalization then raw equality; changed = repaired-vs-dirty diff."""
+    n = min(len(dirty), len(out), len(clean))
+    errors = changed = tp = 0
+    for j, col in enumerate(dirty.columns):
+        present = col in out.columns
+        for i in range(n):
+            dv, cv = _norm(dirty.iat[i, j]), _norm(clean.iat[i, j])
+            ov = _norm(out.iloc[i][col]) if present else dv
+            err, chg = dv != cv, ov != dv
+            errors += err
+            changed += chg
+            tp += chg and err and ov == cv
+    p = tp / changed if changed else 0.0
+    r = tp / errors if errors else 0.0
+    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
+    return {"f1": f1, "precision": p, "recall": r,
+            "_errors": errors, "_changed": changed, "_tp": tp}
+def _both(dirty, clean, out) -> dict:
+    m = score(dirty, clean, out)
+    return {"original": baran_score(dirty, clean, out),
+            "churn_neutral": {k: m[k] for k in
+                              ("f1", "precision", "recall", "damage",
+                               "_errors", "_changed", "_fixed")}}
+def kendall_tau(xs, ys) -> float:
+    """Kendall tau-b (tie-corrected), stdlib."""
+    n = len(xs)
+    n0, n1, n2, nc, nd = n * (n - 1) // 2, 0, 0, 0, 0
+    for i in range(n):
+        for j in range(i + 1, n):
+            a, b = xs[i] - xs[j], ys[i] - ys[j]
+            n1 += a == 0
+            n2 += b == 0
+            if a != 0 and b != 0:
+                nc += (a > 0) == (b > 0)
+                nd += (a > 0) != (b > 0)
+    denom = ((n0 - n1) * (n0 - n2)) ** 0.5
+    return (nc - nd) / denom if denom else 0.0
+def _mean_rows(rows: list[dict]) -> dict:
+    return {k: sum(r[k] for r in rows) / len(rows) for k in rows[0]}
+def main() -> None:
+    out = {"protocol": {
+        "original": "Raha/Baran convention: value_normalizer (html-unescape + "
+                    "whitespace collapse) then raw string equality; P = exact-gold "
+                    "repairs / changed cells, R = exact-gold repairs / (dirty->clean "
+                    "diff); no churn-neutrality, no damage",
+        "churn_neutral": "eval.run_real_multi.score — the scoring contract",
+        "baran_rows": "oracle error positions + n gold labels, 3 seeds, seed-mean; "
+                      "reconstructed from repaired CSVs (no-op corrections vanish: "
+                      "|changed| lower-bounds Baran's output_size)",
+        "movies_1": "first 2000 rows (_raha_pair), as everywhere in the suite"},
+        "systems": {}}
+    deterministic = [("grounded", mock_plan),
+                     ("openrefine_fingerprint", openrefine_fingerprint_plan),
+                     ("openrefine_knn", openrefine_knn_plan)]
+    for label, planner in deterministic:
+        rows = []
+        for name, _dom in RAHA:
+            dirty, clean = _raha_pair(name)
+            cleaned, _ = apply_plan(dirty, _cell_only(planner(dirty)))
+            m = _both(dirty, clean, cleaned)
+            rows.append({"dataset": name, **m})
+            print(f"  {label:<24}{name:<10} orig={m['original']['f1']:.3f} "
+                  f"cn={m['churn_neutral']['f1']:.3f}", flush=True)
+        out["systems"][label] = {"per_dataset": rows}
+    rows = []
+    for name, _dom in RAHA:
+        base = (json.load(open(UNION_PLANS[name])) if name in UNION_PLANS
+                else _gen_plan(name))
+        dirty, clean = _raha_pair(name)
+        plan = _repairs_only(union_plans(verify_plan(dirty, base, tau=TAU),
+                                         mock_plan(dirty)))
+        cleaned, _ = apply_plan(dirty, plan)
+        m = _both(dirty, clean, cleaned)
+        rows.append({"dataset": name, **m})
+        print(f"  {'verified_union':<24}{name:<10} orig={m['original']['f1']:.3f} "
+              f"cn={m['churn_neutral']['f1']:.3f}", flush=True)
+    out["systems"]["verified_union_v6_tau0.5"] = {"per_dataset": rows}
+    for n_labels, d in BARAN_DIRS.items():
+        rows = []
+        for name, _dom in RAHA:
+            dirty, clean = _raha_pair(name)
+            per_seed = []
+            for p in sorted(d.glob(f"{name}_seed*_repaired.csv")):
+                repaired = pd.read_csv(p, dtype=str, keep_default_na=False)
+                per_seed.append(_both(dirty, clean, repaired))
+            m = {"original": _mean_rows([s["original"] for s in per_seed]),
+                 "churn_neutral": _mean_rows([s["churn_neutral"] for s in per_seed])}
+            rows.append({"dataset": name, "n_seeds": len(per_seed), **m})
+            print(f"  {'baran_oracle%d' % n_labels:<24}{name:<10} "
+                  f"orig={m['original']['f1']:.3f} "
+                  f"cn={m['churn_neutral']['f1']:.3f}", flush=True)
+        out["systems"][f"baran_oracle{n_labels}"] = {"per_dataset": rows}
+    for sys in out["systems"].values():
+        for fam in ("original", "churn_neutral"):
+            sys[f"macro_f1_{fam}"] = _mean_rows(
+                [r[fam] for r in sys["per_dataset"]])["f1"]
+    # acceptance: churn-neutral rows == raha_per_dataset.json (exact)
+    ref = json.load(open(RESULTS / "raha_per_dataset.json"))
+    checks = []
+    for key, ref_key in [("grounded", "grounded"),
+                         ("openrefine_fingerprint", "openrefine_fingerprint"),
+                         ("openrefine_knn", "openrefine_knn"),
+                         ("verified_union_v6_tau0.5", "verified_union_v6_tau0.5"),
+                         ("baran_oracle20", "baran_oracle20")]:
+        for got, want in zip(out["systems"][key]["per_dataset"],
+                             ref["systems"][ref_key]["per_dataset"]):
+            for k in ("f1", "precision", "recall", "damage"):
+                ok = abs(got["churn_neutral"][k] - want[k]) < 1e-9
+                checks.append(ok)
+                if not ok:
+                    print(f"MISMATCH {key}/{got['dataset']}/{k}: "
+                          f"{got['churn_neutral'][k]} vs {want[k]}")
+    out["acceptance"] = {"vs": "raha_per_dataset.json", "n_cells": len(checks),
+                         "pass": all(checks)}
+    print(f"\nacceptance: {sum(checks)}/{len(checks)} cells match "
+          f"-> {'PASS' if all(checks) else 'FAIL'}")
+    if not all(checks):
+        raise SystemExit("acceptance FAILED")
+    # Kendall tau-b between system rankings under the two F1s
+    primary = ["grounded", "verified_union_v6_tau0.5", "openrefine_fingerprint",
+               "openrefine_knn", "baran_oracle20"]
+    extended = primary + ["baran_oracle0", "baran_oracle5"]
+    taus = {}
+    for label, sysset in [("primary", primary), ("extended", extended)]:
+        per_ds = {}
+        for i, (name, _dom) in enumerate(RAHA):
+            xs = [out["systems"][s]["per_dataset"][i]["original"]["f1"] for s in sysset]
+            ys = [out["systems"][s]["per_dataset"][i]["churn_neutral"]["f1"] for s in sysset]
+            per_ds[name] = kendall_tau(xs, ys)
+        xs = [out["systems"][s]["macro_f1_original"] for s in sysset]
+        ys = [out["systems"][s]["macro_f1_churn_neutral"] for s in sysset]
+        taus[label] = {"systems": sysset, "per_dataset": per_ds,
+                       "macro": kendall_tau(xs, ys)}
+        print(f"tau-b ({label}): macro={taus[label]['macro']:.3f}  " +
+              "  ".join(f"{n}={t:.3f}" for n, t in per_ds.items()))
+    out["kendall_tau_b"] = taus
+    # calibration: our Baran oracle+20 repro (ORIGINAL metric) vs published Table 3
+    cal = []
+    b20 = {r["dataset"]: r for r in out["systems"]["baran_oracle20"]["per_dataset"]}
+    for name, pub in PUBLISHED.items():
+        ours = b20[name]["original"]
+        cal.append({"dataset": name, "published_f1": pub["f1"],
+                    "published_precision": pub["precision"],
+                    "published_recall": pub["recall"],
+                    "repro_f1": ours["f1"], "repro_precision": ours["precision"],
+                    "repro_recall": ours["recall"],
+                    "delta_f1": ours["f1"] - pub["f1"]})
+        print(f"calibration {name:<10} published F1={pub['f1']:.2f} "
+              f"repro F1={ours['f1']:.3f} (d={ours['f1'] - pub['f1']:+.3f})")
+    out["calibration"] = {
+        "source": "Mahdavi & Abedjan, PVLDB 13(12) p1948, Table 3 row 'Baran' "
+                  "(no TL): complete error set given (oracle detection), budget 20, "
+                  "mean of 10 runs; PDF read 2026-06-12",
+        "notes": "their runs: full datasets, 10 label seeds, Wikipedia value models "
+                 "available in package but Table-3 row is without TL; ours: 3 label "
+                 "seeds, no pretraining, movies_1 not in their paper; our "
+                 "churn-neutral macro for this row is the paper's 0.811",
+        "rows": cal}
+    dest = RESULTS / "cross_scoring.json"
+    json.dump(out, open(dest, "w"), indent=1)
+    print(f"written to {dest}")
+    print(latex(out))
+LABELS = [("grounded", "Grounded (ours, deterministic)"),
+          ("verified_union_v6_tau0.5", r"Verified union (v6, $\tau{=}0.5$)"),
+          ("openrefine_fingerprint", "OpenRefine fingerprint"),
+          ("openrefine_knn", "OpenRefine kNN"),
+          ("baran_oracle20", r"Baran (oracle det.\ + 20 labels)")]
+def latex(out: dict) -> str:
+    """Booktabs rows: per system x dataset, original P/R/F1 next to churn-neutral
+    P/R/F1 + damage."""
+    L = [r"\begin{tabular}{llrrrrrrr}", r"\toprule",
+         r" & & \multicolumn{3}{c}{Original (Baran) metric} & "
+         r"\multicolumn{4}{c}{Churn-neutral (ours)} \\",
+         r"\cmidrule(lr){3-5}\cmidrule(lr){6-9}",
+         r"System & Dataset & Prec. & Rec. & F1 & Prec. & Rec. & F1 & Damage \\",
+         r"\midrule"]
+    for key, label in LABELS:
+        for i, r in enumerate(out["systems"][key]["per_dataset"]):
+            o, c = r["original"], r["churn_neutral"]
+            L.append(f"{label if i == 0 else ''} & "
+                     f"{r['dataset'].replace('_', r'\_')} & "
+                     f"{o['precision']:.3f} & {o['recall']:.3f} & {o['f1']:.3f} & "
+                     f"{c['precision']:.3f} & {c['recall']:.3f} & {c['f1']:.3f} & "
+                     f"{c['damage']:.3f} \\\\")
+        L.append(f" & \\emph{{macro}} &  &  & "
+                 f"\\emph{{{out['systems'][key]['macro_f1_original']:.3f}}} &  &  & "
+                 f"\\emph{{{out['systems'][key]['macro_f1_churn_neutral']:.3f}}} &  \\\\")
+        L.append(r"\midrule")
+    t = out["kendall_tau_b"]["primary"]
+    L.append(r"\multicolumn{9}{l}{Kendall $\tau_b$ between system rankings: "
+             f"macro {t['macro']:.2f}; per dataset " +
+             ", ".join(f"{n.replace('_', r'\_')} {v:.2f}"
+                       for n, v in t["per_dataset"].items()) + r"} \\")
+    cal = ", ".join(f"{r['dataset'].replace('_', r'\_')} {r['repro_f1']:.3f} vs "
+                    f"{r['published_f1']:.2f}" for r in out["calibration"]["rows"])
+    L.append(r"\multicolumn{9}{l}{Calibration, original metric (our Baran oracle+20 "
+             r"repro vs PVLDB'20 Table~3): " + cal + r"} \\")
+    L.append(r"\bottomrule")
+    L.append(r"\end{tabular}")
+    return "\n".join(L)
+if __name__ == "__main__":
+    main()

eval/degenerate.py ADDED Viewed

	@@ -0,0 +1,172 @@

+"""W4.3 + W4.4 — degenerate baselines + cost-weighted damage over the paired sets.
+Four scorer-pinning policies over the same dirty/clean pairs eval/paired_bench.py
+walks: no-op (output = dirty), abstain-all (no-op + flags; score-identical at the
+cell level — the repair metric is flag-blind by design, flags surface in audit
+metrics), random-edit (seeded vandalism: 5% of cells replaced with another value
+from the same column) and oracle (output = clean, headers realigned to dirty's —
+23/42 pairs differ in header naming only; cell alignment is positional). They pin
+the metric's floor (no-op F1 = 0, damage = 0), ceiling (oracle F1 = 1, damage = 0)
+and show it punishes vandalism. Also reruns the SHIPPED pipeline (mock_plan) to
+capture raw fix/damage cell counts and reports Effective-Reliability-style
+cost-weighted scores score_c = fixes - c*damage_cells for c in {1, 5, 10}.
+    uv run python -m eval.degenerate
+Writes eval/results/degenerate.json + docs/DEGENERATE_BASELINES.md. Per-pair rows
+are cached incrementally (eval/results/degenerate_pairs.json) so a killed run
+resumes where it stopped.
+"""
+from __future__ import annotations
+import argparse
+import json
+import random
+import time
+from pathlib import Path
+from scrubdata.executor import apply_plan
+from scrubdata.planner import mock_plan
+from .paired_bench import _load, pairs
+from .run_real_multi import _cell_only, score
+ROOT = Path(__file__).resolve().parent.parent
+EDIT_FRAC = 0.05
+SEED = 7
+COSTS = (1, 5, 10)
+def _noop(dirty, clean):
+    return dirty
+def _abstain_all(dirty, clean):
+    return dirty.copy()          # + flags conceptually; the cell metric is flag-blind
+def _random_edit(dirty, clean, seed=SEED):
+    rng = random.Random(seed)
+    out = dirty.copy()
+    n, m = out.shape
+    uniq = [list(dict.fromkeys(out.iloc[:, j])) for j in range(m)]
+    for idx in rng.sample(range(n * m), max(1, int(n * m * EDIT_FRAC))):
+        i, j = divmod(idx, m)
+        alts = [v for v in uniq[j] if v != out.iat[i, j]]
+        if alts:
+            out.iat[i, j] = rng.choice(alts)
+    return out
+def _oracle(dirty, clean):
+    out = clean.copy()
+    out.columns = dirty.columns  # header-naming variants only; alignment is positional
+    return out
+def _shipped(dirty, clean):
+    return apply_plan(dirty, _cell_only(mock_plan(dirty)))[0]
+POLICIES = [("no-op", _noop), ("abstain-all", _abstain_all),
+            ("random-edit", _random_edit), ("oracle", _oracle),
+            ("shipped", _shipped)]
+def _mean(xs):
+    xs = list(xs)
+    return sum(xs) / len(xs) if xs else 0.0
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--only", default=None)
+    ap.add_argument("--out", default="eval/results/degenerate.json")
+    ap.add_argument("--cache", default="eval/results/degenerate_pairs.json")
+    args = ap.parse_args()
+    cache = json.load(open(args.cache)) if Path(args.cache).exists() else {}
+    for p in pairs():
+        if args.only and p.name != args.only:
+            continue
+        if p.name in cache:
+            continue
+        try:
+            dirty, clean = _load(p)
+        except Exception as e:  # noqa: BLE001
+            print(f"  {p.name}: LOAD FAILED {type(e).__name__}")
+            continue
+        entry = {}
+        for name, policy in POLICIES:
+            t0 = time.perf_counter()
+            m = score(dirty, clean, policy(dirty, clean))
+            n = min(len(dirty), len(clean))
+            clean_cells = n * dirty.shape[1] - m["_errors"]
+            entry[name] = {
+                "name": p.name, "errors": m["_errors"],
+                "f1": m["f1"], "precision": m["precision"], "recall": m["recall"],
+                "damage": m["damage"], "fixed": m["_fixed"], "changed": m["_changed"],
+                "damage_cells": round(m["damage"] * clean_cells),
+                "sec": round(time.perf_counter() - t0, 1)}
+        cache[p.name] = entry
+        json.dump(cache, open(args.cache, "w"), indent=1)
+        print(f"  {p.name:<46} " + " ".join(
+            f"{name}={entry[name]['f1']:.3f}" for name, _ in POLICIES), flush=True)
+    res = {name: [cache[k][name] for k in sorted(cache)] for name, _ in POLICIES}
+    out = {"n_pairs": len(res["no-op"]), "edit_frac": EDIT_FRAC, "seed": SEED,
+           "policies": {}, "acceptance": {}}
+    for name, _ in POLICIES:
+        rows = res[name]
+        E, F, D = (sum(r[k] for r in rows) for k in ("errors", "fixed", "damage_cells"))
+        out["policies"][name] = {
+            "macro": {k: round(_mean(r[k] for r in rows), 4)
+                      for k in ("f1", "precision", "recall", "damage")},
+            "micro": {"errors": E, "fixed": F, "changed": sum(r["changed"] for r in rows),
+                      "damage_cells": D},
+            "score_c": {f"c={c}": {"raw": F - c * D,
+                                   "per_error": round((F - c * D) / E, 4)}
+                        for c in COSTS},
+            "sec": round(sum(r["sec"] for r in rows), 1),
+            "per_pair": rows}
+    bad_oracle = [r["name"] for r in res["oracle"] if r["f1"] != 1.0]
+    bad_noop = [r["name"] for r in res["no-op"] if r["damage"] != 0.0]
+    out["acceptance"] = {"oracle_f1_all_exactly_1": not bad_oracle,
+                         "noop_damage_all_exactly_0": not bad_noop,
+                         "violations": {"oracle": bad_oracle, "no-op": bad_noop}}
+    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
+    json.dump(out, open(args.out, "w"), indent=1)
+    P = out["policies"]
+    L = ["# Degenerate baselines + cost-weighted damage (W4.3 + W4.4)", "",
+         f"Same {out['n_pairs']} dirty/clean pairs as `eval/paired_bench.py`, scored with "
+         "`run_real_multi.score()` (churn-neutral F1 + damage). The degenerate policies pin",
+         "the metric: no-op = floor (F1 0, damage 0), oracle = ceiling (F1 1, damage 0),",
+         "random-edit (seeded, 5% of cells) = vandalism the metric must punish. Abstain-all",
+         "is score-identical to no-op — the repair metric is flag-blind by design.", "",
+         "| policy | macro F1 | macro P | macro R | macro damage | fixed | damage cells |",
+         "|---|---|---|---|---|---|---|"]
+    for name, _ in POLICIES:
+        ma, mi = P[name]["macro"], P[name]["micro"]
+        L.append(f"| {name} | {ma['f1']:.3f} | {ma['precision']:.3f} | {ma['recall']:.3f} "
+                 f"| {ma['damage']:.4f} | {mi['fixed']} | {mi['damage_cells']} |")
+    L += ["", "## Cost-weighted scores (Effective-Reliability style, W4.4)", "",
+          "score_c = fixes − c·damage_cells, micro-summed over all pairs; per-error =",
+          f"score_c / {P['shipped']['micro']['errors']} total benchmark errors.", "",
+          "| policy | " + " | ".join(f"c={c} (per-error)" for c in COSTS) + " |",
+          "|---|" + "---|" * len(COSTS)]
+    for name, _ in POLICIES:
+        sc = P[name]["score_c"]
+        L.append(f"| {name} | " + " | ".join(
+            f"{sc[f'c={c}']['raw']} ({sc[f'c={c}']['per_error']:+.3f})" for c in COSTS) + " |")
+    a = out["acceptance"]
+    L += ["", f"Acceptance: oracle F1 = 1.0 on all pairs: **{a['oracle_f1_all_exactly_1']}** · "
+          f"no-op damage = 0.0 on all pairs: **{a['noop_damage_all_exactly_0']}**",
+          f"Repro: `uv run python -m eval.degenerate` (seed {SEED}, edit fraction {EDIT_FRAC})."]
+    (ROOT / "docs" / "DEGENERATE_BASELINES.md").write_text("\n".join(L) + "\n")
+    print(f"{out['n_pairs']} pairs x {len(POLICIES)} policies -> {args.out} "
+          "+ docs/DEGENERATE_BASELINES.md")
+    print("acceptance:", out["acceptance"])
+if __name__ == "__main__":
+    main()

eval/diagnose_model.py ADDED Viewed

	@@ -0,0 +1,91 @@

+"""Diagnose vanilla-model failures: truncation vs genuine schema violation.
+Runs N examples through an Ollama Cloud model, categorizing each output:
+  empty / no_json / truncated / json_but_schema_invalid / valid
+and reading `oll`'s stderr token counts to detect output hitting the cap.
+    uv run eval/diagnose_model.py --n 12 --model glm-5.1 --max-tokens 8000
+"""
+from __future__ import annotations
+import argparse
+import json
+import random
+import re
+import subprocess
+from collections import Counter
+from jsonschema import Draft202012Validator
+from scrubdata.prompt import SYSTEM_PROMPT, build_user_prompt
+from scrubdata.profiler import profile_dataframe
+from training.generate import make_example
+from .metrics import PLAN_SCHEMA
+_V = Draft202012Validator(PLAN_SCHEMA)
+_TOK = re.compile(r"out\s+(\d+)\s*tok", re.I)
+def _call(user: str, model: str, max_tokens: int):
+    r = subprocess.run(
+        ["oll", "--model", model, "--system", SYSTEM_PROMPT,
+         "--max-tokens", str(max_tokens), "--temperature", "0"],
+        input=user, capture_output=True, text=True, timeout=300)
+    out_tok = None
+    m = _TOK.search(r.stderr or "")
+    if m:
+        out_tok = int(m.group(1))
+    return r.stdout, out_tok
+def _categorize(out: str, out_tok: int | None, max_tokens: int):
+    s = out.strip()
+    if not s:
+        return "empty", None
+    i, j = s.find("{"), s.rfind("}")
+    if i == -1:
+        return "no_json", None
+    near_cap = out_tok is not None and out_tok >= max_tokens - 50
+    if j < i:
+        return ("truncated" if near_cap else "no_close_brace"), None
+    try:
+        plan = json.loads(s[i:j + 1])
+    except json.JSONDecodeError:
+        return ("truncated" if near_cap else "malformed_json"), None
+    errs = sorted(_V.iter_errors(plan), key=lambda e: e.path)
+    if not errs:
+        return "valid", None
+    return "schema_invalid", errs[0].message[:90]
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--n", type=int, default=12)
+    ap.add_argument("--model", type=str, default="glm-5.1")
+    ap.add_argument("--max-tokens", type=int, default=8000)
+    ap.add_argument("--seed", type=int, default=4242)
+    args = ap.parse_args()
+    rng = random.Random(args.seed)
+    cats = Counter()
+    print(f"Diagnosing {args.model} @ max_tokens={args.max_tokens} on {args.n} examples\n")
+    for k in range(args.n):
+        ex = make_example(rng)
+        user = build_user_prompt(profile_dataframe(ex["dirty_df"]), ex["dirty_df"])
+        out, out_tok = _call(user, args.model, args.max_tokens)
+        cat, detail = _categorize(out, out_tok, args.max_tokens)
+        cats[cat] += 1
+        print(f"  ex{k:2d}: {cat:<16} out_tok={out_tok}"
+              + (f"  [{detail}]" if detail else ""))
+    print("\nBreakdown:", dict(cats))
+    valid = cats.get("valid", 0)
+    trunc = cats.get("truncated", 0)
+    print(f"valid={valid}/{args.n} ({valid/args.n:.0%}) | truncated={trunc} "
+          f"| schema_invalid={cats.get('schema_invalid', 0)}")
+if __name__ == "__main__":
+    main()

eval/equivalence.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""W2.d — TOST equivalence statistics for the SFT null (the bounded negative claim).
+Operationalizes "weight interventions did not move held-out repair": paired
+per-dataset GEN-F1 deltas (retrain minus champion v6) over the 3 held-out EVAL
+sources x the 5-retrain SFT series (challenger seed31, v7 seed32, v8 seed33,
+v9 seed34, v10 seed35), pooled (n=15). DISCLOSED granularity: the retrain series
+was scored per held-out SOURCE only (eval/results/generalization_*.json) — the
+42-pair paired bench exists for the shipped pipeline, not per retrain — so the
+unit here is per-dataset, not per-pair, and within-retrain deltas are clustered
+(flights/rayyan deltas are near-identical across retrains). A retrain-level
+robustness check (n=5 macro deltas, one per retrain) is reported alongside.
+PRE-REGISTERED (docs/ROADMAP_PUBLICATION.md W2.d, before this analysis ran):
+SESOI delta = +/-0.05 GEN-F1, justified as smaller than the gain deterministic
+grounding provides. TOST per Lakens'17: two one-sided t-tests against the SESOI
+bounds; equivalence p = max of the two. Bootstrap: 10k resamples, seed 42, 90% CI.
+    uv run python -m eval.equivalence
+Writes eval/results/equivalence.json.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+import numpy as np
+from scipy import stats
+RESULTS = Path(__file__).resolve().parent / "results"
+SESOI = 0.05            # pre-registered (roadmap W2.d) — do not change post hoc
+N_BOOT = 10_000
+SEED = 42
+CHAMPION = "generalization_champion.json"           # champion v6/seed21 (union)
+RETRAINS = [                                        # the five SFT retrains (paper sec:negative)
+    ("generalization_challenger.json", "challenger seed31"),
+    ("generalization_v7.json", "v7 seed32 (unicode-punct archetype)"),
+    ("generalization_v8.json", "v8 seed33 (+109k harvested alias vocabs)"),
+    ("generalization_v9.json", "v9 seed34 (+MusicBrainz hints, gidcl pairs)"),
+    ("generalization_v10.json", "v10 seed35 (suspects-contract)"),
+]
+def _per_source_f1(fname: str) -> dict[str, float]:
+    rec = json.loads((RESULTS / fname).read_text())[0]
+    return {s["source"]: s["f1"] for s in rec["per_source"]}, rec["gen_f1"]
+def _tost(deltas: np.ndarray) -> dict:
+    """Two one-sided t-tests against [-SESOI, +SESOI]; equivalence p = max."""
+    p_lo = stats.ttest_1samp(deltas, -SESOI, alternative="greater").pvalue
+    p_hi = stats.ttest_1samp(deltas, +SESOI, alternative="less").pvalue
+    return {"p_lower": float(p_lo), "p_upper": float(p_hi),
+            "p_tost": float(max(p_lo, p_hi)), "n": int(len(deltas)),
+            "mean": float(deltas.mean()), "sd": float(deltas.std(ddof=1))}
+def main() -> dict:
+    champ, champ_macro = _per_source_f1(CHAMPION)
+    pooled, per_retrain = [], []
+    for fname, label in RETRAINS:
+        ps, macro = _per_source_f1(fname)
+        assert set(ps) == set(champ), f"{fname}: source mismatch vs champion"
+        per_retrain.append({
+            "retrain": label, "file": fname,
+            "macro_gen_f1": round(macro, 6),
+            "macro_delta": round(macro - champ_macro, 6),
+            "per_dataset_delta": {s: round(ps[s] - champ[s], 6) for s in champ},
+        })
+        pooled += [ps[s] - champ[s] for s in sorted(champ)]
+    deltas = np.array(pooled)
+    rng = np.random.default_rng(SEED)
+    boot = np.array([rng.choice(deltas, size=len(deltas), replace=True).mean()
+                     for _ in range(N_BOOT)])
+    ci = (float(np.percentile(boot, 5)), float(np.percentile(boot, 95)))
+    macro_deltas = np.array([r["macro_delta"] for r in per_retrain])
+    out = {
+        "spec": {"sesoi": SESOI, "sesoi_preregistered": "docs/ROADMAP_PUBLICATION.md W2.d",
+                 "n_boot": N_BOOT, "seed": SEED, "ci_level": 0.90,
+                 "champion": CHAMPION, "champion_macro_gen_f1": round(champ_macro, 6)},
+        "granularity": ("per-dataset (3 held-out sources x 5 retrains = 15 paired "
+                        "deltas). Per-pair rows do not exist for the retrain series "
+                        "(only the shipped pipeline was scored on the 42-pair bench); "
+                        "within-retrain deltas are clustered, hence the retrain-level "
+                        "robustness check below."),
+        "per_retrain": per_retrain,
+        "pooled_per_dataset": {
+            **_tost(deltas),
+            "ci90_bootstrap": [round(ci[0], 6), round(ci[1], 6)],
+            "ci90_width": round(ci[1] - ci[0], 6),
+            "equivalent_at_sesoi": bool(-SESOI < ci[0] and ci[1] < SESOI),
+        },
+        "retrain_level_robustness": _tost(macro_deltas),
+        "caveat": ("GEN-F1 sits near floor (champion 0.015 absolute), so the bound "
+                   "certifies absence of movement on a low-dynamic-range metric; "
+                   "the CI width (~0.004) shows the data could have detected effects "
+                   "an order of magnitude smaller than the 0.05 SESOI."),
+    }
+    p = out["pooled_per_dataset"]
+    out["paper_sentence"] = (
+        f"Across the five-retrain series the mean held-out GEN-F1 delta (retrain "
+        f"minus champion, per-dataset, n={p['n']}) is {p['mean']:+.4f} (90\\% "
+        f"bootstrap CI [{ci[0]:+.4f}, {ci[1]:+.4f}]); TOST rejects effects larger "
+        f"than the pre-registered $\\pm$0.05 SESOI (p = {p['p_tost']:.1e}), and the "
+        f"retrain-level check (n=5 macro deltas) agrees "
+        f"(p = {out['retrain_level_robustness']['p_tost']:.1e}).")
+    (RESULTS / "equivalence.json").write_text(json.dumps(out, indent=2) + "\n")
+    print(json.dumps({k: out[k] for k in ("pooled_per_dataset",
+                                          "retrain_level_robustness",
+                                          "paper_sentence")}, indent=2))
+    return out
+if __name__ == "__main__":
+    main()

eval/generalization.py ADDED Viewed

	@@ -0,0 +1,180 @@

+"""D1 — the GENERALIZATION metric: held-out-source real-error evaluation.
+The wide-suite REAL slice mixes sources whose pairs are IN the champion's training mix
+(hospital/beers/movies_1 -> mixA), so it part-measures memorization. This metric fixes
+that and one more honesty problem:
+  * HELD-OUT SOURCES ONLY: a model is scored only on real-error benchmarks whose pairs
+    were never used to train it. The split is explicit and committed (TRAIN_SOURCES);
+    new harvested sources must be assigned to exactly one side.
+  * ERROR-CLASS BREAKDOWN: benchmark errors split by the SAME variant gate the training
+    derivation uses (training.real_data._is_variant — one source of truth). A
+    canonicalization system claims competence on the VARIANT class (typos / casing /
+    aliases); imputation-class errors (missing or non-variant rewrites) are reported,
+    never hidden, but a system that abstains on them is behaving correctly.
+Headline numbers per system:
+    GEN-F1          churn-neutral F1 over ALL errors, macro over held-out sources
+    VARIANT-RECALL  share of variant-class errors repaired (claimed competence)
+    VARIANT-PREC    of committed changes on variant cells, share correct
+    damage          clean cells corrupted (churn-neutral)
+DISCLOSED class imperfection: the string-variant gate over-counts on flights —
+single-digit time differences ('7:59 p.m.' vs '7:58 p.m.') pass the similarity
+threshold but are cross-source VALUE disagreements (need per-entity cross-row
+voting, a different capability), not surface canonicalization. ~950 of flights'
+1049 "variant" errors are of this kind; treat flights' variant-recall as a
+lower-bound stress number, not addressable headroom.
+    uv run python -m eval.generalization                 # grounded heuristic baseline
+"""
+from __future__ import annotations
+import argparse
+import json
+from scrubdata.executor import apply_plan
+from scrubdata.planner import mock_plan
+from training.real_data import _is_variant
+from .metrics import _cell_equal
+from .run_real_multi import _cell_only, _fetch, _sem_equal, score
+# pairs used to train the current champion (v6 = mixA) — anything here is OFF-LIMITS
+# for generalization scoring of that model. Update per training run.
+TRAIN_SOURCES = {"v6": {"hospital", "beers", "movies_1"}}
+# held-out real-error sources. Harvested D1 sources get appended here OR to the
+# training side — never both. ed2_restaurants (stage-2 harvest): real NYC-restaurant
+# typos, in-regime, EVAL-ONLY — its sibling domain source (fodors_zagats) trains, so
+# this measures cross-source same-domain transfer. dblp_scholar was REJECTED as an
+# eval source: its gold systematically prefers the opposite case convention from the
+# dirty side (Scholar lowercase vs DBLP Title Case), which measures convention
+# preference, not cleaning — the artifact this metric is designed against.
+EVAL_SOURCES = ["flights", "rayyan", "ed2_restaurants"]
+def variant_breakdown(dirty, clean, out) -> dict:
+    """Split benchmark errors by class and count repairs per class (churn-neutral)."""
+    n = min(len(dirty), len(out), len(clean))
+    c = {"variant_errors": 0, "variant_fixed": 0, "variant_changed": 0,
+         "variant_good": 0, "other_errors": 0, "other_fixed": 0}
+    for j, col in enumerate(dirty.columns):
+        present = col in out.columns
+        for i in range(n):
+            dv, cv = dirty.iat[i, j], clean.iat[i, j]
+            if _cell_equal(dv, cv):
+                continue                                   # not a benchmark error
+            ov = out.iloc[i][col] if present else dv
+            chg = present and not _cell_equal(ov, dv)
+            if chg and _sem_equal(ov, dv) and not _cell_equal(ov, cv):
+                chg = False                                # churn: ignore
+            fixed = _cell_equal(ov, cv) or (_sem_equal(ov, cv) and chg)
+            is_variant = (str(dv).strip() and str(cv).strip()
+                          and _is_variant(str(dv), str(cv)))
+            if is_variant:
+                c["variant_errors"] += 1
+                c["variant_fixed"] += int(fixed)
+                if chg:
+                    c["variant_changed"] += 1
+                    c["variant_good"] += int(_sem_equal(ov, cv))
+            else:
+                c["other_errors"] += 1
+                c["other_fixed"] += int(fixed)
+    return c
+def evaluate_generalization(planner, sources=None, label: str = "system") -> dict:
+    sources = sources or EVAL_SOURCES
+    rows = []
+    for name in sources:
+        # FULL tables, no truncation — ed2_restaurants' real errors are concentrated
+        # outside the first 2k rows (_raha_pair's head(2000) hid 473 of 477).
+        dirty, clean = _fetch(name)
+        cleaned, _ = apply_plan(dirty, _cell_only(planner(dirty)))
+        m = score(dirty, clean, cleaned)
+        b = variant_breakdown(dirty, clean, cleaned)
+        rows.append({"source": name, **{k: m[k] for k in
+                                        ("f1", "precision", "recall", "damage")}, **b})
+        print(f"  {name:<10} F1={m['f1']:.3f} dmg={m['damage']:.3f} | variant: "
+              f"{b['variant_fixed']}/{b['variant_errors']} fixed, "
+              f"{b['variant_good']}/{b['variant_changed']} changes good | "
+              f"other: {b['other_fixed']}/{b['other_errors']}", flush=True)
+    return _aggregate(rows, sources, label)
+def evaluate_captured_union(plans: dict, sources, label: str, tau: float = 0.5) -> dict:
+    """Score the SHIPPED pipeline from captured raw model plans (Modal --capture):
+    per source, verify(tau) the captured plan, union with the grounded heuristic —
+    byte-identical composition to scrubdata/active.py."""
+    from scrubdata.verifier import union_plans, verify_plan
+    def planner_for(name):
+        def planner(df, *_):
+            return union_plans(verify_plan(df, plans[name], tau=tau), mock_plan(df))
+        return planner
+    rows = []
+    for name in sources:
+        dirty, clean = _fetch(name)
+        cleaned, _ = apply_plan(dirty, _cell_only(planner_for(name)(dirty)))
+        m = score(dirty, clean, cleaned)
+        b = variant_breakdown(dirty, clean, cleaned)
+        rows.append({"source": name, **{k: m[k] for k in
+                                        ("f1", "precision", "recall", "damage")}, **b})
+        print(f"  {name:<16} F1={m['f1']:.3f} dmg={m['damage']:.3f} | variant: "
+              f"{b['variant_fixed']}/{b['variant_errors']} fixed", flush=True)
+    return _aggregate(rows, sources, label)
+def _aggregate(rows, sources, label) -> dict:
+    def mean(xs):
+        xs = list(xs)
+        return sum(xs) / len(xs) if xs else 0.0
+    def rate(num, den):
+        return num / den if den else 0.0
+    out = {
+        "system": label, "sources": list(sources),
+        "gen_f1": mean(r["f1"] for r in rows),
+        "variant_recall": mean(rate(r["variant_fixed"], r["variant_errors"]) for r in rows),
+        "variant_precision": mean(rate(r["variant_good"], r["variant_changed"])
+                                  if r["variant_changed"] else 1.0 for r in rows),
+        "other_recall": mean(rate(r["other_fixed"], r["other_errors"]) for r in rows),
+        "damage": mean(r["damage"] for r in rows),
+        "per_source": rows,
+    }
+    print(f"{label}: GEN-F1={out['gen_f1']:.3f} VARIANT-RECALL={out['variant_recall']:.3f} "
+          f"VARIANT-PREC={out['variant_precision']:.3f} dmg={out['damage']:.3f}")
+    return out
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--sources", default=",".join(EVAL_SOURCES))
+    ap.add_argument("--plans", default=None,
+                    help="JSON file {source: captured raw model plan} -> score the "
+                         "shipped union pipeline instead of the local baselines")
+    ap.add_argument("--label", default="captured union")
+    ap.add_argument("--out", default="eval/results/generalization_baseline.json")
+    args = ap.parse_args()
+    sources = args.sources.split(",")
+    if args.plans:
+        plans = json.load(open(args.plans))
+        results = [evaluate_captured_union(plans, sources, args.label)]
+    else:
+        results = [
+            evaluate_generalization(mock_plan, sources, "grounded heuristic"),
+            evaluate_generalization(
+                lambda df: {"table_operations": [], "columns": [], "flags": []},
+                sources, "no-op"),
+        ]
+    json.dump(results, open(args.out, "w"), indent=1)
+    print(f"written to {args.out}")
+if __name__ == "__main__":
+    main()

eval/gittables_audit.py ADDED Viewed

	@@ -0,0 +1,95 @@

+"""N=250 GitTables audit — the at-scale trust + repair board.
+250 real GitHub tables (LUH-DBS Matelda GitTables-subsets, Apache-2.0; injected
+typos on real heterogeneous tables) scored end-to-end with the shipped pipeline:
+schema validity, SILENT-EDIT attribution (the trust contract at scale), and the
+churn-neutral repair metric. No inject-recovery here (these pairs carry their own
+errors). Summary feeds docs/GITTABLES_AUDIT.md.
+    uv run python -m eval.gittables_audit
+"""
+from __future__ import annotations
+import json
+import time
+from pathlib import Path
+import pandas as pd
+from scrubdata.executor import apply_plan
+from scrubdata.planner import mock_plan
+from .metrics import is_valid
+from .run_real_multi import _cell_only, score
+from .wild_bench import behavioral
+ROOT = Path(__file__).resolve().parent.parent
+DIR = ROOT / "data" / "gittables250"
+N_CAP = 3000
+def _load(p: Path):
+    kw = dict(dtype=str, keep_default_na=False, nrows=N_CAP, on_bad_lines="skip")
+    try:
+        return pd.read_csv(p, encoding_errors="replace", **kw)
+    except Exception:  # noqa: BLE001
+        return pd.read_csv(p, engine="python", **kw)
+def main() -> None:
+    slugs = sorted({p.name.split("_")[0] for p in DIR.glob("t*_dirty.csv")})
+    rows, failures = [], []
+    t0 = time.perf_counter()
+    for slug in slugs:
+        try:
+            dirty = _load(DIR / f"{slug}_dirty.csv")
+            clean = _load(DIR / f"{slug}_clean.csv")
+            n = min(len(dirty), len(clean))
+            if n < 3 or dirty.shape[1] < 2:
+                continue
+            dirty, clean = dirty.head(n), clean.head(n)
+            b = behavioral(dirty)
+            plan = _cell_only(mock_plan(dirty))
+            cleaned, _ = apply_plan(dirty, plan)
+            m = score(dirty, clean, cleaned)
+            rows.append({"table": slug, "rows": n, "cols": dirty.shape[1],
+                         "plan_valid": b["plan_valid"],
+                         "silent_edit_columns": len(b["silent_edit_columns"]),
+                         "errors": m["_errors"], "f1": round(m["f1"], 3),
+                         "damage": round(m["damage"], 4)})
+        except Exception as e:  # noqa: BLE001
+            failures.append(f"{slug}: {type(e).__name__}")
+    dt = time.perf_counter() - t0
+    n = len(rows)
+    valid = sum(r["plan_valid"] for r in rows)
+    silent = sum(1 for r in rows if r["silent_edit_columns"])
+    scored = [r for r in rows if r["errors"] > 0]
+    f1s = [r["f1"] for r in scored]
+    dmgs = [r["damage"] for r in rows]
+    summary = {
+        "tables_audited": n, "pipeline_failures": len(failures),
+        "plan_valid": valid, "tables_with_silent_edits": silent,
+        "tables_with_errors": len(scored),
+        "macro_f1_on_errored": round(sum(f1s) / len(f1s), 3) if f1s else None,
+        "macro_damage": round(sum(dmgs) / len(dmgs), 4),
+        "zero_damage_tables": sum(1 for d in dmgs if d == 0),
+        "seconds": round(dt, 1),
+    }
+    json.dump({"summary": summary, "rows": rows, "failures": failures},
+              open(ROOT / "eval" / "results" / "gittables_audit.json", "w"), indent=1)
+    L = ["# GitTables N=250 audit — trust contract at scale", "",
+         f"Shipped pipeline over {n} real GitHub tables (Matelda GitTables-subsets,",
+         "Apache-2.0; injected typos on real heterogeneous tables).", "",
+         "| metric | value |", "|---|---|"]
+    for k, v in summary.items():
+        L.append(f"| {k} | {v} |")
+    (ROOT / "docs" / "GITTABLES_AUDIT.md").write_text("\n".join(L) + "\n")
+    print(json.dumps(summary, indent=1))
+    if failures:
+        print("failures:", failures[:8])
+if __name__ == "__main__":
+    main()

eval/gold.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

eval/gold.py ADDED Viewed

	@@ -0,0 +1,64 @@

+"""Frozen held-out gold eval set (committed to eval/gold.jsonl).
+A FIXED test set so every fine-tune iteration (and generator change) is scored on the
+same examples — v1 vs v2 stay comparable. Regenerate intentionally with `build_gold`.
+"""
+from __future__ import annotations
+import json
+import random
+from pathlib import Path
+import pandas as pd
+from training.generate import make_example
+from . import metrics
+GOLD_PATH = Path(__file__).resolve().parent / "gold.jsonl"
+def build_gold(n: int = 300, seed: int = 4242, path: Path = GOLD_PATH) -> list[dict]:
+    rng = random.Random(seed)
+    out = []
+    while len(out) < n:
+        ex = make_example(rng)
+        if metrics.recovery(ex["clean_df"], ex["dirty_df"], ex["plan"]) >= 0.999:
+            out.append(ex)
+    with Path(path).open("w", encoding="utf-8") as f:
+        for ex in out:
+            clean = ex["clean_df"].where(pd.notna(ex["clean_df"]), None)
+            f.write(json.dumps({
+                "dirty": ex["dirty_df"].to_dict("records"),
+                "clean": clean.to_dict("records"),
+                "dirty_cols": list(ex["dirty_df"].columns),
+                "clean_cols": list(ex["clean_df"].columns),
+                "plan": ex["plan"],
+            }, ensure_ascii=False, default=str) + "\n")
+    return out
+def load_gold(path: Path = GOLD_PATH) -> list[dict]:
+    p = Path(path)
+    if not p.exists():
+        return build_gold(path=p)
+    out = []
+    for line in p.read_text(encoding="utf-8").splitlines():
+        d = json.loads(line)
+        dirty = (pd.DataFrame(d["dirty"])[d["dirty_cols"]] if d["dirty"]
+                 else pd.DataFrame(columns=d["dirty_cols"]))
+        clean = (pd.DataFrame(d["clean"])[d["clean_cols"]] if d["clean"]
+                 else pd.DataFrame(columns=d["clean_cols"]))
+        out.append({"dirty_df": dirty, "clean_df": clean, "plan": d["plan"]})
+    return out
+if __name__ == "__main__":
+    import argparse
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--n", type=int, default=300)
+    ap.add_argument("--seed", type=int, default=4242)
+    args = ap.parse_args()
+    g = build_gold(args.n, args.seed)
+    print(f"Wrote {len(g)} frozen gold examples to {GOLD_PATH}")

eval/inject.py ADDED Viewed

	@@ -0,0 +1,103 @@

+"""Seeded, self-verifying error injection — turns any CLEAN table into dirty/clean
+validation. This is the de-biasing core of the north-star: our 20+ harvested clean
+domains become per-cell-ground-truth validation across error types, far beyond any one
+published benchmark.
+Self-contained (no nlpaug/BART deps): we inject a KNOWN corruption into a clean cell, so
+the (dirty -> clean) ground truth is exact and the run is reproducible (fixed seed).
+Injects RECOVERABLE error types (the cleaner can restore the clean value): typo, ocr,
+case, whitespace — i.e. the canonicalization + format axes. Targets CATEGORICAL text
+columns (recurring values), where canonicalization is the task.
+"""
+from __future__ import annotations
+import random
+import string
+_OCR = {"O": "0", "o": "0", "l": "1", "I": "1", "S": "5", "s": "5",
+        "B": "8", "Z": "2", "z": "2", "g": "9", "G": "6", "b": "6"}
+def _typo(s: str, rng: random.Random) -> str:
+    if len(s) < 4:
+        return s
+    i = rng.randrange(1, len(s) - 1)
+    if not s[i].isalpha():
+        return s
+    m = rng.random()
+    if m < 0.55:                                  # substitute (the classic 'birminghxm')
+        pool = string.ascii_uppercase if s[i].isupper() else string.ascii_lowercase
+        return s[:i] + rng.choice([c for c in pool if c != s[i].lower()]) + s[i + 1:]
+    if m < 0.78:                                  # delete
+        return s[:i] + s[i + 1:]
+    return s[:i] + s[i + 1] + s[i] + s[i + 2:]    # transpose
+def _ocr(s: str, rng: random.Random) -> str:
+    idxs = [i for i, c in enumerate(s) if c in _OCR]
+    if not idxs:
+        return _typo(s, rng)
+    i = rng.choice(idxs)
+    return s[:i] + _OCR[s[i]] + s[i + 1:]
+def _case(s: str, rng: random.Random) -> str:
+    return rng.choice([s.upper(), s.lower(), s.title()])
+def _ws(s: str, rng: random.Random) -> str:
+    return rng.choice([" " * rng.randint(1, 2) + s, s + " " * rng.randint(1, 2),
+                       s.replace(" ", "  ", 1) if " " in s else " " + s])
+INJECTORS = {"typo": _typo, "ocr": _ocr, "case": _case, "whitespace": _ws}
+def _categorical_text_cols(df, max_cols: int = 12) -> list[str]:
+    """Text columns whose values RECUR (canonicalization is meaningful)."""
+    out = []
+    for c in df.columns:
+        vals = [str(v).strip() for v in df[c].tolist() if str(v).strip()]
+        if len(vals) < 20:
+            continue
+        alpha = sum(1 for v in vals if any(ch.isalpha() for ch in v)) / len(vals)
+        nonnum = 0
+        for v in vals:
+            try:
+                float(v.replace(",", ""))
+            except ValueError:
+                nonnum += 1
+        if alpha < 0.7 or nonnum / len(vals) < 0.7:
+            continue
+        if len(set(vals)) / len(vals) > 0.5:       # must recur (categorical)
+            continue
+        out.append(c)
+        if len(out) >= max_cols:
+            break
+    return out
+def inject(clean_df, error_type: str, seed: int, rate: float = 0.07):
+    """Return a dirty copy of `clean_df` with `error_type` errors injected into a
+    `rate` fraction of cells in its categorical-text columns, or None if no eligible
+    column. The original `clean_df` is the exact ground truth."""
+    fn = INJECTORS[error_type]
+    cols = _categorical_text_cols(clean_df)
+    if not cols:
+        return None
+    rng = random.Random(seed)
+    dirty = clean_df.copy()
+    touched = 0
+    for c in cols:
+        col = dirty[c].tolist()
+        for i, v in enumerate(col):
+            s = str(v)
+            if s.strip() and rng.random() < rate:
+                nv = fn(s, rng)
+                if nv != s:
+                    col[i] = nv
+                    touched += 1
+        dirty[c] = col
+    return dirty if touched else None

eval/inject_validity.py ADDED Viewed

	@@ -0,0 +1,317 @@

+"""W4.5 inject-validity (TableEG-style) — does the injected slice LOOK like and RANK
+like the real slice?
+(1) Classifies every real dirty->gold cell error (hospital's 509 + all 42 paired
+sources eval/paired_bench.py walks) with a deterministic taxonomy (typo/edit-dist<=2,
+case-only, whitespace, encoding/mojibake, numeric, date-format, token-swap, missing,
+other); (2) classifies the suite's INJECTED errors at the money-table seeds (7/17/27);
+(3) reports Jensen-Shannon divergence (base 2) between injected and real type
+distributions, pooled and per real source; (4) reports Kendall tau-b between system
+rankings on the injected vs real F1 slices of money_table_head.json, with degenerate
+policies (abstain-all / random-edit / oracle) run through the same suite as anchors.
+Honesty rule: if the injector is far from real (high JSD), that IS the result — the
+paper's mitigation (both slices reported separately) already stands.
+    uv run python -m eval.inject_validity              # full run (~15 min CPU)
+    uv run python -m eval.inject_validity --tex-only   # rebuild the snippet from JSON
+Writes eval/results/inject_validity.json + eval/results/inject_validity_appendix.tex.
+"""
+from __future__ import annotations
+import collections
+import json
+import math
+import time
+from datetime import datetime
+from pathlib import Path
+from .degenerate import _abstain_all, _oracle, _random_edit
+from .metrics import _cell_equal
+from .paired_bench import _load, pairs
+from .run_real_multi import build_suite, score
+ROOT = Path(__file__).resolve().parent.parent
+SEEDS = (7, 17, 27)            # money-table seeds (run_real_multi.main)
+CATS = ["typo", "case", "whitespace", "encoding", "numeric", "date-format",
+        "token-swap", "missing", "other"]
+EXPECT = {"typo": "typo", "ocr": "typo", "case": "case", "whitespace": "whitespace"}
+_MOJI = ("�", "Ã", "Â", "â€", "ï¿")
+_DATE_FMTS = ("%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%m/%d/%y", "%Y/%m/%d",
+              "%d-%m-%Y", "%b %d, %Y", "%B %d, %Y", "%d %b %Y", "%Y%m%d")
+def _num(s: str):
+    t = s.strip().replace(",", "").lstrip("$").rstrip("%")
+    try:
+        return float(t)
+    except ValueError:
+        return None
+def _date(s: str):
+    for f in _DATE_FMTS:
+        try:
+            return datetime.strptime(s.strip(), f).date()
+        except ValueError:
+            pass
+    return None
+def _lev_gt2(a: str, b: str) -> bool:
+    """True iff Levenshtein(a, b) > 2 (banded DP, O(len*5))."""
+    k = 2
+    la, lb = len(a), len(b)
+    if abs(la - lb) > k:
+        return True
+    INF = k + 1
+    prev = [min(j, INF) for j in range(lb + 1)]
+    for i in range(1, la + 1):
+        lo, hi = max(1, i - k), min(lb, i + k)
+        cur = [INF] * (lb + 1)
+        if i <= k:
+            cur[0] = i
+        for j in range(lo, hi + 1):
+            cur[j] = min(prev[j] + 1, cur[j - 1] + 1,
+                         prev[j - 1] + (a[i - 1] != b[j - 1]), INF)
+        prev = cur
+        if min(prev[max(0, lo - 1):hi + 1]) >= INF:
+            return True
+    return prev[lb] > k
+def classify(d, g) -> str:
+    """Deterministic error type from (dirty, gold) cell pair. Order matters:
+    surface classes first, then value classes, edit-distance last."""
+    ds, gs = str(d), str(g)
+    if not ds.strip() or not gs.strip():
+        return "missing"
+    if "".join(ds.split()) == "".join(gs.split()):
+        return "whitespace"
+    if "".join(ds.split()).casefold() == "".join(gs.split()).casefold():
+        return "case"
+    if any(m in ds for m in _MOJI) != any(m in gs for m in _MOJI):
+        return "encoding"
+    if _num(ds) is not None and _num(gs) is not None:
+        return "numeric"
+    dd, gd = _date(ds), _date(gs)
+    if dd is not None and dd == gd:
+        return "date-format"
+    dt, gt = sorted(ds.casefold().split()), sorted(gs.casefold().split())
+    if dt == gt and len(dt) > 1:
+        return "token-swap"
+    if not _lev_gt2(ds.strip(), gs.strip()):
+        return "typo"
+    return "other"
+def _classify_pair(dirty, clean) -> collections.Counter:
+    n = min(len(dirty), len(clean))
+    c = collections.Counter()
+    for j in range(dirty.shape[1]):
+        for i in range(n):
+            dv, cv = dirty.iat[i, j], clean.iat[i, j]
+            if not _cell_equal(dv, cv):
+                c[classify(dv, cv)] += 1
+    return c
+def _jsd(p: dict, q: dict) -> float:
+    """Jensen-Shannon divergence, base 2 (0 = identical, 1 = disjoint)."""
+    sp, sq = sum(p.values()), sum(q.values())
+    out = 0.0
+    for k in set(p) | set(q):
+        a, b = p.get(k, 0) / sp, q.get(k, 0) / sq
+        m = (a + b) / 2
+        if a:
+            out += 0.5 * a * math.log2(a / m)
+        if b:
+            out += 0.5 * b * math.log2(b / m)
+    return out
+def _tau_b(xs, ys) -> float:
+    """Kendall tau-b (tie-corrected); n is small, O(n^2) is fine."""
+    n0 = nc = nd = tx = ty = 0
+    for i in range(len(xs)):
+        for j in range(i + 1, len(xs)):
+            n0 += 1
+            a, b = xs[i] - xs[j], ys[i] - ys[j]
+            tx += a == 0
+            ty += b == 0
+            nc += a * b > 0
+            nd += a * b < 0
+    den = ((n0 - tx) * (n0 - ty)) ** 0.5
+    return (nc - nd) / den if den else 0.0
+def _dist(counter) -> dict:
+    tot = sum(counter.values())
+    return {k: round(counter.get(k, 0) / tot, 4) for k in CATS} if tot else {}
+def _suite_slices(cleaner) -> tuple[float, float]:
+    """(real-slice mean F1, injected-slice mean F1 over SEEDS) for a degenerate
+    cleaner(dirty, clean) -> out, mirroring run_real_multi's by-source means."""
+    real = []
+    for spec in build_suite(seed=SEEDS[0]):
+        if spec["source"] != "real":
+            continue
+        dirty, clean = spec["load"]()
+        real.append(score(dirty, clean, cleaner(dirty, clean))["f1"])
+    inj = []
+    for s in SEEDS:
+        fs = []
+        for spec in build_suite(seed=s):
+            if spec["source"] != "injected":
+                continue
+            loaded = spec["load"]()
+            if loaded is None:
+                continue
+            dirty, clean = loaded
+            fs.append(score(dirty, clean, cleaner(dirty, clean))["f1"])
+        inj.append(sum(fs) / len(fs))
+    return sum(real) / len(real), sum(inj) / len(inj)
+def _write_tex(out: dict, res: Path) -> None:
+    rd, jd = out["real"]["pooled_dist"], out["injected"]["pooled_dist"]
+    j, rk = out["jsd"], out["ranking"]
+    L = [r"% Auto-generated by eval/inject_validity.py — do not edit by hand.",
+         r"\subsection{Validity of the Injected Slice}\label{app:inject-validity}",
+         r"Following the TableEG-style audit, we classify every error cell (dirty vs.\ gold)",
+         r"with a deterministic taxonomy and compare the suite's injected errors (money-table",
+         r"seeds " + "/".join(map(str, out["seeds"])) + r", $n=" +
+         f"{out['injected']['n']:,}".replace(",", r"{,}") + r"$) against the $" +
+         f"{out['real']['n']:,}".replace(",", r"{,}") +
+         r"$ real errors across the 42 paired sources (hospital's " +
+         f"{out['real']['hospital_n']}" + r" included).",
+         r"\begin{table}[t]\centering\small",
+         r"\caption{Error-type distributions, real vs.\ injected (pooled).}",
+         r"\label{tab:inject-validity}",
+         r"\begin{tabular}{lrr}\toprule",
+         r"error type & real & injected \\ \midrule"]
+    for c in CATS:
+        L.append(f"{c} & {rd.get(c, 0):.3f} & {jd.get(c, 0):.3f} " + r"\\")
+    L += [r"\bottomrule\end{tabular}\end{table}",
+          r"The injector covers only the recoverable surface classes it targets by design",
+          r"(typo/case/whitespace; injector--taxonomy agreement " +
+          f"{out['injected']['injector_taxonomy_agreement']:.3f}" + r"), whereas real errors",
+          r"are dominated by substitutions beyond edit distance~2 (other, " +
+          f"{rd['other']:.3f}" + r") and short typos (" + f"{rd['typo']:.3f}" +
+          r"), with numeric (" + f"{rd['numeric']:.3f}" + r"), missing-value (" +
+          f"{rd['missing']:.3f}" + r"), and encoding classes the injector never produces.",
+          r"Pooled Jensen--Shannon divergence is " + f"{j['pooled']:.3f}" +
+          r"~bits (per-source median " + f"{j['median']:.3f}" + r", range " +
+          f"{j['min']:.3f}" + r"--" + f"{j['max']:.3f}" + r"; hospital " +
+          f"{j['hospital_vs_injected']:.3f}" + r"): the two slices are \emph{not}",
+          r"interchangeable, which is why the paper reports them separately and localizes",
+          r"the grounding claim in the real slice. Ranking preservation is partial: Kendall",
+          r"$\tau_b$ between system rankings on the injected vs.\ real F1 slices is $" +
+          f"{rk['kendall_tau_b_money_table']:.2f}" + r"$ over the four cross-system rows and $" +
+          f"{rk['kendall_tau_b_with_anchors']:.2f}" + r"$ with the degenerate anchors",
+          r"(abstain-all, random-edit, oracle) included. The injected slice preserves the",
+          r"floor/ceiling ordering but ranks OpenRefine fingerprint above both our system",
+          r"and OpenRefine kNN, the reverse of the real slice --- frequency clustering looks",
+          r"strong exactly where the canonical form is present and dominant by construction.",
+          r"Injected-only evaluation would therefore overstate frequency-clustering",
+          r"baselines."]
+    (res / "inject_validity_appendix.tex").write_text("\n".join(L) + "\n")
+def main() -> None:
+    t0 = time.perf_counter()
+    # (1) real errors: all 42 paired sources (hospital included -> its 509)
+    real_per: dict[str, collections.Counter] = {}
+    for p in pairs():
+        try:
+            dirty, clean = _load(p)
+        except Exception as e:  # noqa: BLE001
+            print(f"  {p.name}: LOAD FAILED {type(e).__name__}")
+            continue
+        real_per[p.name] = _classify_pair(dirty, clean)
+        print(f"  real {p.name:<46} n={sum(real_per[p.name].values())}", flush=True)
+    real_pool = sum(real_per.values(), collections.Counter())
+    t_real = time.perf_counter() - t0
+    # (2) injected errors at the money-table seeds, via the SAME suite generator
+    inj_pool = collections.Counter()
+    inj_per_injector: dict[str, collections.Counter] = collections.defaultdict(collections.Counter)
+    inj_per_seed = {}
+    for s in SEEDS:
+        cs = collections.Counter()
+        for spec in build_suite(seed=s):
+            if spec["source"] != "injected":
+                continue
+            loaded = spec["load"]()
+            if loaded is None:
+                continue
+            dirty, clean = loaded
+            c = _classify_pair(dirty, clean)
+            cs += c
+            inj_per_injector[spec["name"].split(":")[1]] += c
+        inj_per_seed[s] = sum(cs.values())
+        inj_pool += cs
+        print(f"  injected seed={s} n={inj_per_seed[s]}", flush=True)
+    agree = sum(inj_per_injector[et][want] for et, want in EXPECT.items())
+    t_inj = time.perf_counter() - t0 - t_real
+    # (3) distribution similarity
+    jsd_per_source = {k: round(_jsd(real_per[k], inj_pool), 4)
+                      for k in sorted(real_per) if real_per[k]}
+    jsd_vals = sorted(jsd_per_source.values())
+    # (4) ranking preservation: money-table systems + degenerate anchors
+    money = json.load(open(ROOT / "eval" / "results" / "money_table_head.json"))
+    systems = [{"system": r["system"], "real_f1": r["real_f1"], "inj_f1": r["inj_f1"],
+                "anchor": False} for r in money]
+    for name, fn in [("abstain-all", _abstain_all), ("random-edit", _random_edit),
+                     ("oracle", _oracle)]:
+        rf, jf = _suite_slices(fn)
+        systems.append({"system": name, "real_f1": rf, "inj_f1": jf, "anchor": True})
+        print(f"  anchor {name:<12} real={rf:.3f} inj={jf:.3f}", flush=True)
+    tau_money = _tau_b([s["real_f1"] for s in systems if not s["anchor"]],
+                       [s["inj_f1"] for s in systems if not s["anchor"]])
+    tau_all = _tau_b([s["real_f1"] for s in systems], [s["inj_f1"] for s in systems])
+    out = {
+        "taxonomy": CATS, "seeds": list(SEEDS),
+        "real": {"n": sum(real_pool.values()), "n_sources": len(real_per),
+                 "hospital_n": sum(real_per.get("hospital", {}).values()),
+                 "pooled_counts": dict(real_pool), "pooled_dist": _dist(real_pool),
+                 "per_source": {k: {"n": sum(v.values()), "dist": _dist(v)}
+                                for k, v in sorted(real_per.items())}},
+        "injected": {"n": sum(inj_pool.values()), "per_seed_n": inj_per_seed,
+                     "pooled_counts": dict(inj_pool), "pooled_dist": _dist(inj_pool),
+                     "per_injector_dist": {k: _dist(v)
+                                           for k, v in sorted(inj_per_injector.items())},
+                     "injector_taxonomy_agreement": round(agree / sum(inj_pool.values()), 4)},
+        "jsd": {"pooled": round(_jsd(real_pool, inj_pool), 4),
+                "hospital_vs_injected": round(_jsd(real_per["hospital"], inj_pool), 4),
+                "per_real_source_vs_injected": jsd_per_source,
+                "min": jsd_vals[0], "median": jsd_vals[len(jsd_vals) // 2],
+                "max": jsd_vals[-1]},
+        "ranking": {"systems": systems,
+                    "kendall_tau_b_money_table": round(tau_money, 4),
+                    "kendall_tau_b_with_anchors": round(tau_all, 4)},
+        "sec": {"real_classify": round(t_real, 1), "injected_classify": round(t_inj, 1),
+                "total": round(time.perf_counter() - t0, 1)},
+    }
+    res = ROOT / "eval" / "results"
+    json.dump(out, open(res / "inject_validity.json", "w"), indent=1)
+    _write_tex(out, res)
+    print(f"JSD pooled={out['jsd']['pooled']} tau(money)={tau_money:.3f} "
+          f"tau(+anchors)={tau_all:.3f} -> {res / 'inject_validity.json'} "
+          f"+ inject_validity_appendix.tex ({out['sec']['total']}s)")
+if __name__ == "__main__":
+    import argparse
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--tex-only", action="store_true",
+                    help="rebuild the LaTeX snippet from the existing JSON")
+    if ap.parse_args().tex_only:
+        res = ROOT / "eval" / "results"
+        _write_tex(json.load(open(res / "inject_validity.json")), res)
+        print(f"-> {res / 'inject_validity_appendix.tex'}")
+    else:
+        main()