OpenAI Codex OpenAI Codex committed on
Commit
16dc556
·
0 Parent(s):

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

Browse files

Tags the submission for OpenAI's Best Use of Codex prize — backed by real
Codex-attributed commits (@codex in the connected GitHub repo + this Space's history).
Same human-verified Codex-hardened build (84 tests green).

Co-authored-by: OpenAI Codex <codex@openai.com>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +47 -0
  2. .gitignore +37 -0
  3. .python-version +1 -0
  4. PRODUCT.md +162 -0
  5. README.md +231 -0
  6. TRANSFER.md +69 -0
  7. app.py +90 -0
  8. design/mockups/calm/index.html +430 -0
  9. design/mockups/cozy/index.html +526 -0
  10. design/mockups/helper/index.html +517 -0
  11. design/mockups/office/index.html +219 -0
  12. docs/DATASETS.md +57 -0
  13. docs/DEGENERATE_BASELINES.md +30 -0
  14. docs/FIELD_NOTES.md +128 -0
  15. docs/GITTABLES_AUDIT.md +24 -0
  16. docs/PAIRED_BENCH.md +49 -0
  17. docs/PAPER.md +66 -0
  18. docs/SCALING_ARM.md +46 -0
  19. docs/TOOL_REFERENCE.md +251 -0
  20. docs/WILD_BENCH.md +41 -0
  21. docs/assets/space_landing.png +3 -0
  22. docs/assets/space_results.png +3 -0
  23. docs/paper/fig_label_curve.pdf +3 -0
  24. docs/paper/fig_label_curve.png +3 -0
  25. docs/paper/fig_precision_coverage.pdf +3 -0
  26. docs/paper/fig_precision_coverage.png +3 -0
  27. docs/paper/fig_risk_coverage.pdf +3 -0
  28. docs/paper/fig_risk_coverage.png +3 -0
  29. docs/paper/main.aux +59 -0
  30. docs/paper/main.log +269 -0
  31. docs/paper/main.pdf +3 -0
  32. docs/paper/main.tex +1021 -0
  33. docs/paper/numbers.tex +146 -0
  34. eval/README.md +136 -0
  35. eval/__init__.py +12 -0
  36. eval/ablations.py +64 -0
  37. eval/baselines_learned.py +145 -0
  38. eval/calibration.py +119 -0
  39. eval/capture_plan_local.py +90 -0
  40. eval/contamination_probe.py +57 -0
  41. eval/cross_scoring.py +294 -0
  42. eval/degenerate.py +172 -0
  43. eval/diagnose_model.py +91 -0
  44. eval/equivalence.py +119 -0
  45. eval/generalization.py +180 -0
  46. eval/gittables_audit.py +95 -0
  47. eval/gold.jsonl +0 -0
  48. eval/gold.py +64 -0
  49. eval/inject.py +103 -0
  50. eval/inject_validity.py +317 -0
.gitattributes ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ docs/assets/space_results.png filter=lfs diff=lfs merge=lfs -text
37
+ docs/paper/main.pdf filter=lfs diff=lfs merge=lfs -text
38
+ docs/paper-eab/main.pdf filter=lfs diff=lfs merge=lfs -text
39
+ docs/paper-pvldb/main.pdf filter=lfs diff=lfs merge=lfs -text
40
+ *.pdf filter=lfs diff=lfs merge=lfs -text
41
+ *.png filter=lfs diff=lfs merge=lfs -text
42
+ *.pdf filter=lfs diff=lfs merge=lfs -text
43
+ *.png filter=lfs diff=lfs merge=lfs -text
44
+ *.pdf filter=lfs diff=lfs merge=lfs -text
45
+ *.png filter=lfs diff=lfs merge=lfs -text
46
+ *.pdf filter=lfs diff=lfs merge=lfs -text
47
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ .venv/
5
+ *.egg-info/
6
+
7
+ # uv
8
+ .uv/
9
+
10
+ # Gradio
11
+ .gradio/
12
+ flagged/
13
+
14
+ # Models / data (keep large artifacts out of git; push to the Hub instead)
15
+ *.gguf
16
+ *.bin
17
+ *.safetensors
18
+ models/
19
+ data/
20
+
21
+ # Env / secrets
22
+ .env
23
+ .env.*
24
+
25
+ # OS / editor
26
+ .DS_Store
27
+ .idea/
28
+ .vscode/
29
+ .gstack/
30
+
31
+ # internal: working memory + agent/skill defs — never publish
32
+ project-memory/
33
+ .claude/
34
+ _private/
35
+
36
+ # demo video assets (local only)
37
+ _video/
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
PRODUCT.md ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ScrubData — Product Research & Spec
2
+
3
+ > What does an office worker actually mean by "just clean my data"? This doc
4
+ > pins down the expectations so the cleaning-plan schema and UX aren't guesses.
5
+ > (Living doc — refine when the deep-research workflows land.)
6
+
7
+ ## 1. The user & the moment
8
+
9
+ **Who:** an operations / sales-ops / finance / admin person. Lives in
10
+ spreadsheets exported from a CRM, an ERP, a Google Form, a POS, a bank portal.
11
+ Not a pandas user. Competent with Excel but doesn't want to write `=PROPER()`
12
+ across 40 columns or learn Power Query.
13
+
14
+ **The moment of pain:** they exported a file to do their actual job —
15
+ build a report, upload to another system, send a mail-merge, reconcile numbers —
16
+ and the file is dirty enough that the next step breaks or lies. The import fails,
17
+ the pivot double-counts, the vlookup misses, the "total revenue" is wrong because
18
+ amounts are text.
19
+
20
+ **What they want:** drop the file in, get a *trustworthy* clean file back, and
21
+ a plain sentence telling them what was wrong so they can vouch for it to their
22
+ boss. They do **not** want 30 config toggles. Hands-off is the whole pitch.
23
+
24
+ **What they fear (must design against):** that the tool silently changed
25
+ something it shouldn't have. Trust is the product. Every change must be
26
+ **visible, explained, and reversible**.
27
+
28
+ ## 2. Taxonomy of "dirty" — what we must detect & fix
29
+
30
+ Grouped by how an office worker would describe it. This list *is* the operation
31
+ set the planner emits and the executor implements.
32
+
33
+ ### A. Structural / table-level
34
+ - **Exact duplicate rows** — "this person is in here 3 times."
35
+ - **Near-duplicate rows** — same entity, trivial differences (later/stretch).
36
+ - **Empty rows & empty columns** — junk from the export.
37
+ - **Header problems** — header not in row 1, merged cells, `Unnamed: 0`,
38
+ duplicated column names, units baked into headers (`Amount (USD)`).
39
+ - **Inconsistent column naming** — `First Name` vs `first_name` (normalize to
40
+ snake_case as an option, off by default — it's a rename, higher-trust-risk).
41
+
42
+ ### B. Whitespace & casing (the silent killers behind failed joins)
43
+ - Leading/trailing whitespace; doubled internal spaces; non-breaking spaces.
44
+ - Inconsistent casing (`ACME`, `Acme`, `acme corp`).
45
+ - Invisible characters (zero-width, BOM), smart quotes.
46
+
47
+ ### C. Missing values, disguised
48
+ - Real blanks **plus** disguised nulls: `N/A`, `na`, `-`, `--`, `null`, `None`,
49
+ `#N/A`, `TBD`, `?`, `0` (context-dependent — risky, don't auto-assume).
50
+ - Decision: normalize disguised nulls → true missing; **imputation is opt-in**,
51
+ never silent (filling values is a claim about reality).
52
+
53
+ ### D. Type & format inconsistency (where the model earns its keep)
54
+ - **Numbers stored as text:** `"$1,200.50"`, `"1.200,50"` (EU), `"(500)"`
55
+ (accounting negative), `"12%"`, `"1,2k"`.
56
+ - **Dates in mixed formats:** `2023-01-05`, `01/05/2023`, `5 Jan 2023`,
57
+ `Jan-23`, Excel serial `44931`. Ambiguous DMY vs MDY must be detected, not
58
+ guessed blindly — infer from the column's evidence, flag if undecidable.
59
+ - **Booleans:** `Yes/No`, `Y/N`, `TRUE/FALSE`, `1/0`, `T/F`, `✓`.
60
+ - **Phone numbers:** wildly inconsistent; standardize to E.164-ish where region
61
+ is inferable, else just strip to digits + canonical format.
62
+ - **Emails:** casing, whitespace, obvious typos (`@gmial.com`), trailing junk.
63
+
64
+ ### E. Categorical canonicalization (the headline AI feature)
65
+ - Inconsistent labels for the same thing: `USA / U.S.A. / United States / us`,
66
+ `M/F vs Male/Female`, `NY / New York / new york`, status fields, product
67
+ names. Rules can't enumerate these — **the small model proposes the mapping**,
68
+ the executor applies it, the report shows the mapping for approval.
69
+
70
+ ### F. Validity / anomaly flags (flag, don't auto-delete)
71
+ - Out-of-range numbers (age 999, negative price), impossible dates (1899-12-31
72
+ Excel epoch), malformed emails/phones, values that don't match the column's
73
+ inferred type. Default action = **flag in the report**, not silent edit.
74
+
75
+ ## 3. The trust contract (design principles)
76
+
77
+ 1. **Visible** — every operation appears in a before/after diff and the report.
78
+ 2. **Explained** — plain-English rationale per operation ("standardized 4 date
79
+ formats into ISO `YYYY-MM-DD`").
80
+ 3. **Conservative by default** — destructive/assumptive ops (imputation, row
81
+ deletion beyond exact dups, renames) are surfaced as suggestions, applied
82
+ only if the user keeps them on. Safe ops (trim whitespace, normalize disguised
83
+ nulls, parse types) are on by default.
84
+ 4. **Reversible** — original file untouched; output is a new file + a machine-
85
+ readable plan the user could replay or undo.
86
+ 5. **No config to start** — sensible defaults run immediately on upload; the
87
+ plan is editable *after* the user sees it, not a wall of options before.
88
+
89
+ ## 4. Competitive landscape (what to learn / what to beat)
90
+
91
+ | Tool | What it does well | Why an office worker bounces |
92
+ |------|-------------------|------------------------------|
93
+ | **Excel / Power Query** | Ubiquitous, trusted | Manual; canonicalization is hand-built; steep |
94
+ | **OpenRefine** | Powerful clustering/canonicalization (key-collision, kNN) | Intimidating UI, GREL expressions, local Java app |
95
+ | **ydata-profiling / pandas-profiling** | Great *profiling* report | Diagnoses, doesn't *fix* |
96
+ | **Trifacta / Tableau Prep / Alteryx** | Visual prep pipelines | Enterprise, paid, config-heavy |
97
+ | **OpenRefine reconciliation** | Entity canonicalization | Manual, needs setup |
98
+
99
+ **Our wedge:** OpenRefine's clustering *automated and explained by a small
100
+ model*, with zero config and a one-screen trust-preserving UX. We borrow
101
+ OpenRefine's clustering idea but the model proposes the clusters/mappings and
102
+ narrates them, so the user never learns a tool — they just approve sentences.
103
+
104
+ ## 5. Cleaning-plan schema (v0 — drives the mock & later the model)
105
+
106
+ The model outputs this JSON; the executor consumes it. Designed so the model
107
+ only does *semantic/fuzzy* judgment, and all execution is deterministic.
108
+
109
+ ```json
110
+ {
111
+ "dataset_summary": "Contacts export, 38 rows × 9 cols; sales-lead data.",
112
+ "table_operations": [
113
+ {"op": "drop_exact_duplicates", "rationale": "5 identical rows."},
114
+ {"op": "drop_empty_rows"},
115
+ {"op": "drop_empty_columns", "columns": ["notes2"]}
116
+ ],
117
+ "columns": [
118
+ {
119
+ "name": "country",
120
+ "detected_semantic_type": "country",
121
+ "issues": ["inconsistent_categories", "whitespace", "casing"],
122
+ "operations": [
123
+ {"op": "strip_whitespace"},
124
+ {"op": "canonicalize_categories",
125
+ "mapping": {"usa": "United States", "u.s.a.": "United States",
126
+ "us": "United States", "uk": "United Kingdom"},
127
+ "rationale": "Unified 4 spellings into 2 canonical country names."}
128
+ ],
129
+ "confidence": 0.93
130
+ },
131
+ {
132
+ "name": "amount",
133
+ "detected_semantic_type": "currency",
134
+ "issues": ["numeric_stored_as_text", "currency_symbols"],
135
+ "operations": [
136
+ {"op": "parse_currency", "rationale": "Stripped $ and thousands separators; → float."}
137
+ ],
138
+ "confidence": 0.97
139
+ }
140
+ ],
141
+ "flags": [
142
+ {"column": "age", "row_hint": "value 999", "issue": "out_of_range",
143
+ "action": "flag_only", "rationale": "Likely placeholder; left for human review."}
144
+ ]
145
+ }
146
+ ```
147
+
148
+ ### Operation vocabulary (executor must implement)
149
+ Safe-by-default: `strip_whitespace`, `collapse_internal_whitespace`,
150
+ `normalize_disguised_nulls`, `standardize_case`, `parse_currency`,
151
+ `parse_number`, `parse_percent`, `parse_date`, `standardize_boolean`,
152
+ `standardize_phone`, `normalize_email`, `drop_exact_duplicates`,
153
+ `drop_empty_rows`, `drop_empty_columns`, `canonicalize_categories`.
154
+ Opt-in (assumptive): `impute_missing`, `drop_near_duplicates`,
155
+ `rename_columns_snake_case`, `coerce_outliers`.
156
+ Flag-only: `flag_out_of_range`, `flag_invalid_format`, `flag_type_mismatch`.
157
+
158
+ ## 6. Success metric for the demo (Backyard AI judging)
159
+
160
+ A real office person uploads a real ugly export, clicks one button, and says
161
+ "oh thank god" — then trusts the result enough to use it, because the report
162
+ told them exactly what changed. That sentence is the bar.
README.md ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ScrubData
3
+ emoji: 🏔️
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 6.16.0
8
+ app_file: server.py
9
+ pinned: true
10
+ license: mit
11
+ tags:
12
+ - track:backyard
13
+ - sponsor:openai
14
+ - sponsor:modal
15
+ - achievement:offgrid
16
+ - achievement:welltuned
17
+ - achievement:offbrand
18
+ - achievement:llama
19
+ - achievement:sharing
20
+ - achievement:fieldnotes
21
+ ---
22
+ # ScrubData — hands-off data cleaning, with the receipts
23
+
24
+ Entry for the **Build Small Hackathon** (Gradio · Hugging Face), 🏡 Backyard AI track.
25
+ Runs a ≤4B model — a local-runnable GGUF, no third-party AI APIs → also in the running for
26
+ **Tiny Titan**, **Off-Brand**, **Best Demo**, **Best Agent**, and **Bonus Quest Champion**
27
+ (all six quests claimed above).
28
+
29
+ <!-- SUBMISSION LINKS (all set for June 15):
30
+ Demo video: https://www.loom.com/share/2fa868147527496e8097d82dd546d663 [DONE]
31
+ Social post: https://x.com/ric_alanis/status/2066598533738692983 [DONE]
32
+ These links + this write-up are required by the build-small-hackathon /submit tool. -->
33
+
34
+ > **Hosted demo vs. local — read this.** This Space is a **no-install demo** that cleans with
35
+ > the real **Qwen3-4B fine-tune** by default (served on an A100 GPU, ~1 min/clean warm; first
36
+ > run after idle ~2 min on cold start) — the whole point
37
+ > is the small model doing the work. Your file is processed on Hugging Face / the GPU endpoint
38
+ > (sent to no third-party API, not stored); untick the box for an instant deterministic pass.
39
+ > The **privacy story is a property of running it yourself**: `SCRUBDATA_MODEL=scrubdata-ft uv
40
+ > run server.py` reads and cleans your file on-device with the same fine-tune — nothing leaves
41
+ > your machine. The app labels its own mode honestly (the ribbon says which one you're using).
42
+ > Same auditable plan→verify→execute pipeline either way.
43
+
44
+ > **Modal** (`sponsor:modal`): the hosted Space cleans with the Qwen3-4B fine-tune served from a
45
+ > **scale-to-zero Modal GPU endpoint** (`scripts/modal_serve.py`, Ollama on an A100; $0 when idle,
46
+ > pre-warmed on page load to hide the cold start). Modal also drove the headless training +
47
+ > evaluation loop behind the published model. The deterministic planner is the silent fallback
48
+ > if the GPU is cold or down, so the demo never hard-fails.
49
+
50
+ > **Drop a messy export. Get clean data back — every change named, reversible, and
51
+ > explained. Anything sensitive is protected locally. The judgment calls stay yours.**
52
+ >
53
+ > For the office/ops person trying to do their job while their data is a mess.
54
+
55
+ **Built by:** [@ricalanis](https://huggingface.co/ricalanis) (solo) · 🤗 Hugging Face: `ricalanis`
56
+ **Live Space:** https://huggingface.co/spaces/build-small-hackathon/scrubdata
57
+ **Code (open source):** https://github.com/ricalanis/scrubdata-hackathon
58
+ **Demo video:** https://www.loom.com/share/2fa868147527496e8097d82dd546d663
59
+ **Write-up / post:** https://x.com/ric_alanis/status/2066598533738692983
60
+
61
+ ## How it works
62
+
63
+ A small local model is the **planner**, never a row-by-row editor:
64
+
65
+ 1. **Profile** — pandas aggregates each column into a value–frequency distribution
66
+ (scale-invariant: a million rows profile like a hundred).
67
+ 2. **Plan** — the model reads the profile and emits a structured JSON cleaning plan:
68
+ canonicalization mappings, format fixes, dedup, anomaly flags.
69
+ 3. **Ground** — canonical forms are never invented: values reconcile against reference
70
+ taxonomies (GeoNames 196k cities, ISO countries/states, and a pluggable **entity
71
+ reference** built from harvested vocabularies — ToughTables/MusicBrainz/Wikidata/ROR,
72
+ ~100k entities) with fuzzy retrieval; ambiguous matches **abstain** and surface for
73
+ human review (calibrated: 90% precision at the default threshold, ≥95% at 0.91).
74
+ Profiles carry **suspect_values** — rare anomalous surfaces with evidence-backed
75
+ candidates — so high-cardinality columns are no longer invisible to the planner
76
+ (measured: five all-unique-surface benchmark tables went 0.0 → 0.96 F1 at zero damage).
77
+ 4. **Verify** — every model-proposed mapping is scored by deterministic evidence
78
+ (errors-are-rare frequency gates, variant similarity, reference agreement); entries
79
+ below the confidence threshold (`SCRUBDATA_TAU`, default 0.5) become review flags
80
+ instead of edits. The shipped **verified union planner** (gated model plan ∪ grounded
81
+ heuristic) measures **0.905 precision @ 0.413 coverage** on hospital's 509 real errors
82
+ — the gated model plan alone is 0.993 @ 0.287.
83
+ 5. **Protect** — PII is detected locally (Luhn/IBAN checksums + a 44M OpenMed-PII
84
+ classifier): cards/SSNs masked format-preservingly, contacts flagged, **0/360 residual
85
+ PII** after masking in our leak test.
86
+ 6. **Execute** — deterministic pandas applies the plan. No silent edits, by construction;
87
+ every run exports an audit trail (OpenTelemetry-GenAI spans + open traces).
88
+
89
+ **Model:** `Qwen3-4B-Instruct-2507` (Tiny Titan), QLoRA fine-tuned on **execution-verified**
90
+ synthetic + real-derived data (every training plan provably recovers the clean table),
91
+ runnable via llama.cpp GGUF.
92
+
93
+ ## The app (what judges see)
94
+ A custom `gr.Server` frontend (no default Gradio chrome — the **Off-Brand** quest), built
95
+ around the trust story:
96
+ - **YOUR CALL cards** — when the model is genuinely torn (e.g. *Slovia → Slovakia 86% vs
97
+ Slovenia 86%*) it abstains and hands you the tie with both candidates; pick the right one
98
+ and **stage several decisions**, then "✓ Clean now" replays them as one plan.
99
+ - **Named, reversible receipts** — every edit shows as a row in the audit grid with its op +
100
+ rationale and a before/after diff; nothing is silent.
101
+ - **PII review cards** — embedded cards/SSNs (Luhn/strict-regex) flagged and masked
102
+ format-preservingly, on-device.
103
+ - **Save / replay recipe** — export the cleaning plan as JSON and re-apply it to next week's
104
+ export in one click (the "Monday ritual").
105
+ - **Honest, self-aware copy** — the app injects its own runtime state and the ribbon says
106
+ exactly which planner ran and where your data was processed.
107
+ - **A fun, size-aware ETA timer** + cold-start readiness gate + page-load GPU pre-warm, so
108
+ the model path feels responsive and never lies about progress.
109
+ - Drag-and-drop, two bundled sample exports, mobile-responsive layout.
110
+
111
+ ## What real users told us (and what we changed)
112
+
113
+ Before submission we put the live Space in front of people who **aren't** data folks — the
114
+ exact audience the tool is for — and sent the link with one line: *"if you have a messy
115
+ spreadsheet, try it."* The most useful finding wasn't a bug. It was that the word
116
+ **"cleaning" didn't land**:
117
+
118
+ - One tester read "clean my Excel" as *deleting* data:
119
+ *"¿Te refieres a que elimine algo de algún archivo?"* — "You mean it removes something
120
+ from the file?"
121
+ - Another didn't know where to begin:
122
+ *"¿eso del Excel te lo subimos ahí o cómo?"* — "the Excel thing, do we upload it there,
123
+ or how?"
124
+ - The clearest explanation in the whole thread was one we had to type by hand in chat:
125
+ *"it fixes text errors — names, phones, emails, cities."* That sentence wasn't anywhere
126
+ in the product.
127
+
128
+ So we changed the product to **show** what cleaning means instead of naming it:
129
+
130
+ - the hero now leads with a literal before→after strip
131
+ (`nigeia → Nigeria`, `Calfornia → California`, `Ana@GMAIL.com → ana@gmail.com`,
132
+ `415.555.0192 → (415) 555-0192`) so the value is obvious *before* any upload;
133
+ - the headline is the sentence that worked in chat — **"Fix the messy text in your
134
+ spreadsheet"** — and the copy says plainly **"I never delete your data"** (killing the
135
+ "does it erase things?" misread);
136
+ - a one-click **"watch it run on a sample file"** path removes the "where do I start?" wall;
137
+ - jargon labels are gone ("HR payroll (with PII)" → "an HR file with sensitive data").
138
+
139
+ n is small and informal (friends-and-network, ~3 people), so this isn't a usability *study* —
140
+ but the feedback was real, it pointed at a failure of the *framing* rather than the engine,
141
+ and it changed the build. The persona "Maria" below is the controlled walk-through; the
142
+ quotes above are verbatim from people we know.
143
+
144
+ ## Measured (not vibes)
145
+
146
+ - **Canonicalization micro-F1 0.90 (best single run; 0.80 ± 0.01 over 3 training seeds)** for the 4B
147
+ fine-tune vs **0.45** for a much larger generic model vs **0.15** for rules.
148
+ - Real errors (5-benchmark macro): grounded cleaning reaches REAL-F1 **0.225**, 3.9×
149
+ OpenRefine kNN (0.058) and 5.7× fingerprint (0.039); the verified-union gate repairs
150
+ 41% of hospital's 509 real errors at **0.905 precision**, every declined merge
151
+ surfaced for review.
152
+ - Evaluated on a **65-dataset suite** (Raha benchmarks + seeded error injection over 15
153
+ open-data domains) with a churn-neutral metric that can't be gamed by mass rewriting.
154
+ - Full write-up: `docs/paper/` (preprint draft) · details in `eval/README.md`.
155
+
156
+ ## Run it
157
+
158
+ ```bash
159
+ uv sync
160
+ uv run server.py # gr.Server + custom UI (grounded heuristic)
161
+
162
+ # fine-tuned model as planner (needs Ollama + the GGUF, see notebooks/Modelfile):
163
+ ollama pull hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8:Q8_0
164
+ ollama create scrubdata-ft -f notebooks/Modelfile
165
+ SCRUBDATA_MODEL=scrubdata-ft uv run server.py # model planner, heuristic fallback (on-device)
166
+
167
+ SCRUBDATA_PII_NER=1 uv run server.py # +44M NER for name/address columns
168
+ uv run python -m scrubdata.cli messy.csv -o clean.csv --plan plan.json
169
+ uv run pytest tests/ # engine + scorer tests (84)
170
+ ```
171
+
172
+ The hosted Space serves the same fine-tune from a scale-to-zero **Modal A100**
173
+ (`scripts/modal_serve.py`) and the planner adds `format=json` on that path
174
+ (`SCRUBDATA_OLLAMA_FORMAT_JSON=1`) to grammar-constrain the GGUF on the A100's kernels.
175
+ `scripts/modal_warm.py on|off` pins/un-pins a warm container (no cold start) without a
176
+ redeploy — leave it `off` (scale-to-zero, $0 idle), flip `on` for a live judging window.
177
+
178
+ ## Repo map
179
+ - `scrubdata/` — `profiler` · `planner` · `reconcile` (reference grounding + abstain) ·
180
+ `grounded` (RACOON wrapper) · `verifier` (selective prediction + union planner) ·
181
+ `pair_profile` (candidate-constrained canonicalization, opt-in) · `pii` (checksum +
182
+ NER tiers, mask/hash/pseudonymize) · `executor` · `observability` · `trace` ·
183
+ `baselines` (OpenRefine) · `cli`.
184
+ - `training/` — execution-verified synthetic generator + real-data derivation
185
+ (`real_data.py`: paired benchmarks + frequency-derived unpaired open data).
186
+ - `eval/` — frozen gold · wide suite + double-macro north-star (`run_real_multi.py`) ·
187
+ ablations · calibration (risk–coverage) · PII leak test.
188
+ - `docs/paper/` — preprint: *Verified Cleaning Plans: Plan-Level Selective Prediction
189
+ Turns Local LLM Planners into Trustworthy Table Cleaners*.
190
+ - `scripts/` — Modal train/eval (headless GPU loop), trace publishing.
191
+
192
+ ## Research & resources
193
+ Everything behind the demo is public:
194
+ - 🚀 **Live Space** — https://huggingface.co/spaces/build-small-hackathon/scrubdata
195
+ - 💻 **Code (open source)** — https://github.com/ricalanis/scrubdata-hackathon
196
+ - 🧠 **Fine-tuned model** — https://huggingface.co/ricalanis/scrubdata-qwen3-4b
197
+ (Q8_0 GGUF: https://huggingface.co/ricalanis/scrubdata-qwen3-4b-v6-q8)
198
+ - 📊 **WildClean dataset** (real-world dirty tables + injected-error benches) —
199
+ https://huggingface.co/datasets/ricalanis/wildclean
200
+ - 🔍 **Agent traces** (OpenTelemetry-GenAI spans from real runs) —
201
+ https://huggingface.co/datasets/build-small-hackathon/scrubdata-traces
202
+ - 📄 **Preprint** — *Verified Cleaning Plans: Plan-Level Selective Prediction Turns Local
203
+ LLM Planners into Trustworthy Table Cleaners* (`docs/paper/main.pdf`)
204
+ - 📓 **Field notes** (the build story, failures included) — `docs/FIELD_NOTES.md`
205
+ - 🛠️ **Tool reference** (the whole system, end to end) — `docs/TOOL_REFERENCE.md`
206
+
207
+ ## Built with Codex
208
+ The final review-and-refine pass used **OpenAI Codex** (gpt-5.5) as a reviewer / last
209
+ refiner — not to write the product, but to harden it. It added the executor's
210
+ never-corrupt-clean-data regression tests, made column sanitization collision-proof,
211
+ did the accessibility pass (ARIA + keyboard + reduced-motion + focus-visible), and wrote
212
+ characterization tests for the reference matcher. Every change was human-reviewed and
213
+ verified green (84 tests, golden behavior unchanged) before commit; the commits are
214
+ attributed to `@codex` in the git history.
215
+
216
+ ## Submission checklist (verified against the build-small-hackathon `/submit` tool)
217
+ - [x] Public Gradio Space in the `build-small-hackathon` org
218
+ - [x] Every model ≤ 32B (here ≤ 4B → **Tiny Titan**-eligible): `Qwen3-4B-Instruct-2507`
219
+ - [x] README `tags:` set — `track:backyard` + all six `achievement:*` quests (above)
220
+ - [x] **Off the Grid** (`offgrid`) — no third-party AI APIs; the planner is a local-runnable GGUF (Qwen3-4B). Self-hosted = fully on-device (zero external egress); the hosted demo serves the *same* model from a self-managed Modal GPU, not a SaaS API
221
+ - [x] **Well-Tuned** (`welltuned`) — fine-tune published: `ricalanis/scrubdata-qwen3-4b` (+ `-v6-q8` GGUF)
222
+ - [x] **Off-Brand** (`offbrand`) — custom `gr.Server` HTML/CSS frontend, not default Gradio
223
+ - [x] **Llama Champion** (`llama`) — runs through llama.cpp (Q8_0 GGUF)
224
+ - [x] **Sharing is Caring** (`sharing`) — agent traces on the Hub: `build-small-hackathon/scrubdata-traces`
225
+ - [x] **Field Notes** (`fieldnotes`) — build report: `docs/FIELD_NOTES.md`
226
+ - [x] Write-up in this README (idea + tech)
227
+ - [x] **Demo video** link in README: https://www.loom.com/share/2fa868147527496e8097d82dd546d663
228
+ - [x] **Social post** link in README: https://x.com/ric_alanis/status/2066598533738692983
229
+ - [x] Confirm deadline time/timezone — **June 15 2026, 23:59 UTC** (confirmed on the hackathon page)
230
+
231
+ Judged (no tag needed, just qualify): Tiny Titan · Off-Brand prize · Best Demo · Best Agent · Bonus Quest Champion.
TRANSFER.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Machine transfer guide
2
+
3
+ Everything needed to continue this project on a new machine.
4
+
5
+ ## 1. Clone + deps
6
+ ```bash
7
+ git clone https://github.com/ricalanis/scrubdata-hackathon.git ~/Dev/hackaton-small
8
+ cd ~/Dev/hackaton-small && uv sync
9
+ uv run pytest tests/ # all tests should pass (84 as of this commit)
10
+ ```
11
+
12
+ ## 2. Restore Claude Code memory (IMPORTANT)
13
+ The agent's persistent memory is bundled in `project-memory/`. On the new machine, after
14
+ opening the project in Claude Code once (so the project dir exists):
15
+ ```bash
16
+ cp project-memory/*.md ~/.claude/projects/-Users-<USER>-Dev-hackaton-small/memory/
17
+ ```
18
+ (Adjust the path-keyed directory name to the new machine's project path. `MEMORY.md` is the
19
+ index; the rest are the knowledge base — data-loop-playbook.md and arxiv-paper.md are the
20
+ operational core.)
21
+
22
+ ## 3. Cloud auth (state lives in the cloud, just re-authenticate)
23
+ ```bash
24
+ uv run modal token new # Modal: adapters in volume scrubdata-v5-adapter
25
+ # (/v5 = v5, /v5_seed21 = v6/mixA winner, seeds 1-3,25,26)
26
+ # results Dicts: scrubdata-train-results (seedN keys),
27
+ # scrubdata-eval-v5-results, scrubdata-suite-results
28
+ hf auth login # HF: Space build-small-hackathon/scrubdata, model repos
29
+ # ricalanis/scrubdata-qwen3-4b{,-v6-q8}, traces dataset
30
+ gh auth login # GitHub
31
+ ```
32
+
33
+ ## 4. Local model (optional, 4.3GB)
34
+ ```bash
35
+ ollama pull hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8:Q8_0
36
+ ollama create scrubdata-ft-v6 -f notebooks/Modelfile
37
+ SCRUBDATA_MODEL=scrubdata-ft-v6 uv run server.py
38
+ ```
39
+
40
+ ## 5. Regenerable data (data/ is gitignored)
41
+ Harvested alias vocabularies + paired examples are PRESERVED in `training/harvests/` —
42
+ copy them back so the generator finds them:
43
+ ```bash
44
+ mkdir -p data && cp training/harvests/*.jsonl data/
45
+ ```
46
+ Big training mixes are regenerable:
47
+ ```bash
48
+ uv run python -m training.build_dataset --n 1600 --out data/v5_synth.jsonl --seed 5
49
+ uv run python -m training.real_data --datasets hospital beers movies_1 --per-dataset 80 --out data/v6_paired_big.jsonl
50
+ # mix recipe (mixA = winner): synth + paired*4, shuffled -> data/v5_train.jsonl
51
+ ```
52
+ The eval suite re-fetches Raha benchmarks automatically; harvested gov/GitHub CSVs
53
+ (data/real/cache) re-download via training/unpaired_sources.json.
54
+
55
+ ## 6. In-flight at transfer time
56
+ - mixH (additive-composition test, seed 30): Modal call `fc-01KTRXTHJKW3G81BT4Q0FZET8G`,
57
+ result lands in Dict `scrubdata-train-results` key `seed30`. Retrieve from any machine:
58
+ ```bash
59
+ uv run python -c "import modal; print(modal.Dict.from_name('scrubdata-train-results').get('seed30'))"
60
+ ```
61
+ - Open question it answers: whether the vocab-mix regressions (mixE/F/G ~0.57-0.59 vs mixA
62
+ 0.748) were eval-coverage shift. See project-memory/data-loop-playbook.md.
63
+
64
+ ## 7. Where everything lives
65
+ - Paper: `docs/paper/main.tex` (+ numbers.tex, fig) — compiles with pdflatex; COMPLETE.
66
+ - Submission kit: `docs/SUBMISSION.md` (demo script + social post), `docs/FIELD_NOTES.md`.
67
+ - Live Space: https://huggingface.co/spaces/build-small-hackathon/scrubdata
68
+ - arXiv next steps: cs.DB endorser etc. — project-memory/arxiv-paper.md.
69
+ - Hackathon deadline: 2026-06-15, 23:59 UTC (demo video + social post are done; links are in README.md).
app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ScrubData — hands-off data cleaning (Gradio app).
2
+
3
+ Runnable MOCK demo on gr.Blocks: upload → profile → plan → clean → diff +
4
+ report → download. The planner is the fine-tuned ≤4B model when SCRUBDATA_MODEL
5
+ is set and a heuristic stand-in otherwise; the rest of the pipeline is real. Final version will port this flow to
6
+ gr.Server + a custom HTML frontend for the Off-Brand bonus quest.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import tempfile
12
+ from pathlib import Path
13
+
14
+ import gradio as gr
15
+ import pandas as pd
16
+
17
+ from scrubdata import apply_plan, mock_plan, profile_dataframe, render_report
18
+ from scrubdata.active import get_planner
19
+ from scrubdata.trace import log_run
20
+
21
+ PLANNER = get_planner() # fine-tuned model if SCRUBDATA_MODEL is set, else heuristic
22
+
23
+ SAMPLE = Path(__file__).parent / "samples" / "dirty_contacts.csv"
24
+
25
+
26
+ def _read_any(path: str) -> pd.DataFrame:
27
+ """Read CSV or Excel as raw strings (cleaning decides the real types)."""
28
+ p = Path(path)
29
+ if p.suffix.lower() in {".xlsx", ".xls"}:
30
+ return pd.read_excel(p, dtype=str)
31
+ return pd.read_csv(p, dtype=str, keep_default_na=False)
32
+
33
+
34
def clean(file_path: str):
    """Run upload → profile → plan → clean → report for one file.

    Args:
        file_path: Path of the uploaded CSV/Excel file; falsy when nothing
            has been uploaded yet.

    Returns:
        A 4-tuple ``(raw, cleaned, report, download_path)`` matching the
        click handler's outputs. When no file is given, the two tables are
        left untouched via ``gr.update()``, the report slot carries a prompt
        message, and the download slot is ``None``.
    """
    if not file_path:
        return (gr.update(), gr.update(), "Upload a CSV or Excel file to begin.", None)

    raw = _read_any(file_path)
    before = profile_dataframe(raw)
    plan = PLANNER(raw)
    cleaned, log = apply_plan(raw, plan)
    after = profile_dataframe(cleaned)
    report = render_report(plan, log, before, after)

    # Write into a fresh directory per call: a single fixed "<tmp>/scrubbed.csv"
    # would be overwritten by any concurrent request, handing one user another
    # user's cleaned file. The file name itself stays "scrubbed.csv".
    out = Path(tempfile.mkdtemp(prefix="scrubdata_")) / "scrubbed.csv"
    cleaned.to_csv(out, index=False)

    try:  # best-effort agent-trace capture (Open trace bonus quest)
        log_run(before, raw, plan, log, model=plan.get("_generated_by", "mock_planner"))
    except Exception:
        # Still best-effort (never fails the request), but leave a record
        # instead of discarding the error silently.
        import logging

        logging.getLogger(__name__).debug("agent-trace capture failed", exc_info=True)

    return raw, cleaned, report, str(out)
54
+
55
+
56
def load_sample():
    """Return the bundled sample file's path as a string."""
    sample_path = SAMPLE
    return str(sample_path)
58
+
59
+
60
+ with gr.Blocks(title="ScrubData") as demo:
61
+ gr.Markdown(
62
+ "# 🧽 ScrubData\n"
63
+ "**Upload your dirty spreadsheet. Get clean data back. No config.**\n\n"
64
+ "_Mock demo — heuristic planner standing in for the fine-tuned model._"
65
+ )
66
+
67
+ with gr.Row():
68
+ file_in = gr.File(label="Upload CSV / Excel", file_types=[".csv", ".xlsx", ".xls"],
69
+ type="filepath")
70
+ with gr.Column():
71
+ run_btn = gr.Button("🧽 Clean it", variant="primary")
72
+ sample_btn = gr.Button("Use the messy sample")
73
+
74
+ with gr.Row():
75
+ with gr.Column():
76
+ gr.Markdown("### Before")
77
+ before_df = gr.Dataframe(label="Original", interactive=False, wrap=True)
78
+ with gr.Column():
79
+ gr.Markdown("### After")
80
+ after_df = gr.Dataframe(label="Cleaned", interactive=False, wrap=True)
81
+
82
+ report_md = gr.Markdown()
83
+ download = gr.File(label="Download cleaned file")
84
+
85
+ run_btn.click(clean, inputs=file_in, outputs=[before_df, after_df, report_md, download])
86
+ sample_btn.click(load_sample, outputs=file_in)
87
+
88
+
89
+ if __name__ == "__main__":
90
+ demo.launch(theme=gr.themes.Soft())
design/mockups/calm/index.html ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="es">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>ScrubData — Tu lista, ordenada con calma</title>
7
+ <style>
8
+ :root{
9
+ --paper:#fbf7f0;
10
+ --paper-2:#fffdf9;
11
+ --ink:#3a3530;
12
+ --ink-soft:#6f675d;
13
+ --line:#ece4d6;
14
+ --accent:#7ba087; /* single calm sage accent */
15
+ --accent-soft:#e8f0ea;
16
+ --accent-deep:#5e8470;
17
+ --warm:#d8a25e; /* gentle merit-badge gold, used sparingly */
18
+ --shadow:0 14px 40px -22px rgba(80,70,55,.45);
19
+ --radius:26px;
20
+ }
21
+ *{box-sizing:border-box;}
22
+ html,body{margin:0;padding:0;}
23
+ body{
24
+ font-family:"Iowan Old Style","Palatino Linotype",Palatino,Georgia,"Times New Roman",serif;
25
+ background:
26
+ radial-gradient(120% 80% at 50% -10%, #fffdf9 0%, var(--paper) 55%, #f5efe4 100%);
27
+ color:var(--ink);
28
+ line-height:1.6;
29
+ -webkit-font-smoothing:antialiased;
30
+ min-height:100vh;
31
+ display:flex;
32
+ flex-direction:column;
33
+ align-items:center;
34
+ padding:34px 20px 70px;
35
+ }
36
+ ::selection{background:var(--accent-soft);}
37
+
38
+ /* ---------- top bar ---------- */
39
+ .topbar{
40
+ width:100%;
41
+ max-width:760px;
42
+ display:flex;
43
+ align-items:center;
44
+ justify-content:space-between;
45
+ margin-bottom:30px;
46
+ }
47
+ .brand{display:flex;align-items:center;gap:11px;}
48
+ .leaf{width:34px;height:34px;flex:none;}
49
+ .brand-name{font-size:1.18rem;font-weight:600;letter-spacing:.2px;}
50
+ .brand-name small{display:block;font-size:.72rem;color:var(--ink-soft);letter-spacing:.4px;font-weight:400;}
51
+ .lang{
52
+ display:flex;background:var(--paper-2);border:1px solid var(--line);
53
+ border-radius:999px;padding:3px;font-family:system-ui,sans-serif;font-size:.8rem;
54
+ }
55
+ .lang button{
56
+ border:none;background:transparent;color:var(--ink-soft);
57
+ padding:6px 14px;border-radius:999px;cursor:pointer;font-weight:600;letter-spacing:.3px;
58
+ }
59
+ .lang button.on{background:var(--accent);color:#fff;}
60
+
61
+ /* ---------- shared card ---------- */
62
+ .stage{width:100%;max-width:760px;}
63
+ .card{
64
+ background:var(--paper-2);
65
+ border:1px solid var(--line);
66
+ border-radius:var(--radius);
67
+ box-shadow:var(--shadow);
68
+ padding:46px 44px;
69
+ }
70
+ .screen{display:none;}
71
+ .screen.active{display:block;animation:rise .6s ease both;}
72
+ @keyframes rise{from{opacity:0;transform:translateY(14px);}to{opacity:1;transform:none;}}
73
+
74
+ h1{font-size:2.05rem;line-height:1.25;margin:0 0 12px;font-weight:600;letter-spacing:.2px;}
75
+ .lede{font-size:1.18rem;color:var(--ink-soft);margin:0 0 30px;max-width:46ch;}
76
+
77
+ /* persistent safety strip */
78
+ .safety{
79
+ display:flex;align-items:center;gap:12px;
80
+ background:var(--accent-soft);
81
+ border-radius:18px;
82
+ padding:14px 18px;
83
+ margin-top:26px;
84
+ font-family:system-ui,sans-serif;
85
+ font-size:.95rem;
86
+ color:var(--accent-deep);
87
+ }
88
+ .safety svg{flex:none;}
89
+ .safety b{font-weight:600;}
90
+
91
+ /* ---------- screen 1: drop ---------- */
92
+ .drop{
93
+ border:2px dashed #cdbfa6;
94
+ background:linear-gradient(180deg,#fffefb,#fbf6ec);
95
+ border-radius:24px;
96
+ padding:54px 30px;
97
+ text-align:center;
98
+ cursor:pointer;
99
+ transition:border-color .25s, background .25s, transform .25s;
100
+ }
101
+ .drop:hover{border-color:var(--accent);background:#fbfaf4;transform:translateY(-2px);}
102
+ .drop .basket{font-size:2.6rem;display:block;margin-bottom:10px;}
103
+ .drop .big{font-size:1.32rem;font-weight:600;margin-bottom:4px;}
104
+ .drop .sub{color:var(--ink-soft);font-family:system-ui,sans-serif;font-size:.95rem;}
105
+ .filechip{
106
+ display:inline-flex;align-items:center;gap:9px;margin-top:22px;
107
+ background:#fff;border:1px solid var(--line);border-radius:14px;
108
+ padding:9px 15px;font-family:system-ui,sans-serif;font-size:.9rem;color:var(--ink);
109
+ }
110
+ .filechip .dot{width:9px;height:9px;border-radius:50%;background:var(--accent);}
111
+
112
+ .btn{
113
+ font-family:system-ui,sans-serif;font-size:1.06rem;font-weight:600;
114
+ border:none;border-radius:16px;cursor:pointer;padding:16px 30px;
115
+ transition:transform .15s, box-shadow .25s, background .2s;
116
+ }
117
+ .btn-primary{
118
+ background:var(--accent);color:#fff;
119
+ box-shadow:0 10px 24px -12px rgba(94,132,112,.9);
120
+ width:100%;margin-top:26px;
121
+ }
122
+ .btn-primary:hover{background:var(--accent-deep);transform:translateY(-2px);}
123
+ .btn-ghost{
124
+ background:transparent;color:var(--accent-deep);border:1px solid #cfe0d4;
125
+ }
126
+ .btn-ghost:hover{background:var(--accent-soft);}
127
+
128
+ /* ---------- screen 2: working ---------- */
129
+ .working{text-align:center;padding:30px 10px 14px;}
130
+ .breath{
131
+ width:120px;height:120px;margin:6px auto 26px;border-radius:50%;
132
+ background:radial-gradient(circle at 50% 50%, var(--accent-soft), #fff);
133
+ border:1px solid var(--line);
134
+ display:flex;align-items:center;justify-content:center;
135
+ animation:breathe 3.4s ease-in-out infinite;
136
+ }
137
+ .breath span{font-size:2.4rem;}
138
+ @keyframes breathe{0%,100%{transform:scale(1);box-shadow:0 0 0 0 rgba(123,160,135,.25);}50%{transform:scale(1.07);box-shadow:0 0 0 18px rgba(123,160,135,0);}}
139
+ .working h1{font-size:1.7rem;}
140
+ .steps{list-style:none;padding:0;margin:24px auto 0;max-width:380px;text-align:left;font-family:system-ui,sans-serif;}
141
+ .steps li{
142
+ display:flex;align-items:center;gap:12px;padding:9px 0;color:var(--ink-soft);font-size:1rem;
143
+ opacity:.35;transition:opacity .4s;
144
+ }
145
+ .steps li.done{opacity:1;color:var(--ink);}
146
+ .steps li .tick{
147
+ width:22px;height:22px;border-radius:50%;border:2px solid #d8cdb8;flex:none;
148
+ display:flex;align-items:center;justify-content:center;font-size:.8rem;color:#fff;background:transparent;
149
+ }
150
+ .steps li.done .tick{background:var(--accent);border-color:var(--accent);}
151
+
152
+ /* ---------- screen 3: result ---------- */
153
+ .result-head{display:flex;align-items:flex-start;gap:16px;margin-bottom:8px;}
154
+ .badge{
155
+ width:62px;height:62px;flex:none;
156
+ }
157
+ .h-eyebrow{font-family:system-ui,sans-serif;font-size:.82rem;letter-spacing:1.4px;text-transform:uppercase;color:var(--accent-deep);font-weight:700;}
158
+
159
+ .summary{
160
+ background:var(--paper);
161
+ border:1px solid var(--line);
162
+ border-radius:20px;
163
+ padding:24px 26px;
164
+ margin:22px 0 8px;
165
+ font-size:1.12rem;
166
+ }
167
+ .summary p{margin:0 0 14px;}
168
+ .summary p:last-child{margin-bottom:0;}
169
+ .summary .num{color:var(--accent-deep);font-weight:600;}
170
+
171
+ .section-title{
172
+ font-family:system-ui,sans-serif;font-size:.95rem;font-weight:700;
173
+ color:var(--ink-soft);letter-spacing:.4px;margin:34px 0 14px;
174
+ display:flex;align-items:center;gap:9px;
175
+ }
176
+ .section-title .pill{font-size:.7rem;background:var(--accent-soft);color:var(--accent-deep);padding:3px 9px;border-radius:999px;font-weight:700;}
177
+
178
+ /* change cards */
179
+ .change{
180
+ border:1px solid var(--line);border-radius:18px;background:#fff;
181
+ padding:18px 20px;margin-bottom:14px;
182
+ }
183
+ .change .lead{font-size:1.08rem;margin:0 0 12px;}
184
+ .change .lead b{color:var(--ink);}
185
+ .ba{display:flex;gap:10px;flex-wrap:wrap;font-family:system-ui,sans-serif;font-size:.9rem;}
186
+ .chip{
187
+ padding:7px 13px;border-radius:12px;border:1px solid var(--line);
188
+ background:var(--paper);color:var(--ink-soft);
189
+ }
190
+ .chip.after{background:var(--accent-soft);border-color:#cfe0d4;color:var(--accent-deep);font-weight:600;}
191
+ .arrow{align-self:center;color:#bdb3a1;font-family:system-ui,sans-serif;}
192
+
193
+ /* gentle question card */
194
+ .ask{
195
+ border:1px solid #e7dcc4;background:linear-gradient(180deg,#fffdf6,#fbf3e3);
196
+ border-radius:18px;padding:20px 22px;margin-bottom:14px;
197
+ }
198
+ .ask .q{font-size:1.1rem;margin:0 0 6px;}
199
+ .ask .why{font-family:system-ui,sans-serif;font-size:.92rem;color:var(--ink-soft);margin:0 0 16px;}
200
+ .ask .row{display:flex;gap:10px;}
201
+ .ask .btn{padding:11px 20px;font-size:.95rem;}
202
+
203
+ /* honest flags */
204
+ .flag{
205
+ display:flex;gap:12px;align-items:flex-start;
206
+ background:#fff;border:1px dashed #d8cdb8;border-radius:16px;padding:16px 18px;margin-bottom:12px;
207
+ }
208
+ .flag .mark{font-size:1.2rem;flex:none;}
209
+ .flag p{margin:0;font-size:1rem;}
210
+ .flag .small{font-family:system-ui,sans-serif;font-size:.88rem;color:var(--ink-soft);}
211
+
212
+ /* bonus card */
213
+ .bonus{
214
+ background:linear-gradient(180deg,#f4faf6,#eaf3ed);
215
+ border:1px solid #d3e6da;border-radius:20px;padding:22px 24px;margin-top:8px;
216
+ display:flex;gap:16px;align-items:center;
217
+ }
218
+ .bonus .ic{font-size:2rem;flex:none;}
219
+ .bonus h3{margin:0 0 4px;font-size:1.15rem;}
220
+ .bonus p{margin:0;color:var(--ink-soft);font-size:1.02rem;}
221
+
222
+ /* download zone */
223
+ .download{
224
+ margin-top:30px;text-align:center;
225
+ border-top:1px solid var(--line);padding-top:30px;
226
+ }
227
+ .download .btn-primary{width:auto;display:inline-block;padding:18px 44px;font-size:1.12rem;}
228
+ .download .aside{font-family:system-ui,sans-serif;font-size:.92rem;color:var(--ink-soft);margin-top:14px;}
229
+ .download .aside a{color:var(--accent-deep);text-decoration:underline;cursor:pointer;}
230
+
231
+ .reset{display:block;margin:26px auto 0;background:none;border:none;color:var(--ink-soft);
232
+ font-family:system-ui,sans-serif;font-size:.85rem;cursor:pointer;text-decoration:underline;}
233
+
234
+ @media(max-width:560px){
235
+ .card{padding:32px 24px;}
236
+ h1{font-size:1.7rem;}
237
+ .lede{font-size:1.05rem;}
238
+ }
239
+ </style>
240
+ </head>
241
+ <body>
242
+
243
+ <div class="topbar">
244
+ <div class="brand">
245
+ <svg class="leaf" viewBox="0 0 40 40" fill="none">
246
+ <path d="M20 36C8 30 6 16 12 8c8 2 18 8 16 22-1 4-4 6-8 6z" fill="#e8f0ea" stroke="#7ba087" stroke-width="1.6"/>
247
+ <path d="M20 34c-1-8 0-16 6-22" stroke="#7ba087" stroke-width="1.6" stroke-linecap="round"/>
248
+ <path d="M18 24c-2-1-4-3-5-6M22 18c2 0 5 0 7-1" stroke="#7ba087" stroke-width="1.4" stroke-linecap="round"/>
249
+ </svg>
250
+ <div class="brand-name">ScrubData<small>tu lista, ordenada con calma</small></div>
251
+ </div>
252
+ <div class="lang" aria-label="idioma">
253
+ <button class="on">ES</button>
254
+ <button>EN</button>
255
+ </div>
256
+ </div>
257
+
258
+ <div class="stage">
259
+
260
+ <!-- ============ SCREEN 1 : WELCOME + DROP ============ -->
261
+ <section class="screen active" id="s1">
262
+ <div class="card">
263
+ <h1>Hola, Doña Lupe.<br/>Vamos a ordenar su lista, sin prisa.</h1>
264
+ <p class="lede">Suelte aquí su archivo y yo le echo un ojo. Usted no tiene que configurar nada.</p>
265
+
266
+ <div class="drop" onclick="goWork()">
267
+ <span class="basket">🧺</span>
268
+ <div class="big">Suelte su archivo aquí</div>
269
+ <div class="sub">o toque para buscarlo en su computadora · Excel o CSV</div>
270
+ <div class="filechip"><span class="dot"></span> ventas-del-mes.xlsx · listo para revisar</div>
271
+ </div>
272
+
273
+ <button class="btn btn-primary" onclick="goWork()">Ordénalo por mí</button>
274
+
275
+ <div class="safety">
276
+ <svg width="22" height="22" viewBox="0 0 24 24" fill="none"><path d="M12 2l8 3v6c0 5-3.5 8-8 9-4.5-1-8-4-8-9V5l8-3z" stroke="#5e8470" stroke-width="1.6"/><path d="M9 12l2 2 4-4" stroke="#5e8470" stroke-width="1.6" stroke-linecap="round" stroke-linejoin="round"/></svg>
277
+ <div><b>Su original se queda igualito.</b> Hago una copia limpia aparte. Nada sale de esta computadora.</div>
278
+ </div>
279
+ </div>
280
+ </section>
281
+
282
+ <!-- ============ SCREEN 2 : WORKING ============ -->
283
+ <section class="screen" id="s2">
284
+ <div class="card working">
285
+ <div class="breath"><span>🍃</span></div>
286
+ <h1>Trabajando aquí mismo, en su computadora…</h1>
287
+ <p class="lede" style="margin:8px auto 0;">Respire tranquila. Su original está a salvo. Esto toma un momentito.</p>
288
+ <ul class="steps" id="steps">
289
+ <li data-i="0"><span class="tick">✓</span> Leyendo su lista con cuidado</li>
290
+ <li data-i="1"><span class="tick">✓</span> Juntando los tacos que están escritos de varias formas</li>
291
+ <li data-i="2"><span class="tick">✓</span> Revisando teléfonos, fechas y espacios en blanco</li>
292
+ <li data-i="3"><span class="tick">✓</span> Apuntando lo que no estoy segura, para preguntarle</li>
293
+ </ul>
294
+ </div>
295
+ </section>
296
+
297
+ <!-- ============ SCREEN 3 : RESULT ============ -->
298
+ <section class="screen" id="s3">
299
+ <div class="card">
300
+ <div class="result-head">
301
+ <svg class="badge" viewBox="0 0 64 64" fill="none">
302
+ <circle cx="32" cy="32" r="29" fill="#fff" stroke="#d8a25e" stroke-width="2" stroke-dasharray="3 3"/>
303
+ <circle cx="32" cy="32" r="22" fill="#e8f0ea" stroke="#7ba087" stroke-width="1.6"/>
304
+ <path d="M24 33l5 5 11-12" stroke="#5e8470" stroke-width="2.6" stroke-linecap="round" stroke-linejoin="round"/>
305
+ </svg>
306
+ <div>
307
+ <div class="h-eyebrow">Listo · su resumen</div>
308
+ <h1 style="margin-top:2px;">Esto fue lo que encontré.</h1>
309
+ </div>
310
+ </div>
311
+
312
+ <div class="summary">
313
+ <p>Revisé su lista de <b>ventas-del-mes</b> con calma. Esto fue lo que arreglé:</p>
314
+ <p>· <span class="num">«Al pastor»</span> estaba escrito de 4 maneras distintas. Lo junté todo: <span class="num">1,204 vendidos</span>.</p>
315
+ <p>· <span class="num">23 personas</span> aparecían dos veces en su lista. Las reuní para que usted las mire.</p>
316
+ <p>· Puse todos los <span class="num">teléfonos</span> y las <span class="num">fechas</span> escritos igualito, fáciles de leer.</p>
317
+ <p>· <span class="num">14 espacios</span> decían «N/A» o solo un guion — los tomé como vacíos.</p>
318
+ <p style="font-family:system-ui,sans-serif;font-size:.98rem;color:var(--ink-soft);">Puede leerlo en voz alta a Yolanda o imprimirlo. Nada de esto tocó su archivo original.</p>
319
+ </div>
320
+
321
+ <!-- already-done change card (mechanical, safe) -->
322
+ <div class="section-title">Lo que ya dejé arreglado <span class="pill">hecho</span></div>
323
+
324
+ <div class="change">
325
+ <p class="lead">El mismo taco, escrito de varias formas — lo conté junto:</p>
326
+ <div class="ba">
327
+ <span class="chip">al pastor</span>
328
+ <span class="chip">Al Pastor</span>
329
+ <span class="chip">pastor</span>
330
+ <span class="chip">al pastór</span>
331
+ <span class="arrow">→</span>
332
+ <span class="chip after">Al pastor · 1,204</span>
333
+ </div>
334
+ </div>
335
+
336
+ <div class="change">
337
+ <p class="lead">Los teléfonos ahora se ven todos iguales:</p>
338
+ <div class="ba">
339
+ <span class="chip">55-1234 5678</span>
340
+ <span class="chip">5512345678</span>
341
+ <span class="arrow">→</span>
342
+ <span class="chip after">(55) 1234-5678</span>
343
+ </div>
344
+ </div>
345
+
346
+ <!-- gentle confirms (money / identity) -->
347
+ <div class="section-title">Antes de seguir, dos preguntitas <span class="pill" style="background:#f6ecd6;color:#a9742f;">usted decide</span></div>
348
+
349
+ <div class="ask">
350
+ <p class="q">Encontré <b>31 filas en $0.00</b> — parece un error del sistema.</p>
351
+ <p class="why">Si las dejo dentro, bajan su total del mes. ¿Las saco de la suma?</p>
352
+ <div class="row">
353
+ <button class="btn btn-primary" style="width:auto;margin:0;" onclick="this.closest('.ask').style.opacity=.55;this.closest('.row').innerHTML='<span style=&quot;font-family:system-ui;color:#5e8470;font-weight:600&quot;>✓ Hecho — las dejé fuera del total.</span>'">Sí, sácalas</button>
354
+ <button class="btn btn-ghost" onclick="this.closest('.row').innerHTML='<span style=&quot;font-family:system-ui;color:#6f675d&quot;>De acuerdo, las dejo en la suma.</span>'">No, déjalas</button>
355
+ </div>
356
+ </div>
357
+
358
+ <div class="ask">
359
+ <p class="q">Estas dos parecen <b>la misma persona</b>: «Yolanda R.» y «Yolanda Reyes».</p>
360
+ <p class="why">¿Las cuento como una sola, o son personas distintas?</p>
361
+ <div class="row">
362
+ <button class="btn btn-primary" style="width:auto;margin:0;" onclick="this.closest('.row').innerHTML='<span style=&quot;font-family:system-ui;color:#5e8470;font-weight:600&quot;>✓ Las junté en una.</span>'">Sí, es la misma</button>
363
+ <button class="btn btn-ghost" onclick="this.closest('.row').innerHTML='<span style=&quot;font-family:system-ui;color:#6f675d&quot;>Las dejé separadas.</span>'">Son distintas</button>
364
+ </div>
365
+ </div>
366
+
367
+ <!-- honest flags -->
368
+ <div class="section-title">No estuve segura de esto — se lo dejé a usted</div>
369
+
370
+ <div class="flag">
371
+ <span class="mark">🤔</span>
372
+ <p>Dos teléfonos tenían solo 7 dígitos. No quise inventar los que faltan.<br/>
373
+ <span class="small">Los dejé tal cual para que usted los revise contra su libreta.</span></p>
374
+ </div>
375
+ <div class="flag">
376
+ <span class="mark">🧮</span>
377
+ <p>El total de su caja dice <b>$48,920</b>, pero su lista suma <b>$48,655</b>.<br/>
378
+ <span class="small">No cuadran por $265 — aquí se lo marco para que lo compare con su efectivo.</span></p>
379
+ </div>
380
+
381
+ <!-- bonus -->
382
+ <div class="bonus">
383
+ <span class="ic">🌶️</span>
384
+ <div>
385
+ <h3>Ah, y una cosita más…</h3>
386
+ <p>Se le está acabando el <b>adobo de pastor</b> — fue el más vendido del mes. Quizá conviene pedir más antes del finde.</p>
387
+ </div>
388
+ </div>
389
+
390
+ <!-- download -->
391
+ <div class="download">
392
+ <button class="btn btn-primary">Descargar mi copia limpia</button>
393
+ <p class="aside">
394
+ Su original sigue a salvo en su USB. ·
395
+ <a onclick="alert('Su archivo original nunca se tocó — está justo donde lo dejó.')">Devolver todo como estaba</a><br/>
396
+ También puede <a onclick="window.print()">imprimir este resumen</a> para Yolanda.
397
+ </p>
398
+ </div>
399
+
400
+ <div class="safety" style="margin-top:30px;">
401
+ <svg width="22" height="22" viewBox="0 0 24 24" fill="none"><path d="M12 2l8 3v6c0 5-3.5 8-8 9-4.5-1-8-4-8-9V5l8-3z" stroke="#5e8470" stroke-width="1.6"/><path d="M9 12l2 2 4-4" stroke="#5e8470" stroke-width="1.6" stroke-linecap="round" stroke-linejoin="round"/></svg>
402
+ <div><b>Buen trabajo, Doña Lupe.</b> Su lista quedó en buen estado, y usted la revisó con sus propios ojos. Nada salió de esta computadora.</div>
403
+ </div>
404
+
405
+ <button class="reset" onclick="reset()">Empezar de nuevo con otro archivo</button>
406
+ </div>
407
+ </section>
408
+
409
+ </div>
410
+
411
+ <script>
412
// Mark the screen with the given element id as the only active one,
// then smooth-scroll the window back to the top.
function show(id){
  for (const screen of document.querySelectorAll('.screen')) {
    screen.classList.remove('active');
  }
  const target = document.getElementById(id);
  target.classList.add('active');
  window.scrollTo({top: 0, behavior: 'smooth'});
}
417
// Switch to the "working" screen and tick the step list off one item every
// 720 ms; once every item is ticked, wait 650 ms and show the result screen.
function goWork(){
  show('s2');
  const items = document.querySelectorAll('#steps li');
  for (const item of items) {
    item.classList.remove('done');
  }
  let next = 0;
  const timer = setInterval(function(){
    if (next >= items.length) {
      // All steps are marked: stop ticking and hand over to the result screen.
      clearInterval(timer);
      setTimeout(function(){ show('s3'); }, 650);
      return;
    }
    items[next].classList.add('done');
    next += 1;
  }, 720);
}
427
// Go back to the first (welcome / drop) screen.
function reset(){
  show('s1');
}
428
+ </script>
429
+ </body>
430
+ </html>
design/mockups/cozy/index.html ADDED
@@ -0,0 +1,526 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="es">
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1">
6
+ <title>ScrubData — tu ayudante de cocina para los números</title>
7
+ <style>
8
+ :root{
9
+ --paper:#fbf4e7;
10
+ --paper-2:#f5ead4;
11
+ --card:#fffaf0;
12
+ --ink:#4a3a2c;
13
+ --ink-soft:#7a6a58;
14
+ --line:#e6d6ba;
15
+ --moss:#6f8f5a;
16
+ --moss-deep:#52733f;
17
+ --berry:#c4694e;
18
+ --gold:#d9a441;
19
+ --sky:#8aa9b8;
20
+ --shadow:0 10px 30px rgba(120,90,50,.12);
21
+ --shadow-soft:0 4px 14px rgba(120,90,50,.10);
22
+ --radius:22px;
23
+ }
24
+ *{box-sizing:border-box}
25
+ html,body{margin:0}
26
+ body{
27
+ font-family:"Iowan Old Style","Palatino Linotype","Book Antiqua",Georgia,"Segoe UI",serif;
28
+ color:var(--ink);
29
+ background:
30
+ radial-gradient(circle at 15% 8%, #fdf8ee 0%, transparent 40%),
31
+ radial-gradient(circle at 90% 92%, #f6ecd6 0%, transparent 45%),
32
+ var(--paper);
33
+ line-height:1.55;
34
+ -webkit-font-smoothing:antialiased;
35
+ min-height:100vh;
36
+ }
37
+ /* faint paper grain + dotted-trail texture */
38
+ body::before{
39
+ content:"";position:fixed;inset:0;pointer-events:none;z-index:0;opacity:.5;
40
+ background-image:radial-gradient(rgba(180,150,100,.10) 1px, transparent 1.4px);
41
+ background-size:22px 22px;
42
+ }
43
+ .wrap{position:relative;z-index:1;max-width:880px;margin:0 auto;padding:28px 20px 80px}
44
+
45
+ /* ---------- top bar ---------- */
46
+ .topbar{display:flex;align-items:center;justify-content:space-between;gap:12px;margin-bottom:14px}
47
+ .brand{display:flex;align-items:center;gap:12px}
48
+ .logo{width:46px;height:46px;flex:0 0 auto}
49
+ .brand h1{font-size:1.32rem;margin:0;letter-spacing:.2px}
50
+ .brand .tag{margin:0;font-size:.86rem;color:var(--ink-soft);font-style:italic}
51
+ .lang{display:flex;background:var(--card);border:1.5px solid var(--line);border-radius:999px;padding:3px;box-shadow:var(--shadow-soft)}
52
+ .lang button{border:0;background:transparent;font:inherit;font-size:.85rem;color:var(--ink-soft);padding:5px 13px;border-radius:999px;cursor:pointer}
53
+ .lang button.on{background:var(--moss);color:#fff;box-shadow:0 2px 6px rgba(80,110,60,.35)}
54
+
55
+ /* ---------- persistent safety ribbon ---------- */
56
+ .safe{
57
+ display:flex;align-items:center;gap:12px;
58
+ background:linear-gradient(180deg,#f2f7ec,#eaf2e0);
59
+ border:1.5px solid #d6e3c4;border-radius:16px;
60
+ padding:11px 16px;margin-bottom:22px;box-shadow:var(--shadow-soft);
61
+ }
62
+ .safe svg{flex:0 0 auto}
63
+ .safe p{margin:0;font-size:.92rem;color:var(--moss-deep)}
64
+ .safe b{color:var(--moss-deep)}
65
+
66
+ /* ---------- cards / screens ---------- */
67
+ .screen{display:none;animation:rise .5s ease both}
68
+ .screen.active{display:block}
69
+ @keyframes rise{from{opacity:0;transform:translateY(14px)}to{opacity:1;transform:none}}
70
+
71
+ .card{
72
+ background:var(--card);border:1.5px solid var(--line);
73
+ border-radius:var(--radius);box-shadow:var(--shadow);
74
+ padding:30px 30px 32px;position:relative;
75
+ }
76
+ .card + .card{margin-top:20px}
77
+
78
+ /* ---------- welcome / drop ---------- */
79
+ .hello{text-align:center}
80
+ .hello h2{font-size:1.75rem;margin:6px 0 6px}
81
+ .hello .sub{color:var(--ink-soft);font-size:1.05rem;margin:0 auto 24px;max-width:520px}
82
+ .drop{
83
+ border:2.5px dashed #d8b873;border-radius:20px;
84
+ background:linear-gradient(180deg,#fffdf6,#fdf3df);
85
+ padding:38px 24px;text-align:center;cursor:pointer;transition:.2s;
86
+ }
87
+ .drop:hover{border-color:var(--gold);background:#fff8e8;transform:translateY(-2px)}
88
+ .drop .basket{font-size:0;line-height:0;margin-bottom:10px}
89
+ .drop h3{margin:8px 0 4px;font-size:1.2rem}
90
+ .drop p{margin:0;color:var(--ink-soft);font-size:.95rem}
91
+ .or{color:var(--ink-soft);font-size:.85rem;margin:14px 0 4px}
92
+ .filechip{
93
+ display:inline-flex;align-items:center;gap:10px;background:#fff;border:1.5px solid var(--line);
94
+ border-radius:14px;padding:9px 14px;margin-top:6px;font-size:.92rem;box-shadow:var(--shadow-soft)
95
+ }
96
+ .filechip .x{color:var(--ink-soft);font-size:.8rem}
97
+ .btn{
98
+ border:0;font:inherit;cursor:pointer;border-radius:16px;font-size:1.06rem;
99
+ padding:14px 30px;font-weight:600;letter-spacing:.2px;transition:.16s;
100
+ }
101
+ .btn-go{background:var(--berry);color:#fff;box-shadow:0 6px 16px rgba(196,105,78,.35);margin-top:24px}
102
+ .btn-go:hover{transform:translateY(-2px);box-shadow:0 9px 22px rgba(196,105,78,.42)}
103
+ .btn-ghost{background:#fff;color:var(--ink);border:1.5px solid var(--line)}
104
+ .btn-ghost:hover{background:#fffdf6}
105
+
106
+ /* ---------- tidying ---------- */
107
+ .tidy{text-align:center;padding:54px 30px}
108
+ .tidy h2{font-size:1.5rem;margin:18px 0 6px}
109
+ .tidy p{color:var(--ink-soft);margin:0 auto;max-width:440px}
110
+ .scene{width:160px;height:120px;margin:0 auto 6px;position:relative}
111
+ .broom{position:absolute;left:46px;top:6px;transform-origin:78px 12px;animation:sweep 1.1s ease-in-out infinite}
112
+ @keyframes sweep{0%,100%{transform:rotate(-13deg)}50%{transform:rotate(13deg)}}
113
+ .spk{position:absolute;font-size:0;animation:twinkle 1.4s ease-in-out infinite}
114
+ .spk:nth-child(2){left:24px;top:70px;animation-delay:.1s}
115
+ .spk:nth-child(3){left:120px;top:54px;animation-delay:.5s}
116
+ .spk:nth-child(4){left:70px;top:96px;animation-delay:.8s}
117
+ @keyframes twinkle{0%,100%{opacity:.2;transform:scale(.7)}50%{opacity:1;transform:scale(1.1)}}
118
+ .bar{height:12px;background:#efe2c8;border-radius:99px;overflow:hidden;max-width:340px;margin:22px auto 0;border:1px solid var(--line)}
119
+ .bar i{display:block;height:100%;width:0;background:linear-gradient(90deg,var(--moss),var(--gold));border-radius:99px;animation:fill 4.2s ease forwards}
120
+ @keyframes fill{to{width:100%}}
121
+ .tidy .micro{font-size:.85rem;color:var(--moss-deep);margin-top:14px}
122
+
123
+ /* ---------- result ---------- */
124
+ .result-head{text-align:center;margin-bottom:6px}
125
+ .badge-row{display:flex;justify-content:center;gap:10px;margin-bottom:8px}
126
+ .merit{display:flex;flex-direction:column;align-items:center;gap:4px;font-size:.72rem;color:var(--moss-deep);width:84px;text-align:center}
127
+ .result-head h2{font-size:1.6rem;margin:6px 0 2px}
128
+ .result-head .sub{color:var(--ink-soft);margin:0 0 6px}
129
+
130
+ .summary{background:linear-gradient(180deg,#fffdf6,#fbf3e0);border:1.5px solid var(--line)}
131
+ .summary h3{margin:0 0 4px;font-size:1.22rem}
132
+ .summary .read{font-size:.82rem;color:var(--ink-soft);font-style:italic;margin:0 0 14px}
133
+ .sline{display:flex;gap:13px;align-items:flex-start;padding:11px 0;border-top:1px dotted var(--line)}
134
+ .sline:first-of-type{border-top:0}
135
+ .sline .ic{flex:0 0 auto;margin-top:2px}
136
+ .sline p{margin:0;font-size:1.02rem}
137
+ .sline b{color:var(--moss-deep)}
138
+
139
+ .secttitle{font-size:1.05rem;color:var(--ink-soft);margin:26px 4px 10px;display:flex;align-items:center;gap:8px;font-style:italic}
140
+
141
+ /* change cards */
142
+ .chg{padding:18px 20px}
143
+ .chg.done{border-left:6px solid var(--moss)}
144
+ .chg.ask{border-left:6px solid var(--gold);background:linear-gradient(180deg,#fffdf3,#fdf6e2)}
145
+ .chg.flag{border-left:6px solid var(--sky)}
146
+ .chg h4{margin:0 0 10px;font-size:1.08rem;display:flex;align-items:center;gap:9px}
147
+ .chk{font-size:.72rem;background:#eaf2e0;color:var(--moss-deep);padding:2px 9px;border-radius:99px;font-weight:600;letter-spacing:.3px}
148
+ .pill-ask{font-size:.72rem;background:#f7ead0;color:#9a7a2e;padding:2px 9px;border-radius:99px;font-weight:600;letter-spacing:.3px}
149
+ .pill-flag{font-size:.72rem;background:#e3edf2;color:#5b7d8c;padding:2px 9px;border-radius:99px;font-weight:600;letter-spacing:.3px}
150
+
151
+ .ba{display:flex;gap:12px;align-items:stretch;flex-wrap:wrap}
152
+ .ba .col{flex:1 1 200px;border:1.5px solid var(--line);border-radius:14px;overflow:hidden;background:#fff}
153
+ .ba .col .ttl{font-size:.74rem;letter-spacing:.6px;text-transform:uppercase;color:var(--ink-soft);padding:7px 12px;background:#faf4e6;border-bottom:1px solid var(--line)}
154
+ .ba .col.after .ttl{background:#eef5e6;color:var(--moss-deep)}
155
+ .row{display:flex;justify-content:space-between;gap:10px;padding:7px 12px;font-size:.95rem;border-top:1px dashed #efe6d2}
156
+ .row:first-of-type{border-top:0}
157
+ .row .q{color:var(--ink-soft)}
158
+ .ba .col.before .was{color:#a9947d}
159
+ .ba .col.after .now{color:var(--moss-deep);font-weight:600}
160
+ .arrow{display:flex;align-items:center;color:var(--gold);font-size:1.3rem}
161
+ @media(max-width:560px){.arrow{transform:rotate(90deg)}}
162
+
163
+ .askbtns{display:flex;gap:10px;margin-top:14px;flex-wrap:wrap}
164
+ .askbtns .yes{background:var(--moss);color:#fff;padding:9px 18px;border-radius:13px;border:0;font:inherit;font-weight:600;cursor:pointer}
165
+ .askbtns .no{background:#fff;color:var(--ink);border:1.5px solid var(--line);padding:9px 18px;border-radius:13px;font:inherit;cursor:pointer}
166
+ .askbtns .yes:hover{background:var(--moss-deep)}
167
+ .answered{display:none;align-items:center;gap:8px;color:var(--moss-deep);font-size:.92rem;margin-top:12px;background:#eef5e6;padding:8px 12px;border-radius:11px}
168
+
169
+ /* bonus card */
170
+ .bonus{background:linear-gradient(135deg,#fdf6e6,#f6efe0);border:1.5px solid #ecd9b0}
171
+ .bonus h4{margin:0 0 6px;font-size:1.12rem;display:flex;align-items:center;gap:9px}
172
+ .bonus ul{margin:8px 0 0;padding-left:4px;list-style:none}
173
+ .bonus li{padding:5px 0;font-size:1rem;display:flex;gap:9px;align-items:center}
174
+ .bonus li .dot{width:9px;height:9px;border-radius:99px;background:var(--berry);flex:0 0 auto}
175
+
176
+ /* download footer */
177
+ .getit{text-align:center;background:linear-gradient(180deg,#f2f7ec,#e9f1de);border:1.5px solid #d6e3c4}
178
+ .getit h3{margin:0 0 4px;font-size:1.3rem;color:var(--moss-deep)}
179
+ .getit p{margin:0 0 18px;color:var(--ink-soft)}
180
+ .getit .btns{display:flex;gap:12px;justify-content:center;flex-wrap:wrap}
181
+ .btn-dl{background:var(--moss);color:#fff;box-shadow:0 6px 16px rgba(80,110,60,.32)}
182
+ .btn-dl:hover{transform:translateY(-2px)}
183
+ .undo{margin-top:18px;font-size:.9rem;color:var(--moss-deep)}
184
+ .undo a{color:var(--berry);text-decoration:underline;cursor:pointer}
185
+
186
+ .restart{display:block;margin:26px auto 0;color:var(--ink-soft);background:none;border:0;font:inherit;font-size:.85rem;text-decoration:underline;cursor:pointer}
187
+ .footnote{text-align:center;color:var(--ink-soft);font-size:.8rem;margin-top:30px;font-style:italic}
188
+ .es{display:none}
189
+ body.es-on .en{display:none}
190
+ body.es-on .es{display:inline}
191
+ body.es-on .es.block{display:block}
192
+ </style>
193
+ </head>
194
+ <body class="es-on">
195
+ <div class="wrap">
196
+
197
+ <!-- top bar -->
198
+ <div class="topbar">
199
+ <div class="brand">
200
+ <svg class="logo" viewBox="0 0 48 48" fill="none">
201
+ <path d="M24 4c7 0 12 4 12 4s-2 8-2 14c0 9-5 18-10 18S14 31 14 22c0-6-2-14-2-14s5-4 12-4z" fill="#7e9f63" stroke="#52733f" stroke-width="1.6"/>
202
+ <path d="M24 9v28" stroke="#52733f" stroke-width="1.4"/>
203
+ <path d="M24 18l6-5M24 24l-6-5M24 30l6-5" stroke="#52733f" stroke-width="1.3"/>
204
+ </svg>
205
+ <div>
206
+ <h1>ScrubData</h1>
207
+ <p class="tag"><span class="es">tu ayudante para ordenar tus listas</span><span class="en">your little helper for tidy lists</span></p>
208
+ </div>
209
+ </div>
210
+ <div class="lang">
211
+ <button id="bES" class="on" onclick="setLang('es')">Español</button>
212
+ <button id="bEN" onclick="setLang('en')">English</button>
213
+ </div>
214
+ </div>
215
+
216
+ <!-- persistent safety ribbon -->
217
+ <div class="safe">
218
+ <svg width="26" height="26" viewBox="0 0 24 24" fill="none"><path d="M12 2l8 3v6c0 5-3.5 9-8 11C7.5 20 4 16 4 11V5l8-3z" fill="#cfe0bd" stroke="#52733f" stroke-width="1.4"/><path d="M8.5 12l2.5 2.5L16 9" stroke="#52733f" stroke-width="1.7" stroke-linecap="round" stroke-linejoin="round"/></svg>
219
+ <p>
220
+ <span class="es"><b>Tu archivo original queda igualito.</b> Nada sale de esta computadora — todo se hace aquí mismo.</span>
221
+ <span class="en"><b>Your original stays exactly as it is.</b> Nothing leaves this computer — it all happens right here.</span>
222
+ </p>
223
+ </div>
224
+
225
+ <!-- ===================== SCREEN 1: WELCOME ===================== -->
226
+ <section id="s1" class="screen active">
227
+ <div class="card hello">
228
+ <div style="font-size:0;line-height:0">
229
+ <svg width="86" height="74" viewBox="0 0 86 74" fill="none" style="margin:0 auto">
230
+ <ellipse cx="43" cy="64" rx="30" ry="6" fill="#ead9b9"/>
231
+ <path d="M16 40h54l-5 22a4 4 0 0 1-4 3H25a4 4 0 0 1-4-3L16 40z" fill="#e7b86b" stroke="#b9863a" stroke-width="1.6"/>
232
+ <path d="M16 40h54" stroke="#b9863a" stroke-width="1.6"/>
233
+ <path d="M22 40c0-12 9-21 21-21s21 9 21 21" stroke="#b9863a" stroke-width="1.6" fill="#f3d9a3"/>
234
+ <circle cx="34" cy="33" r="3" fill="#c4694e"/><circle cx="50" cy="31" r="3" fill="#6f8f5a"/><circle cx="43" cy="36" r="3" fill="#d9a441"/>
235
+ </svg>
236
+ </div>
237
+ <h2>
238
+ <span class="es">Hola, Doña Lupe. ¿Le ayudo con su lista?</span>
239
+ <span class="en">Hi, Lupe. Want a hand with your list?</span>
240
+ </h2>
241
+ <p class="sub">
242
+ <span class="es">Suéltela aquí y yo la reviso con calma — sin botones raros ni cosas que configurar.</span>
243
+ <span class="en">Drop it here and I'll look it over, calmly — no strange buttons, nothing to set up.</span>
244
+ </p>
245
+
246
+ <div class="drop" onclick="pick()">
247
+ <div class="basket">
248
+ <svg width="58" height="50" viewBox="0 0 58 50" fill="none" style="margin:0 auto">
249
+ <path d="M6 22h46l-4 22a3 3 0 0 1-3 3H13a3 3 0 0 1-3-3L6 22z" fill="#f3e3c2" stroke="#c79a52" stroke-width="1.6"/>
250
+ <path d="M6 22h46M16 22l3 25M40 22l-3 25M29 22v25" stroke="#c79a52" stroke-width="1.2"/>
251
+ <path d="M16 22c0-9 6-15 13-15s13 6 13 15" stroke="#c79a52" stroke-width="1.6"/>
252
+ </svg>
253
+ </div>
254
+ <h3><span class="es">Suelte su archivo aquí</span><span class="en">Drop your file here</span></h3>
255
+ <p><span class="es">Excel o CSV — yo me encargo del resto.</span><span class="en">Excel or CSV — I'll handle the rest.</span></p>
256
+ <p class="or"><span class="es">— o —</span><span class="en">— or —</span></p>
257
+ <span class="filechip">
258
+ <svg width="16" height="16" viewBox="0 0 24 24" fill="none"><path d="M6 3h8l5 5v13a1 1 0 0 1-1 1H6a1 1 0 0 1-1-1V4a1 1 0 0 1 1-1z" fill="#f3e3c2" stroke="#c79a52" stroke-width="1.4"/></svg>
259
+ <span class="es">elegir de mi computadora</span><span class="en">choose from my computer</span>
260
+ </span>
261
+ </div>
262
+
263
+ <div style="margin-top:8px">
264
+ <span class="filechip" style="border-color:#cfe0bd;background:#f2f7ec">
265
+ <svg width="15" height="15" viewBox="0 0 24 24"><circle cx="12" cy="12" r="10" fill="#cfe0bd"/><path d="M8 12l3 3 5-6" stroke="#52733f" stroke-width="2" fill="none" stroke-linecap="round"/></svg>
266
+ ventas-mayo.xlsx <span class="x">· 1,431 <span class="es">renglones</span><span class="en">lines</span></span>
267
+ </span>
268
+ </div>
269
+
270
+ <button class="btn btn-go" onclick="go()">
271
+ <span class="es">Vamos a ordenarla ✦</span><span class="en">Let's tidy it up ✦</span>
272
+ </button>
273
+ </div>
274
+ </section>
275
+
276
+ <!-- ===================== SCREEN 2: TIDYING ===================== -->
277
+ <section id="s2" class="screen">
278
+ <div class="card tidy">
279
+ <div class="scene">
280
+ <span class="spk"><svg width="14" height="14" viewBox="0 0 24 24"><path d="M12 2l2 8 8 2-8 2-2 8-2-8-8-2 8-2 2-8z" fill="#d9a441"/></svg></span>
281
+ <span class="spk"><svg width="11" height="11" viewBox="0 0 24 24"><path d="M12 2l2 8 8 2-8 2-2 8-2-8-8-2 8-2 2-8z" fill="#6f8f5a"/></svg></span>
282
+ <span class="spk"><svg width="13" height="13" viewBox="0 0 24 24"><path d="M12 2l2 8 8 2-8 2-2 8-2-8-8-2 8-2 2-8z" fill="#c4694e"/></svg></span>
283
+ <div class="broom">
284
+ <svg width="64" height="110" viewBox="0 0 64 110" fill="none">
285
+ <rect x="30" y="2" width="5" height="64" rx="2.5" fill="#b9863a"/>
286
+ <path d="M18 64h28l6 38c0 3-3 4-6 4H18c-3 0-6-1-6-4l6-38z" fill="#e7b86b" stroke="#b9863a" stroke-width="1.6"/>
287
+ <path d="M22 78v24M30 78v26M38 78v24M46 78v22" stroke="#b9863a" stroke-width="1.3"/>
288
+ </svg>
289
+ </div>
290
+ </div>
291
+ <h2><span class="es">Ordenando con cuidado…</span><span class="en">Tidying up, gently…</span></h2>
292
+ <p>
293
+ <span class="es">Estoy aquí mismo en su computadora, sin prisas. Su archivo original sigue a salvo.</span>
294
+ <span class="en">I'm right here on your computer, taking my time. Your original is safe.</span>
295
+ </p>
296
+ <div class="bar"><i></i></div>
297
+ <p class="micro" id="step">
298
+ <span class="es">Juntando los tacos que están escritos de varias maneras…</span>
299
+ <span class="en">Gathering the items written a few different ways…</span>
300
+ </p>
301
+ </div>
302
+ </section>
303
+
304
+ <!-- ===================== SCREEN 3: RESULT ===================== -->
305
+ <section id="s3" class="screen">
306
+
307
+ <!-- merit + hero -->
308
+ <div class="card result-head">
309
+ <div class="badge-row">
310
+ <div class="merit">
311
+ <svg width="52" height="52" viewBox="0 0 52 52"><circle cx="26" cy="26" r="22" fill="#eef5e6" stroke="#6f8f5a" stroke-width="2"/><path d="M26 6l4 5 6-2-1 6 6 3-5 4 2 6-6-1-3 6-3-6-6 1 2-6-5-4 6-3-1-6 6 2 4-5z" fill="#cfe0bd"/><path d="M20 26l4 4 8-9" stroke="#52733f" stroke-width="2.4" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg>
312
+ <span><span class="es">lista ordenada</span><span class="en">tidy list</span></span>
313
+ </div>
314
+ <div class="merit">
315
+ <svg width="52" height="52" viewBox="0 0 52 52"><circle cx="26" cy="26" r="22" fill="#fdf2dc" stroke="#d9a441" stroke-width="2"/><path d="M26 14a8 8 0 0 1 8 8c0 5-8 12-8 12s-8-7-8-12a8 8 0 0 1 8-8z" fill="#f3d9a3" stroke="#b9863a" stroke-width="1.4"/><circle cx="26" cy="22" r="3" fill="#c4694e"/></svg>
316
+ <span><span class="es">nada se subió</span><span class="en">nothing uploaded</span></span>
317
+ </div>
318
+ </div>
319
+ <h2><span class="es">Listo. Esto fue lo que encontré 🌿</span><span class="en">All done. Here's what I found 🌿</span></h2>
320
+ <p class="sub"><span class="es">Léalo con calma. Usted decide lo que toca el dinero.</span><span class="en">Read it calmly. You decide anything that touches money.</span></p>
321
+ </div>
322
+
323
+ <!-- THE SUMMARY (hero) -->
324
+ <div class="card summary">
325
+ <h3><span class="es">Su resumen, en palabras sencillas</span><span class="en">Your summary, in plain words</span></h3>
326
+ <p class="read"><span class="es">— puede leerlo en voz alta a Yolanda, o imprimirlo.</span><span class="en">— you can read it aloud to Yolanda, or print it.</span></p>
327
+
328
+ <div class="sline">
329
+ <span class="ic"><svg width="22" height="22" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#eef5e6"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg></span>
330
+ <p class="es">El <b>al pastor</b> estaba escrito de <b>4 maneras</b> (al pastor, Al Pastor, pastor, "al pstr"). Los junté todos: <b>1,204 vendidos</b>.</p>
331
+ <p class="en"><b>Al pastor</b> was written <b>4 ways</b> (al pastor, Al Pastor, pastor, "al pstr"). I counted them together: <b>1,204 sold</b>.</p>
332
+ </div>
333
+ <div class="sline">
334
+ <span class="ic"><svg width="22" height="22" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#eef5e6"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg></span>
335
+ <p class="es">Unos espacios estaban en blanco (escritos como <b>"N/A"</b> o solo una raya). Los traté como vacíos.</p>
336
+ <p class="en">Some spots were left blank (written as <b>"N/A"</b> or just a dash). I treated those as empty.</p>
337
+ </div>
338
+ <div class="sline">
339
+ <span class="ic"><svg width="22" height="22" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#eef5e6"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg></span>
340
+ <p class="es">Puse todos los <b>teléfonos</b> y las <b>fechas</b> igualitos, para que se lean fácil.</p>
341
+ <p class="en">I made all the <b>phone numbers</b> and <b>dates</b> match, so they're easy to read.</p>
342
+ </div>
343
+ <div class="sline">
344
+ <span class="ic"><svg width="22" height="22" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#fdf2dc"/><path d="M12 6v7M12 16.5v.5" stroke="#b9863a" stroke-width="2.2" fill="none" stroke-linecap="round"/></svg></span>
345
+ <p class="es">Hay <b>2 cositas</b> que prefiero <b>preguntarle</b> antes de tocar — porque tienen que ver con dinero. Están abajo. 👇</p>
346
+ <p class="en">There are <b>2 things</b> I'd rather <b>ask you</b> about before touching — because they involve money. They're below. 👇</p>
347
+ </div>
348
+ </div>
349
+
350
+ <!-- DONE change card with before/after -->
351
+ <div class="secttitle">
352
+ <svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="10" fill="#cfe0bd"/><path d="M8 12l3 3 5-6" stroke="#52733f" stroke-width="2" fill="none" stroke-linecap="round"/></svg>
353
+ <span class="es">Lo que ya dejé arregladito</span><span class="en">What I already tidied for you</span>
354
+ </div>
355
+
356
+ <div class="card chg done">
357
+ <h4><span class="es">El mismo taco, contado junto</span><span class="en">The same taco, counted together</span> <span class="chk"><span class="es">YA HECHO</span><span class="en">DONE</span></span></h4>
358
+ <div class="ba">
359
+ <div class="col before">
360
+ <div class="ttl"><span class="es">Antes</span><span class="en">Before</span></div>
361
+ <div class="row"><span class="q was">al pastor</span><span class="was">312</span></div>
362
+ <div class="row"><span class="q was">Al Pastor</span><span class="was">520</span></div>
363
+ <div class="row"><span class="q was">pastor</span><span class="was">301</span></div>
364
+ <div class="row"><span class="q was">al pstr</span><span class="was">71</span></div>
365
+ </div>
366
+ <div class="arrow">➜</div>
367
+ <div class="col after">
368
+ <div class="ttl"><span class="es">Después</span><span class="en">After</span></div>
369
+ <div class="row"><span class="q">Al pastor</span><span class="now">1,204</span></div>
370
+ <div class="row" style="color:var(--ink-soft)"><span class="q" style="font-style:italic"><span class="es">una sola fila, bien clara</span><span class="en">one tidy line</span></span><span></span></div>
371
+ </div>
372
+ </div>
373
+ </div>
374
+
375
+ <div class="card chg done">
376
+ <h4><span class="es">Los blancos disfrazados</span><span class="en">The disguised blanks</span> <span class="chk"><span class="es">YA HECHO</span><span class="en">DONE</span></span></h4>
377
+ <div class="ba">
378
+ <div class="col before">
379
+ <div class="ttl"><span class="es">Antes</span><span class="en">Before</span></div>
380
+ <div class="row"><span class="q was">tel.</span><span class="was">N/A</span></div>
381
+ <div class="row"><span class="q was">notas</span><span class="was">—</span></div>
382
+ <div class="row"><span class="q was">extra</span><span class="was">none</span></div>
383
+ </div>
384
+ <div class="arrow">➜</div>
385
+ <div class="col after">
386
+ <div class="ttl"><span class="es">Después</span><span class="en">After</span></div>
387
+ <div class="row"><span class="q">tel.</span><span class="now"><span class="es">(vacío)</span><span class="en">(empty)</span></span></div>
388
+ <div class="row"><span class="q">notas</span><span class="now"><span class="es">(vacío)</span><span class="en">(empty)</span></span></div>
389
+ <div class="row"><span class="q">extra</span><span class="now"><span class="es">(vacío)</span><span class="en">(empty)</span></span></div>
390
+ </div>
391
+ </div>
392
+ </div>
393
+
394
+ <!-- ASK cards (money / identity) -->
395
+ <div class="secttitle">
396
+ <svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="10" fill="#f3d9a3"/><path d="M12 7v6M12 16v.5" stroke="#b9863a" stroke-width="2" fill="none" stroke-linecap="round"/></svg>
397
+ <span class="es">Aquí mejor le pregunto a usted</span><span class="en">Here I'd better ask you</span>
398
+ </div>
399
+
400
+ <div class="card chg ask">
401
+ <h4><span class="es">31 renglones marcaron $0.00</span><span class="en">31 lines showed $0.00</span> <span class="pill-ask"><span class="es">¿ME DICE?</span><span class="en">YOUR CALL</span></span></h4>
402
+ <p style="margin:0 0 4px">
403
+ <span class="es">Encontré <b>31 ventas en $0.00</b> — eso casi siempre es una falla de la caja, no una venta de verdad. ¿Quiere que las <b>deje fuera del total</b> del mes?</span>
404
+ <span class="en">I found <b>31 sales at $0.00</b> — that's usually a register glitch, not a real sale. Want me to <b>leave them out of the month's total</b>?</span>
405
+ </p>
406
+ <div class="askbtns">
407
+ <button class="yes" onclick="answer(this)"><span class="es">Sí, déjalas fuera</span><span class="en">Yes, leave them out</span></button>
408
+ <button class="no" onclick="answer(this)"><span class="es">No, déjalas</span><span class="en">No, keep them</span></button>
409
+ </div>
410
+ <div class="answered">
411
+ <svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#cfe0bd"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round"/></svg>
412
+ <span class="es">Listo — usted decidió. Lo anoté en su resumen.</span><span class="en">Done — your call. I noted it in your summary.</span>
413
+ </div>
414
+ </div>
415
+
416
+ <div class="card chg ask">
417
+ <h4><span class="es">Dos clientes parecen el mismo</span><span class="en">Two customers look like the same one</span> <span class="pill-ask"><span class="es">¿ME DICE?</span><span class="en">YOUR CALL</span></span></h4>
418
+ <p style="margin:0 0 4px">
419
+ <span class="es"><b>"Yolanda R."</b> y <b>"Yolanda Reyes"</b> tienen el mismo teléfono. ¿Los <b>cuento como una sola persona</b>?</span>
420
+ <span class="en"><b>"Yolanda R."</b> and <b>"Yolanda Reyes"</b> share the same phone. Should I <b>count them as one person</b>?</span>
421
+ </p>
422
+ <div class="askbtns">
423
+ <button class="yes" onclick="answer(this)"><span class="es">Sí, es la misma</span><span class="en">Yes, same person</span></button>
424
+ <button class="no" onclick="answer(this)"><span class="es">No, son distintas</span><span class="en">No, keep both</span></button>
425
+ </div>
426
+ <div class="answered">
427
+ <svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#cfe0bd"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round"/></svg>
428
+ <span class="es">Listo — usted decidió.</span><span class="en">Done — your call.</span>
429
+ </div>
430
+ </div>
431
+
432
+ <!-- HONEST FLAGS -->
433
+ <div class="secttitle">
434
+ <svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="10" fill="#e3edf2"/><path d="M12 7v5M12 15v.5" stroke="#5b7d8c" stroke-width="2" fill="none" stroke-linecap="round"/></svg>
435
+ <span class="es">No estuve segura de esto — lo dejé para usted</span><span class="en">I wasn't sure about these — I left them for you</span>
436
+ </div>
437
+ <div class="card chg flag">
438
+ <h4><span class="es">Dos teléfonos raros y una nota de catering</span><span class="en">Two odd phones and a catering note</span> <span class="pill-flag"><span class="es">PARA REVISAR</span><span class="en">FOR YOU</span></span></h4>
439
+ <p style="margin:0">
440
+ <span class="es">Dos teléfonos tienen muy pocos números, y una nota dice "evento — preguntar a Memo". No quise adivinar, así que <b>los dejé tal cual</b> para que usted los vea con calma.</span>
441
+ <span class="en">Two phones have too few digits, and one note says "event — ask Memo." I didn't want to guess, so I <b>left them exactly as they were</b> for you to peek at.</span>
442
+ </p>
443
+ </div>
444
+
445
+ <!-- BONUS -->
446
+ <div class="card bonus">
447
+ <h4>
448
+ <svg width="24" height="24" viewBox="0 0 24 24"><path d="M5 9h14l-1.3 9.2A2 2 0 0 1 15.7 20H8.3a2 2 0 0 1-2-1.8L5 9z" fill="#f3d9a3" stroke="#b9863a" stroke-width="1.3"/><path d="M8 9a4 4 0 0 1 8 0" stroke="#b9863a" stroke-width="1.3" fill="none"/></svg>
449
+ <span class="es">De pilón: lo que se le está acabando</span><span class="en">A little bonus: what you're running low on</span>
450
+ </h4>
451
+ <p style="margin:0;color:var(--ink-soft)">
452
+ <span class="es">Ya que andábamos en sus números, le aparté esto para el pedido:</span>
453
+ <span class="en">While I was in your numbers, I set this aside for your reorder:</span>
454
+ </p>
455
+ <ul>
456
+ <li><span class="dot"></span><span class="es"><b>Marinada de pastor</b> — para ~3 días. Tal vez pedir el lunes.</span><span class="en"><b>Pastor marinade</b> — about 3 days left. Maybe order Monday.</span></li>
457
+ <li><span class="dot"></span><span class="es"><b>Tortillas</b> — bajando rápido este fin de semana.</span><span class="en"><b>Tortillas</b> — going fast this weekend.</span></li>
458
+ </ul>
459
+ </div>
460
+
461
+ <!-- GET MY CLEAN COPY -->
462
+ <div class="card getit">
463
+ <svg width="58" height="58" viewBox="0 0 58 58" style="margin:0 auto 6px"><circle cx="29" cy="29" r="26" fill="#dcebcb" stroke="#6f8f5a" stroke-width="2"/><path d="M29 16v18M22 28l7 7 7-7" stroke="#52733f" stroke-width="3" fill="none" stroke-linecap="round" stroke-linejoin="round"/><path d="M19 40h20" stroke="#52733f" stroke-width="3" stroke-linecap="round"/></svg>
464
+ <h3><span class="es">¡Quedó preciosa, Doña Lupe!</span><span class="en">It looks lovely, Lupe!</span></h3>
465
+ <p><span class="es">Aquí está su copia limpia y su resumen para imprimir o mandar por correo.</span><span class="en">Here's your clean copy and your summary to print or email.</span></p>
466
+ <div class="btns">
467
+ <button class="btn btn-dl"><span class="es">Bajar mi copia limpia</span><span class="en">Get my clean copy</span></button>
468
+ <button class="btn btn-ghost"><span class="es">Imprimir el resumen</span><span class="en">Print the summary</span></button>
469
+ </div>
470
+ <p class="undo">
471
+ <svg width="15" height="15" viewBox="0 0 24 24" style="vertical-align:-2px"><path d="M12 5V2L7 7l5 5V8a6 6 0 1 1-6 6" stroke="#52733f" stroke-width="1.8" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg>
472
+ <span class="es">Su archivo original sigue a salvo. <a>Déjelo como estaba</a> cuando quiera.</span>
473
+ <span class="en">Your original is safe. <a>Put it back the way it was</a> any time.</span>
474
+ </p>
475
+ </div>
476
+
477
+ <button class="restart" onclick="reset()"><span class="es">↺ empezar de nuevo con otro archivo</span><span class="en">↺ start over with another file</span></button>
478
+ </section>
479
+
480
+ <p class="footnote">
481
+ <span class="es">Hecho con cariño para una hora tranquila en la mesa de la cocina · funciona sin internet</span>
482
+ <span class="en">Made with care for a quiet hour at the kitchen table · works without internet</span>
483
+ </p>
484
+ </div>
485
+
486
+ <script>
487
/**
 * Switch the mockup's visible language.
 *
 * Toggles the `es-on` class on <body> (the stylesheet shows `.es` spans and
 * hides `.en` spans while it is present), moves the `on` class to the
 * matching language button, and keeps the document's `lang` attribute in sync.
 *
 * @param {string} l  Language code. 'es' selects Spanish; any other value
 *                    removes `es-on` and marks the English button as active.
 */
function setLang(l){
  const wantsSpanish = (l === 'es');

  document.body.classList.toggle('es-on', wantsSpanish);

  const spanishButton = document.getElementById('bES');
  spanishButton.classList.toggle('on', wantsSpanish);

  const englishButton = document.getElementById('bEN');
  englishButton.classList.toggle('on', !wantsSpanish);

  // The raw code is written through unchanged, whatever value was passed.
  document.documentElement.lang = l;
}
493
/**
 * Move the `active` class to a single screen and scroll back to the top.
 *
 * @param {string} id  Element id of the screen that should become active.
 */
function show(id){
  // Clear the marker from every screen first so only one can hold it.
  for (const screen of document.querySelectorAll('.screen')) {
    screen.classList.remove('active');
  }

  const target = document.getElementById(id);
  target.classList.add('active');

  window.scrollTo({top:0,behavior:'smooth'});
}
498
/**
 * Click handler for the drop zone.
 *
 * Deliberately does nothing in this mockup: the page already displays a
 * file as chosen, so there is no picker to open.
 */
function pick(){
  /* mock: file already shown as chosen */
  return;
}
499
/**
 * Play the mock "tidying" sequence.
 *
 * Shows screen 2, steps the status line (#step) through four messages at
 * 1050 ms intervals, then switches to the results screen after 4400 ms.
 * The message language is picked once, from the <body> class, at the moment
 * the sequence starts.
 */
function go(){
  show('s2');

  const isES = document.body.classList.contains('es-on');
  const steps = isES ? [
    'Juntando los tacos que están escritos de varias maneras…',
    'Emparejando los teléfonos y las fechas…',
    'Buscando blancos disfrazados como "N/A" o una raya…',
    'Apartando lo que mejor le pregunto a usted…'
  ] : [
    'Gathering the items written a few different ways…',
    'Matching up the phone numbers and dates…',
    'Looking for blanks disguised as "N/A" or a dash…',
    'Setting aside the things I should ask you about…'
  ];

  const el = document.getElementById('step');

  // Always start from the first message. Writing textContent replaces the
  // bilingual spans that #step ships with, so a previous run leaves its
  // final message behind; without this reset a second pass (after reset())
  // would open on the last step, possibly in the other language.
  let i = 0;
  el.textContent = steps[i];

  const t = setInterval(()=>{
    i++;
    if(i < steps.length){
      el.textContent = steps[i];
    } else {
      // Every message has been shown; no reason to keep ticking.
      clearInterval(t);
    }
  }, 1050);

  // 4400 ms lets the 4.2 s progress-bar fill animation finish before the
  // results appear. Clearing the interval again here is a harmless safeguard.
  setTimeout(()=>{ clearInterval(t); show('s3'); }, 4400);
}
518
/**
 * Record that a question card has been answered.
 *
 * Finds the enclosing `.chg` card of the clicked button, hides its
 * `.askbtns` row and reveals its `.answered` confirmation line. Which of the
 * two buttons was clicked is not inspected.
 *
 * @param {Element} btn  The button that was clicked.
 */
function answer(btn){
  const card = btn.closest('.chg');

  const buttonRow = card.querySelector('.askbtns');
  buttonRow.style.display = 'none';

  const confirmation = card.querySelector('.answered');
  confirmation.style.display = 'flex';
}
523
/**
 * Start over: return to the welcome screen (`s1`).
 */
function reset(){
  show('s1');
}
524
+ </script>
525
+ </body>
526
+ </html>
design/mockups/helper/index.html ADDED
@@ -0,0 +1,517 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="es">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>ScrubData — tu ayudante de listas</title>
7
+ <style>
8
+ :root{
9
+ --paper:#fbf4e7;
10
+ --paper-2:#f4e9d4;
11
+ --card:#fffdf8;
12
+ --ink:#4a3b2e;
13
+ --ink-soft:#6f5d49;
14
+ --line:#e6d6b8;
15
+ --accent:#e07a3f; /* warm terracotta */
16
+ --accent-soft:#f6c89a;
17
+ --leaf:#6e8a5a; /* trail green */
18
+ --leaf-soft:#dfe7cf;
19
+ --sky:#8fb0c4;
20
+ --shadow:0 10px 28px rgba(120,90,50,.14);
21
+ --shadow-sm:0 4px 12px rgba(120,90,50,.10);
22
+ --radius:22px;
23
+ --font: "Segoe UI", "Helvetica Neue", system-ui, -apple-system, "Trebuchet MS", sans-serif;
24
+ }
25
+ *{box-sizing:border-box;}
26
+ html,body{margin:0;padding:0;}
27
+ body{
28
+ font-family:var(--font);
29
+ color:var(--ink);
30
+ background:
31
+ radial-gradient(1200px 600px at 80% -10%, #fdf6e8 0%, rgba(253,246,232,0) 60%),
32
+ radial-gradient(900px 500px at 0% 100%, #f6ecd6 0%, rgba(246,236,214,0) 55%),
33
+ var(--paper);
34
+ -webkit-font-smoothing:antialiased;
35
+ line-height:1.5;
36
+ min-height:100vh;
37
+ }
38
+ /* tiny hand-drawn paper texture via repeating soft dots */
39
+ body::before{
40
+ content:"";position:fixed;inset:0;pointer-events:none;z-index:0;
41
+ background-image:radial-gradient(rgba(180,150,100,.06) 1px, transparent 1px);
42
+ background-size:22px 22px;
43
+ }
44
+ .wrap{position:relative;z-index:1;max-width:880px;margin:0 auto;padding:26px 20px 80px;}
45
+
46
+ /* ---- top bar ---- */
47
+ .topbar{display:flex;align-items:center;justify-content:space-between;margin-bottom:18px;}
48
+ .brand{display:flex;align-items:center;gap:11px;font-weight:800;font-size:20px;letter-spacing:.2px;}
49
+ .brand .logo{
50
+ width:40px;height:40px;border-radius:14px;
51
+ background:linear-gradient(150deg,var(--accent),#f0a05f);
52
+ display:grid;place-items:center;color:#fff;font-size:20px;
53
+ box-shadow:var(--shadow-sm);transform:rotate(-4deg);
54
+ }
55
+ .brand small{display:block;font-weight:600;font-size:12px;color:var(--ink-soft);letter-spacing:0;}
56
+ .lang{
57
+ display:flex;background:var(--card);border:1.5px solid var(--line);
58
+ border-radius:999px;padding:4px;box-shadow:var(--shadow-sm);font-weight:700;font-size:13px;
59
+ }
60
+ .lang button{
61
+ border:0;background:transparent;color:var(--ink-soft);
62
+ padding:6px 14px;border-radius:999px;cursor:pointer;font:inherit;font-weight:700;
63
+ }
64
+ .lang button.on{background:var(--accent);color:#fff;}
65
+
66
+ /* ---- persistent safety ribbon ---- */
67
+ .safety{
68
+ display:flex;align-items:center;gap:10px;
69
+ background:var(--leaf-soft);color:#41522f;
70
+ border:1.5px solid #cdd9bb;border-radius:999px;
71
+ padding:9px 16px;font-size:14px;font-weight:600;margin-bottom:24px;
72
+ box-shadow:var(--shadow-sm);
73
+ }
74
+ .safety .dot{font-size:16px;}
75
+
76
+ /* ---- card base ---- */
77
+ .card{
78
+ background:var(--card);border:1.5px solid var(--line);
79
+ border-radius:var(--radius);box-shadow:var(--shadow);
80
+ padding:30px;margin-bottom:22px;
81
+ }
82
+ h1{font-size:30px;margin:.1em 0 .25em;line-height:1.2;}
83
+ h2{font-size:22px;margin:.1em 0 .5em;}
84
+ .lead{font-size:18px;color:var(--ink-soft);margin:0 0 6px;}
85
+
86
+ /* ---- screen toggling ---- */
87
+ .screen{display:none;}
88
+ .screen.active{display:block;animation:fade .5s ease;}
89
+ @keyframes fade{from{opacity:0;transform:translateY(8px);}to{opacity:1;transform:none;}}
90
+
91
+ /* ---- step pills ---- */
92
+ .steps{display:flex;gap:8px;justify-content:center;margin-bottom:20px;flex-wrap:wrap;}
93
+ .steps .pill{
94
+ font-size:12.5px;font-weight:700;color:var(--ink-soft);
95
+ background:var(--paper-2);border:1.5px solid var(--line);
96
+ padding:6px 13px;border-radius:999px;cursor:pointer;transition:.2s;
97
+ }
98
+ .steps .pill.on{background:var(--accent);color:#fff;border-color:var(--accent);}
99
+
100
+ /* ---- drop zone ---- */
101
+ .drop{
102
+ border:2.5px dashed var(--accent-soft);border-radius:26px;
103
+ background:linear-gradient(180deg,#fffdf7,#fdf3e2);
104
+ padding:46px 24px;text-align:center;cursor:pointer;transition:.2s;
105
+ }
106
+ .drop:hover{border-color:var(--accent);transform:translateY(-2px);box-shadow:var(--shadow);}
107
+ .drop .big{font-size:54px;line-height:1;margin-bottom:10px;}
108
+ .drop .title{font-size:21px;font-weight:800;margin-bottom:4px;}
109
+ .drop .sub{color:var(--ink-soft);font-size:15px;}
110
+ .file-chip{
111
+ display:inline-flex;align-items:center;gap:9px;margin-top:18px;
112
+ background:var(--leaf-soft);border:1.5px solid #cdd9bb;border-radius:14px;
113
+ padding:9px 15px;font-weight:700;font-size:14.5px;color:#41522f;
114
+ }
115
+
116
+ /* ---- big friendly button ---- */
117
+ .btn{
118
+ border:0;cursor:pointer;font:inherit;font-weight:800;font-size:18px;
119
+ background:linear-gradient(150deg,var(--accent),#ef9a55);color:#fff;
120
+ padding:16px 30px;border-radius:18px;box-shadow:0 8px 18px rgba(224,122,63,.30);
121
+ transition:.15s;display:inline-flex;align-items:center;gap:10px;
122
+ }
123
+ .btn:hover{transform:translateY(-2px);box-shadow:0 12px 22px rgba(224,122,63,.38);}
124
+ .btn.ghost{
125
+ background:var(--card);color:var(--ink);border:1.5px solid var(--line);
126
+ box-shadow:var(--shadow-sm);font-size:15px;padding:12px 20px;
127
+ }
128
+ .btn.ghost:hover{box-shadow:var(--shadow-sm);}
129
+ .center{text-align:center;}
130
+ .mt{margin-top:22px;}
131
+
132
+ /* ---- working state ---- */
133
+ .work{text-align:center;padding:20px 10px 6px;}
134
+ .pot{font-size:64px;display:inline-block;animation:stir 1.6s ease-in-out infinite;}
135
+ @keyframes stir{0%,100%{transform:rotate(-6deg);}50%{transform:rotate(6deg);}}
136
+ .progress{height:14px;background:var(--paper-2);border-radius:999px;overflow:hidden;margin:22px auto;max-width:430px;border:1.5px solid var(--line);}
137
+ .progress > i{display:block;height:100%;width:0;background:linear-gradient(90deg,var(--accent),var(--leaf));border-radius:999px;animation:fill 3.4s ease forwards;}
138
+ @keyframes fill{to{width:100%;}}
139
+ .work-note{color:var(--ink-soft);font-size:15px;min-height:22px;}
140
+
141
+ /* ---- summary hero ---- */
142
+ .badge-row{display:flex;align-items:center;gap:16px;flex-wrap:wrap;margin-bottom:6px;}
143
+ .merit{
144
+ width:78px;height:78px;flex:none;border-radius:50%;
145
+ background:radial-gradient(circle at 50% 35%,#fbe2c2,#f0b277);
146
+ border:3px dashed #d98b4e;display:grid;place-items:center;
147
+ color:#7a4a1f;font-size:30px;box-shadow:var(--shadow-sm);transform:rotate(-5deg);
148
+ }
149
+ .summary-list{margin:18px 0 4px;padding:0;list-style:none;display:grid;gap:12px;}
150
+ .summary-list li{
151
+ display:flex;gap:13px;align-items:flex-start;font-size:16.5px;
152
+ background:var(--paper);border:1.5px solid var(--line);border-radius:16px;padding:13px 16px;
153
+ }
154
+ .summary-list .ic{font-size:22px;flex:none;line-height:1.2;}
155
+ .summary-list b{color:var(--ink);}
156
+
157
+ /* ---- change cards (before/after) ---- */
158
+ .change{
159
+ border:1.5px solid var(--line);border-radius:18px;background:var(--card);
160
+ padding:18px 18px 16px;margin-bottom:16px;box-shadow:var(--shadow-sm);
161
+ }
162
+ .change .head{font-weight:800;font-size:17px;margin-bottom:4px;display:flex;align-items:center;gap:9px;}
163
+ .change .say{color:var(--ink-soft);font-size:14.5px;margin-bottom:14px;}
164
+ .ba{display:grid;grid-template-columns:1fr auto 1fr;gap:12px;align-items:center;}
165
+ .ba .col{background:var(--paper);border:1.5px solid var(--line);border-radius:14px;padding:12px 14px;}
166
+ .ba .lab{font-size:11.5px;font-weight:800;letter-spacing:.5px;text-transform:uppercase;color:var(--ink-soft);margin-bottom:7px;}
167
+ .ba .col.after{background:var(--leaf-soft);border-color:#cdd9bb;}
168
+ .ba .row{font-size:15px;padding:3px 0;color:var(--ink);}
169
+ .ba .row.dim{color:#a98f6e;}
170
+ .ba .arrow{font-size:26px;color:var(--accent);text-align:center;}
171
+
172
+ /* gentle confirm card */
173
+ .ask{
174
+ border:1.5px solid var(--accent-soft);background:linear-gradient(180deg,#fffaf2,#fdf1e0);
175
+ border-radius:18px;padding:18px;margin-bottom:16px;box-shadow:var(--shadow-sm);
176
+ }
177
+ .ask .q{font-weight:800;font-size:17px;margin-bottom:5px;display:flex;gap:9px;align-items:center;}
178
+ .ask .detail{color:var(--ink-soft);font-size:14.5px;margin-bottom:14px;}
179
+ .ask .actions{display:flex;gap:10px;flex-wrap:wrap;}
180
+ .yes{background:var(--leaf);color:#fff;border:0;font-weight:800;border-radius:13px;padding:11px 20px;cursor:pointer;font:inherit;font-weight:800;}
181
+ .no{background:var(--card);color:var(--ink);border:1.5px solid var(--line);font-weight:700;border-radius:13px;padding:11px 20px;cursor:pointer;font:inherit;font-weight:700;}
182
+ .answered{font-weight:800;color:var(--leaf);font-size:15px;display:none;align-items:center;gap:8px;margin-top:4px;}
183
+
184
+ /* honest flags */
185
+ .flags{background:#fcf6ea;border:1.5px dashed #e0c9a0;border-radius:18px;padding:18px;margin-bottom:16px;}
186
+ .flags .q{font-weight:800;font-size:16.5px;margin-bottom:8px;display:flex;gap:9px;align-items:center;}
187
+ .flags ul{margin:6px 0 0;padding-left:4px;list-style:none;}
188
+ .flags li{font-size:14.5px;color:var(--ink-soft);padding:6px 0;border-top:1px dashed #e7d6b6;}
189
+ .flags li:first-child{border-top:0;}
190
+
191
+ /* bonus card */
192
+ .bonus{
193
+ background:linear-gradient(150deg,#eef3e3,#e3ecd2);border:1.5px solid #cdd9bb;
194
+ border-radius:18px;padding:20px;margin-bottom:16px;display:flex;gap:15px;align-items:center;
195
+ }
196
+ .bonus .em{font-size:42px;flex:none;}
197
+ .bonus .t{font-weight:800;font-size:17px;color:#3f5230;margin-bottom:3px;}
198
+ .bonus .d{color:#4f6240;font-size:14.5px;}
199
+
200
+ /* download band */
201
+ .download{
202
+ text-align:center;background:linear-gradient(180deg,#fffdf7,#fdf2e1);
203
+ border:1.5px solid var(--line);border-radius:20px;padding:26px 20px;margin-bottom:8px;
204
+ }
205
+ .download .small{color:var(--ink-soft);font-size:13.5px;margin-top:12px;}
206
+
207
+ .section-title{font-size:14px;font-weight:800;letter-spacing:.6px;text-transform:uppercase;color:var(--ink-soft);margin:26px 4px 12px;}
208
+
209
+ .footnote{text-align:center;color:var(--ink-soft);font-size:13px;margin-top:30px;}
210
+ @media(max-width:560px){
211
+ .ba{grid-template-columns:1fr;}
212
+ .ba .arrow{transform:rotate(90deg);}
213
+ h1{font-size:25px;}
214
+ }
215
+ </style>
216
+ </head>
217
+ <body>
218
+ <div class="wrap">
219
+
220
+ <!-- TOP BAR -->
221
+ <div class="topbar">
222
+ <div class="brand">
223
+ <span class="logo">🧺</span>
224
+ <span>ScrubData<small data-es="tu ayudante de listas" data-en="your list helper">tu ayudante de listas</small></span>
225
+ </div>
226
+ <div class="lang">
227
+ <button class="on" onclick="setLang('es',this)">Español</button>
228
+ <button onclick="setLang('en',this)">English</button>
229
+ </div>
230
+ </div>
231
+
232
+ <!-- PERSISTENT SAFETY RIBBON -->
233
+ <div class="safety">
234
+ <span class="dot">🌿</span>
235
+ <span data-es="Tu archivo original no se toca. Nada sale de esta computadora. Siempre puedes dejarlo como estaba."
236
+ data-en="Your original file stays exactly as it is. Nothing leaves this computer. You can always put it back the way it was.">
237
+ Tu archivo original no se toca. Nada sale de esta computadora. Siempre puedes dejarlo como estaba.
238
+ </span>
239
+ </div>
240
+
241
+ <!-- STEP PILLS (let reviewer walk the arc) -->
242
+ <div class="steps">
243
+ <span class="pill on" onclick="go(0,this)" data-es="1 · Bienvenida" data-en="1 · Welcome">1 · Bienvenida</span>
244
+ <span class="pill" onclick="go(1,this)" data-es="2 · Acomodando" data-en="2 · Tidying">2 · Acomodando</span>
245
+ <span class="pill" onclick="go(2,this)" data-es="3 · Lo que encontré" data-en="3 · What I found">3 · Lo que encontré</span>
246
+ </div>
247
+
248
+ <!-- ============ SCREEN 1 — WELCOME + DROP ============ -->
249
+ <section class="screen active" id="s0">
250
+ <div class="card">
251
+ <h1 data-es="Hola, Lupita. Vamos a poner tu lista bonita. 🌼"
252
+ data-en="Hi, Lupita. Let's make your list nice and tidy. 🌼">
253
+ Hola, Lupita. Vamos a poner tu lista bonita. 🌼
254
+ </h1>
255
+ <p class="lead" data-es="Sube tu archivo y yo lo reviso contigo, despacito. Sin botones raros, sin configurar nada."
256
+ data-en="Drop your file and I'll look through it with you, nice and slow. No strange buttons, nothing to set up.">
257
+ Sube tu archivo y yo lo reviso contigo, despacito. Sin botones raros, sin configurar nada.
258
+ </p>
259
+
260
+ <div class="drop" onclick="go(1)">
261
+ <div class="big">📂</div>
262
+ <div class="title" data-es="Suelta tu archivo aquí — yo le echo un ojo."
263
+ data-en="Drop your file here — I'll take a look.">Suelta tu archivo aquí — yo le echo un ojo.</div>
264
+ <div class="sub" data-es="Excel o CSV está bien. Tu original se queda igualito."
265
+ data-en="Excel or CSV is fine. Your original stays exactly as it is.">Excel o CSV está bien. Tu original se queda igualito.</div>
266
+ <div class="file-chip">📄 resumen-del-mes-mayo.xlsx</div>
267
+ </div>
268
+
269
+ <div class="center mt">
270
+ <button class="btn" onclick="go(1)">
271
+ <span>🧽</span><span data-es="Acomódalo por mí" data-en="Clean it up">Acomódalo por mí</span>
272
+ </button>
273
+ </div>
274
+ </div>
275
+ </section>
276
+
277
+ <!-- ============ SCREEN 2 — WORKING ============ -->
278
+ <section class="screen" id="s1">
279
+ <div class="card work">
280
+ <div class="pot">🍲</div>
281
+ <h2 data-es="Estoy acomodando tu lista…" data-en="I'm tidying your list…">Estoy acomodando tu lista…</h2>
282
+ <div class="progress"><i></i></div>
283
+ <p class="work-note" id="workNote"
284
+ data-es="Trabajando aquí mismo, en tu computadora. Tu original está a salvo."
285
+ data-en="Working right here on your computer. Your original is safe.">
286
+ Trabajando aquí mismo, en tu computadora. Tu original está a salvo.
287
+ </p>
288
+ <div class="center mt">
289
+ <button class="btn ghost" onclick="go(2)" data-es="Ver lo que encontré →" data-en="See what I found →">Ver lo que encontré →</button>
290
+ </div>
291
+ </div>
292
+ </section>
293
+
294
+ <!-- ============ SCREEN 3 — RESULT ============ -->
295
+ <section class="screen" id="s2">
296
+
297
+ <!-- SUMMARY HERO -->
298
+ <div class="card">
299
+ <div class="badge-row">
300
+ <div class="merit">🏅</div>
301
+ <div>
302
+ <h1 style="margin:0" data-es="¡Listo! Tu lista quedó bien bonita."
303
+ data-en="All done! Your list is in great shape.">¡Listo! Tu lista quedó bien bonita.</h1>
304
+ <p class="lead" style="margin:2px 0 0" data-es="Esto fue lo que acomodé por ti — léelo en voz alta a Yolanda si quieres."
305
+ data-en="Here's what I tidied for you — read it out loud to Yolanda if you like.">
306
+ Esto fue lo que acomodé por ti — léelo en voz alta a Yolanda si quieres.
307
+ </p>
308
+ </div>
309
+ </div>
310
+
311
+ <ul class="summary-list">
312
+ <li><span class="ic">🌮</span><span data-es="<b>“Al pastor”</b> estaba escrito de 4 maneras. Los junté: <b>1,204 vendidos</b> en mayo."
313
+ data-en="<b>“Al pastor”</b> was written 4 different ways. I counted them together: <b>1,204 sold</b> in May.">
314
+ <b>“Al pastor”</b> estaba escrito de 4 maneras. Los junté: <b>1,204 vendidos</b> en mayo.</span></li>
315
+ <li><span class="ic">👥</span><span data-es="<b>3 clientes</b> aparecían dos veces. Los reuní para que los revises."
316
+ data-en="<b>3 customers</b> showed up twice. I gathered each one for you to check.">
317
+ <b>3 clientes</b> aparecían dos veces. Los reuní para que los revises.</span></li>
318
+ <li><span class="ic">📞</span><span data-es="Acomodé <b>todos los teléfonos</b> para que se lean igualito."
319
+ data-en="I made <b>all the phone numbers</b> match so they're easy to read.">
320
+ Acomodé <b>todos los teléfonos</b> para que se lean igualito.</span></li>
321
+ <li><span class="ic">🗓️</span><span data-es="Puse <b>todas las fechas</b> escritas de la misma forma."
322
+ data-en="I wrote <b>all the dates</b> the same way.">
323
+ Puse <b>todas las fechas</b> escritas de la misma forma.</span></li>
324
+ <li><span class="ic">⬜</span><span data-es="Algunos espacios decían “N/A” o un guion. Los dejé como <b>vacíos</b>."
325
+ data-en="Some spots said “N/A” or just a dash. I treated those as <b>empty</b>.">
326
+ Algunos espacios decían “N/A” o un guion. Los dejé como <b>vacíos</b>.</span></li>
327
+ </ul>
328
+ </div>
329
+
330
+ <!-- CHANGE CARDS (story, not diff) -->
331
+ <div class="section-title" data-es="Aquí está lo que cambió — antes y después" data-en="Here's what changed — before and after">
332
+ Aquí está lo que cambió — antes y después
333
+ </div>
334
+
335
+ <div class="change">
336
+ <div class="head">🌮 <span data-es="El mismo taco, escrito de varias maneras" data-en="The same taco, written a few ways">El mismo taco, escrito de varias maneras</span></div>
337
+ <div class="say" data-es="La computadora por fin entiende que es el mismo taco. Los conté juntos."
338
+ data-en="The computer finally understands it's the same taco. I counted them together.">
339
+ La computadora por fin entiende que es el mismo taco. Los conté juntos.</div>
340
+ <div class="ba">
341
+ <div class="col">
342
+ <div class="lab" data-es="Antes" data-en="Before">Antes</div>
343
+ <div class="row dim">al pastor</div>
344
+ <div class="row dim">Al Pastor</div>
345
+ <div class="row dim">pastor</div>
346
+ <div class="row dim">tacos al pastor</div>
347
+ </div>
348
+ <div class="arrow">→</div>
349
+ <div class="col after">
350
+ <div class="lab" data-es="Después" data-en="After">Después</div>
351
+ <div class="row"><b>Al pastor</b></div>
352
+ <div class="row" data-es="1,204 vendidos" data-en="1,204 sold">1,204 vendidos</div>
353
+ </div>
354
+ </div>
355
+ </div>
356
+
357
+ <div class="change">
358
+ <div class="head">📞 <span data-es="Los teléfonos, todos parejitos" data-en="Phone numbers, all matching">Los teléfonos, todos parejitos</span></div>
359
+ <div class="say" data-es="Los dejé escritos igual para que sean fáciles de leer y marcar."
360
+ data-en="I made them all match so they're easy to read and dial.">
361
+ Los dejé escritos igual para que sean fáciles de leer y marcar.</div>
362
+ <div class="ba">
363
+ <div class="col">
364
+ <div class="lab" data-es="Antes" data-en="Before">Antes</div>
365
+ <div class="row dim">55-1234.5678</div>
366
+ <div class="row dim">(55) 12345678</div>
367
+ <div class="row dim">5512345678</div>
368
+ </div>
369
+ <div class="arrow">→</div>
370
+ <div class="col after">
371
+ <div class="lab" data-es="Después" data-en="After">Después</div>
372
+ <div class="row"><b>55 1234 5678</b></div>
373
+ </div>
374
+ </div>
375
+ </div>
376
+
377
+ <!-- GENTLE CONFIRM — money -->
378
+ <div class="ask" id="ask1">
379
+ <div class="q">💵 <span data-es="¿Dejo fuera del total las filas de $0.00?" data-en="Leave the $0.00 rows out of the total?">¿Dejo fuera del total las filas de $0.00?</span></div>
380
+ <div class="detail" data-es="Encontré <b>31 filas que marcan $0.00</b>. Eso parece un error del sistema, no una venta. Tú decides — yo no toco el dinero sin preguntarte."
381
+ data-en="I found <b>31 rows showing $0.00</b>. That looks like a glitch, not a sale. You decide — I won't touch money without asking.">
382
+ Encontré <b>31 filas que marcan $0.00</b>. Eso parece un error del sistema, no una venta. Tú decides — yo no toco el dinero sin preguntarte.</div>
383
+ <div class="actions">
384
+ <button class="yes" onclick="answer('ask1')" data-es="Sí, déjalas fuera" data-en="Yes, leave them out">Sí, déjalas fuera</button>
385
+ <button class="no" onclick="answer('ask1')" data-es="No, déjalas" data-en="No, keep them">No, déjalas</button>
386
+ </div>
387
+ <div class="answered" id="ans-ask1">✓ <span data-es="Anotado. Tú mandas." data-en="Got it. You're in charge.">Anotado. Tú mandas.</span></div>
388
+ </div>
389
+
390
+ <!-- GENTLE CONFIRM — duplicates -->
391
+ <div class="ask" id="ask2">
392
+ <div class="q">👥 <span data-es="¿Estos dos son la misma persona?" data-en="Are these two the same person?">¿Estos dos son la misma persona?</span></div>
393
+ <div class="detail" data-es="<b>“Yolanda Pérez”</b> y <b>“Yola Perez”</b> tienen el mismo teléfono. ¿Los junto en uno solo?"
394
+ data-en="<b>“Yolanda Pérez”</b> and <b>“Yola Perez”</b> share the same phone. Shall I count them as one?">
395
+ <b>“Yolanda Pérez”</b> y <b>“Yola Perez”</b> tienen el mismo teléfono. ¿Los junto en uno solo?</div>
396
+ <div class="actions">
397
+ <button class="yes" onclick="answer('ask2')" data-es="Sí, es la misma" data-en="Yes, same person">Sí, es la misma</button>
398
+ <button class="no" onclick="answer('ask2')" data-es="No, déjalas aparte" data-en="No, keep separate">No, déjalas aparte</button>
399
+ </div>
400
+ <div class="answered" id="ans-ask2">✓ <span data-es="Listo, como tú digas." data-en="Done, as you say.">Listo, como tú digas.</span></div>
401
+ </div>
402
+
403
+ <!-- HONEST FLAGS -->
404
+ <div class="flags">
405
+ <div class="q">🤔 <span data-es="De estas no estuve segura — te las dejé para que las veas" data-en="I wasn't sure about these — I left them for you">De estas no estuve segura — te las dejé para que las veas</span></div>
406
+ <ul>
407
+ <li data-es="Dos teléfonos tienen muy pocos números. No los cambié por si tú los conoces."
408
+ data-en="Two phone numbers have too few digits. I didn't change them in case you know them.">
409
+ Dos teléfonos tienen muy pocos números. No los cambié por si tú los conoces.</li>
410
+ <li data-es="Las notas del catering (“fiesta Sra. Mendoza”) no las entendí bien. Las dejé tal cual."
411
+ data-en="The catering notes (“Mrs. Mendoza's party”) I didn't quite understand. I left them as they were.">
412
+ Las notas del catering (“fiesta Sra. Mendoza”) no las entendí bien. Las dejé tal cual.</li>
413
+ <li data-es="El total de mayo y la suma de las filas no cuadran por $84. Aquí te lo marco para que lo cheques con tu caja."
414
+ data-en="May's total and the sum of the rows are $84 apart. I'm flagging it so you can check it against your cash.">
415
+ El total de mayo y la suma de las filas no cuadran por $84. Aquí te lo marco para que lo cheques con tu caja.</li>
416
+ </ul>
417
+ </div>
418
+
419
+ <!-- BONUS CARD -->
420
+ <div class="bonus">
421
+ <div class="em">🫙</div>
422
+ <div>
423
+ <div class="t" data-es="De pasada: se te está acabando el adobo de pastor"
424
+ data-en="By the way: you're running low on pastor marinade">De pasada: se te está acabando el adobo de pastor</div>
425
+ <div class="d" data-es="Con lo que vendiste, te alcanza para unos 6 días. Buen momento para pedir más."
426
+ data-en="At this pace you have about 6 days left. Good time to reorder.">
427
+ Con lo que vendiste, te alcanza para unos 6 días. Buen momento para pedir más.</div>
428
+ </div>
429
+ </div>
430
+
431
+ <!-- DOWNLOAD BAND -->
432
+ <div class="download">
433
+ <button class="btn" onclick="return false">
434
+ <span>💾</span><span data-es="Dame mi copia limpia" data-en="Get my clean copy">Dame mi copia limpia</span>
435
+ </button>
436
+ <div style="margin-top:14px;">
437
+ <button class="btn ghost" onclick="return false" data-es="🖨️ Imprimir el resumen en palabras sencillas" data-en="🖨️ Print the plain-words summary">
438
+ 🖨️ Imprimir el resumen en palabras sencillas</button>
439
+ </div>
440
+ <div class="small" data-es="Tu original (resumen-del-mes-mayo.xlsx) sigue intacto. Esto es una copia nueva."
441
+ data-en="Your original (resumen-del-mes-mayo.xlsx) is untouched. This is a fresh new copy.">
442
+ Tu original (resumen-del-mes-mayo.xlsx) sigue intacto. Esto es una copia nueva.</div>
443
+ </div>
444
+
445
+ <!-- REVERSIBILITY -->
446
+ <div class="center mt">
447
+ <button class="btn ghost" onclick="return false" data-es="↩️ Mejor déjalo como estaba" data-en="↩️ Put it back the way it was">↩️ Mejor déjalo como estaba</button>
448
+ </div>
449
+
450
+ <div class="footnote" data-es="Lo hiciste tú misma, y está bien. 🌙 Buenas noches, Lupita."
451
+ data-en="You did it yourself, and it's right. 🌙 Goodnight, Lupita.">
452
+ Lo hiciste tú misma, y está bien. 🌙 Buenas noches, Lupita.
453
+ </div>
454
+ </section>
455
+
456
+ </div>
457
+
458
+ <script>
459
+ var screens = ['s0','s1','s2'];
460
+ var pills = document.querySelectorAll('.steps .pill');
461
+
462
+ function go(i, el){
463
+ screens.forEach(function(id,n){
464
+ document.getElementById(id).classList.toggle('active', n===i);
465
+ });
466
+ pills.forEach(function(p,n){ p.classList.toggle('on', n===i); });
467
+ window.scrollTo({top:0,behavior:'smooth'});
468
+ if(i===1){ runWork(); }
469
+ }
470
+
471
+ // working state: cycle reassuring notes, then auto-advance
472
+ var workTimers = [];
473
+ function runWork(){
474
+ workTimers.forEach(clearTimeout); workTimers = [];
475
+ var note = document.getElementById('workNote');
476
+ var es = [
477
+ "Trabajando aquí mismo, en tu computadora. Tu original está a salvo.",
478
+ "Estoy juntando los tacos que están escritos de varias maneras…",
479
+ "Acomodando teléfonos y fechas para que se lean igualito…",
480
+ "Casi listo — guardando una copia nueva, sin tocar tu original."
481
+ ];
482
+ var en = [
483
+ "Working right here on your computer. Your original is safe.",
484
+ "Gathering the tacos that are written a few different ways…",
485
+ "Tidying phone numbers and dates so they're easy to read…",
486
+ "Almost there — saving a fresh copy, leaving your original untouched."
487
+ ];
488
+ var k = (lang==='es') ? es : en;
489
+ var step = 0;
490
+ note.textContent = k[0];
491
+ for(var s=1;s<k.length;s++){
492
+ (function(s){ workTimers.push(setTimeout(function(){ note.textContent = k[s]; }, s*900)); })(s);
493
+ }
494
+ workTimers.push(setTimeout(function(){ if(document.getElementById('s1').classList.contains('active')) go(2); }, 3700));
495
+ }
496
+
497
+ function answer(id){
498
+ var card = document.getElementById(id);
499
+ card.querySelector('.actions').style.display='none';
500
+ document.getElementById('ans-'+id).style.display='flex';
501
+ }
502
+
503
+ // language toggle
504
+ var lang = 'es';
505
+ function setLang(l, el){
506
+ lang = l;
507
+ document.querySelectorAll('.lang button').forEach(function(b){b.classList.remove('on');});
508
+ el.classList.add('on');
509
+ document.documentElement.lang = l;
510
+ document.querySelectorAll('[data-es]').forEach(function(node){
511
+ var v = node.getAttribute('data-'+l);
512
+ if(v!=null) node.innerHTML = v;
513
+ });
514
+ }
515
+ </script>
516
+ </body>
517
+ </html>
design/mockups/office/index.html ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1">
6
+ <title>ScrubData — clean spreadsheets, with the receipts</title>
7
+ <style>
8
+ :root{
9
+ --paper:#faf7f2; --card:#fffdfa; --ink:#23201c; --ink-soft:#6b6359;
10
+ --line:#ece5da; --accent:#2f6f5e; --accent-soft:#e7f1ec;
11
+ --done:#3f7d5f; --done-bg:#eef5ef; --done-line:#cfe3d4;
12
+ --call:#b06a1f; --call-bg:#fbf1e2; --call-line:#f0dcbf;
13
+ --flag:#7a7367; --flag-bg:#f3efe8;
14
+ --shadow:0 1px 2px rgba(40,30,20,.04),0 8px 24px rgba(40,30,20,.06);
15
+ --r:15px;
16
+ }
17
+ *{box-sizing:border-box}
18
+ body{margin:0;background:var(--paper);color:var(--ink);
19
+ font-family:Inter,-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,sans-serif;
20
+ line-height:1.5;-webkit-font-smoothing:antialiased}
21
+ .wrap{max-width:760px;margin:0 auto;padding:0 22px}
22
+ a{color:var(--accent)}
23
+
24
+ /* privacy ribbon */
25
+ .ribbon{background:var(--accent-soft);color:#234e42;font-size:13.5px;
26
+ text-align:center;padding:9px 16px;border-bottom:1px solid #d6e7df}
27
+ .ribbon b{font-weight:600}
28
+
29
+ /* header */
30
+ header{padding:40px 0 8px}
31
+ .logo{display:flex;align-items:center;gap:9px;font-weight:700;font-size:18px;letter-spacing:-.2px}
32
+ .logo .mark{width:26px;height:26px;border-radius:8px;background:var(--accent);
33
+ display:grid;place-items:center;color:#fff;font-size:15px}
34
+ h1{font-size:30px;line-height:1.15;letter-spacing:-.6px;margin:22px 0 8px;font-weight:740}
35
+ .sub{color:var(--ink-soft);font-size:16.5px;max-width:560px}
36
+
37
+ /* file chip */
38
+ .filebar{display:flex;align-items:center;gap:12px;margin:26px 0 6px;
39
+ background:var(--card);border:1px solid var(--line);border-radius:var(--r);
40
+ padding:14px 16px;box-shadow:var(--shadow)}
41
+ .fileicon{width:34px;height:34px;border-radius:9px;background:#eef4f1;color:var(--accent);
42
+ display:grid;place-items:center;font-size:16px;flex:none}
43
+ .filebar .nm{font-weight:600}
44
+ .filebar .meta{color:var(--ink-soft);font-size:13.5px}
45
+ .filebar .spacer{flex:1}
46
+ .pill-done-mini{font-size:12px;font-weight:600;color:var(--done);
47
+ background:var(--done-bg);border:1px solid var(--done-line);padding:3px 9px;border-radius:20px}
48
+
49
+ /* summary */
50
+ section{margin:34px 0}
51
+ .eyebrow{font-size:12.5px;font-weight:700;letter-spacing:.06em;text-transform:uppercase;
52
+ color:var(--ink-soft);margin-bottom:13px}
53
+ .result-h{font-size:22px;font-weight:720;letter-spacing:-.3px;margin:0 0 4px}
54
+ .result-sub{color:var(--ink-soft);margin:0 0 4px}
55
+ .summary{background:var(--card);border:1px solid var(--line);border-radius:var(--r);
56
+ padding:6px 20px;box-shadow:var(--shadow)}
57
+ .summary li{list-style:none;padding:14px 0;border-bottom:1px solid var(--line);
58
+ display:flex;gap:13px;align-items:flex-start;font-size:15.5px}
59
+ .summary li:last-child{border-bottom:0}
60
+ .summary .ic{flex:none;margin-top:1px;font-size:16px}
61
+ .summary b{font-weight:650}
62
+ .handoff{color:var(--call)}
63
+
64
+ /* change cards */
65
+ .card{background:var(--card);border:1px solid var(--line);border-left-width:4px;
66
+ border-radius:var(--r);padding:17px 19px;margin:13px 0;box-shadow:var(--shadow)}
67
+ .card.done{border-left-color:var(--done)}
68
+ .card.call{border-left-color:var(--call)}
69
+ .card.flag{border-left-color:#cdbfa6}
70
+ .card-top{display:flex;align-items:center;gap:10px;margin-bottom:4px}
71
+ .card-title{font-weight:650;font-size:15.5px}
72
+ .pill{font-size:11.5px;font-weight:700;letter-spacing:.04em;padding:3px 9px;border-radius:20px;margin-left:auto;flex:none}
73
+ .pill.done{color:var(--done);background:var(--done-bg);border:1px solid var(--done-line)}
74
+ .pill.call{color:var(--call);background:var(--call-bg);border:1px solid var(--call-line)}
75
+ .pill.flag{color:var(--flag);background:var(--flag-bg);border:1px solid #e2d9c9}
76
+ .card-body{color:var(--ink-soft);font-size:14.5px}
77
+
78
+ /* before/after */
79
+ .ba{display:grid;grid-template-columns:1fr auto 1fr;gap:10px;align-items:center;margin-top:13px}
80
+ .ba .col{background:#fbf9f5;border:1px solid var(--line);border-radius:11px;padding:11px 13px}
81
+ .ba .lab{font-size:11px;text-transform:uppercase;letter-spacing:.05em;color:var(--ink-soft);margin-bottom:6px}
82
+ .ba .val{font-size:13.5px;font-family:"SF Mono",ui-monospace,Menlo,monospace}
83
+ .ba .was{color:#9a8d7c}
84
+ .ba .arrow{color:var(--accent);font-size:18px;text-align:center}
85
+ .ba .ann{color:var(--done);font-weight:600;font-size:12.5px}
86
+ .strike{text-decoration:line-through;text-decoration-color:#c9bcab;color:#9a8d7c}
87
+
88
+ /* your-call buttons */
89
+ .actions{display:flex;gap:9px;margin-top:14px}
90
+ .btn{font:inherit;font-size:14px;font-weight:600;padding:9px 15px;border-radius:10px;cursor:pointer;border:1px solid var(--line);background:#fff;color:var(--ink)}
91
+ .btn.primary{background:var(--accent);border-color:var(--accent);color:#fff}
92
+ .btn.ghost{background:transparent}
93
+
94
+ /* download */
95
+ .download{background:linear-gradient(180deg,#fffdfa,#f7f2ea);border:1px solid var(--line);
96
+ border-radius:18px;padding:26px;text-align:center;box-shadow:var(--shadow)}
97
+ .download h3{margin:0 0 4px;font-size:19px;font-weight:720}
98
+ .download p{margin:0 0 18px;color:var(--ink-soft);font-size:14.5px}
99
+ .dl-row{display:flex;gap:11px;justify-content:center;flex-wrap:wrap}
100
+ .btn.big{padding:12px 22px;font-size:15px}
101
+ .revert{margin-top:16px;font-size:13px;color:var(--ink-soft)}
102
+
103
+ footer{padding:30px 0 50px;text-align:center;color:#9a8d7c;font-size:13px;border-top:1px solid var(--line);margin-top:36px}
104
+ .restart{display:inline-block;margin-top:22px;font-size:14px;color:var(--accent);font-weight:600;text-decoration:none}
105
+ </style>
106
+ </head>
107
+ <body>
108
+
109
+ <div class="ribbon">🔒 <b>Runs entirely on your machine.</b> Your original file is untouched — nothing is uploaded.</div>
110
+
111
+ <div class="wrap">
112
+ <header>
113
+ <div class="logo"><span class="mark">✦</span> ScrubData</div>
114
+ <h1>Done. Here's what changed.</h1>
115
+ <p class="sub">I did the tedious part — matching spellings, fixing formats, finding the blanks. Everything below is reversible, and I left the judgment calls for you.</p>
116
+ </header>
117
+
118
+ <div class="filebar">
119
+ <div class="fileicon">▦</div>
120
+ <div>
121
+ <div class="nm">crm-export-may.csv</div>
122
+ <div class="meta">3,840 rows · 11 columns · cleaned in 4.2s, locally</div>
123
+ </div>
124
+ <div class="spacer"></div>
125
+ <div class="pill-done-mini">6 fixes applied</div>
126
+ </div>
127
+
128
+ <!-- SUMMARY -->
129
+ <section>
130
+ <div class="eyebrow">The summary, in plain English</div>
131
+ <ul class="summary">
132
+ <li><span class="ic">🗂️</span><div><b>Unified 4 spellings of "United States"</b> (US, U.S., usa, United States) into one. 2,108 rows affected.</div></li>
133
+ <li><span class="ic">🏷️</span><div><b>Merged 4 ways of writing the same deal stage</b> ("Closed Won", "closed-won", "Won", "CW") into one. 1,204 rows.</div></li>
134
+ <li><span class="ic">⬜</span><div><b>Treated 47 disguised blanks</b> ("N/A", "none", "—") as empty, so your counts and filters behave.</div></li>
135
+ <li><span class="ic">📅</span><div><b>Standardized all dates to YYYY-MM-DD</b> and phone numbers to one format.</div></li>
136
+ <li class="handoff"><span class="ic">✋</span><div><b>2 changes touch money or identity, so I didn't make them.</b> They're below for your call.</div></li>
137
+ </ul>
138
+ </section>
139
+
140
+ <!-- DONE -->
141
+ <section>
142
+ <div class="eyebrow">Handled — already applied (and reversible)</div>
143
+
144
+ <div class="card done">
145
+ <div class="card-top"><span class="card-title">Same country, counted as one</span><span class="pill done">DONE</span></div>
146
+ <div class="card-body">Four spellings were splitting your "United States" rows across the report.</div>
147
+ <div class="ba">
148
+ <div class="col"><div class="lab">Before</div>
149
+ <div class="val was">US · U.S. · usa<br>United States</div></div>
150
+ <div class="arrow">→</div>
151
+ <div class="col"><div class="lab">After</div>
152
+ <div class="val">United States</div><div class="ann">one value · 2,108 rows</div></div>
153
+ </div>
154
+ </div>
155
+
156
+ <div class="card done">
157
+ <div class="card-top"><span class="card-title">Phone numbers, one format</span><span class="pill done">DONE</span></div>
158
+ <div class="card-body">Mixed formats standardized so lookups and dedupes line up.</div>
159
+ <div class="ba">
160
+ <div class="col"><div class="lab">Before</div>
161
+ <div class="val was">(415) 555.0192<br>415-555-0147<br>+1 415 555 0188</div></div>
162
+ <div class="arrow">→</div>
163
+ <div class="col"><div class="lab">After</div>
164
+ <div class="val">(415) 555-0192<br>(415) 555-0147<br>(415) 555-0188</div></div>
165
+ </div>
166
+ </div>
167
+ </section>
168
+
169
+ <!-- YOUR CALL -->
170
+ <section>
171
+ <div class="eyebrow">Needs your call — I didn't touch these</div>
172
+
173
+ <div class="card call">
174
+ <div class="card-top"><span class="card-title">31 deals show $0.00</span><span class="pill call">YOUR CALL</span></div>
175
+ <div class="card-body">Usually a sync glitch, not a real deal. Leaving them in drags your win total down. Exclude them from the total?</div>
176
+ <div class="actions">
177
+ <button class="btn primary">Leave them out</button>
178
+ <button class="btn ghost">Keep them</button>
179
+ </div>
180
+ </div>
181
+
182
+ <div class="card call">
183
+ <div class="card-top"><span class="card-title">Possible duplicate contact</span><span class="pill call">YOUR CALL</span></div>
184
+ <div class="card-body">"Yolanda R." and "Yolanda Reyes" share an email (y.reyes@northwind.co). Count them as one contact?</div>
185
+ <div class="actions">
186
+ <button class="btn primary">Merge them</button>
187
+ <button class="btn ghost">Keep both</button>
188
+ </div>
189
+ </div>
190
+ </section>
191
+
192
+ <!-- FLAGGED -->
193
+ <section>
194
+ <div class="eyebrow">Worth a look — left exactly as they were</div>
195
+ <div class="card flag">
196
+ <div class="card-top"><span class="card-title">3 cells I wouldn't guess at</span><span class="pill flag">FLAGGED</span></div>
197
+ <div class="card-body">Two phone numbers have too few digits, and one note reads <span style="font-family:ui-monospace,monospace;font-size:13px">"follow up?? — check w/ Dana"</span>. I didn't guess. Left them untouched for you to check.</div>
198
+ </div>
199
+ </section>
200
+
201
+ <!-- DOWNLOAD -->
202
+ <section>
203
+ <div class="download">
204
+ <h3>Your clean copy is ready</h3>
205
+ <p>Take the cleaned file and the change log. Both are yours to keep.</p>
206
+ <div class="dl-row">
207
+ <button class="btn primary big">↓ Download clean file</button>
208
+ <button class="btn big">Export change log</button>
209
+ </div>
210
+ <div class="revert">Your original is untouched. Revert any change — or all of them — whenever you want.</div>
211
+ </div>
212
+ <div style="text-align:center"><a class="restart" href="#">← Clean another file</a></div>
213
+ </section>
214
+ </div>
215
+
216
+ <footer>Runs locally. Nothing leaves your machine, ever.</footer>
217
+
218
+ </body>
219
+ </html>
docs/DATASETS.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dataset inventory — every source the system trains on, evaluates on, or must clean
2
+
3
+ Stage-3 consolidated registry (2026-06-11). Assignment discipline: a source is
4
+ TRAIN, EVAL, or BENCH — never both sides of train/eval.
5
+
6
+ ## Paired dirty/clean (27 — eval/paired_bench.py → docs/PAIRED_BENCH.md)
7
+
8
+ | source | origin | license | assignment | notes |
9
+ |---|---|---|---|---|
10
+ | hospital, beers, movies_1 | Raha (BigDaMa) | Apache-2.0 | TRAIN | champion mix since v6 |
11
+ | flights, rayyan | Raha | Apache-2.0 | EVAL (GEN) | held-out real errors |
12
+ | tax | Raha | Apache-2.0 | unused | numeric-heavy, huge |
13
+ | ed2_restaurants | BigDaMa ED2 | research | EVAL (GEN) | real NYC variants; errors past row 2k |
14
+ | fodors_zagats | Magellan EM | BSD-ish data | TRAIN | variant-masked EM table |
15
+ | dblp_acm, dblp_scholar | Magellan EM | research | BENCH only | out-of-regime (unique titles / convention-mismatch gold) |
16
+ | cleanml_company, cleanml_movie | CleanML | research | TRAIN | Company = org canon |
17
+ | gidcl_imdb | SICS-FRC GIDCL | none stated | TRAIN (v9+) | 1M-row pair; 57k errors; subset 86k rows |
18
+ | zeroed_billionaire, zeroed_tax100k | WelkinNi/ZeroED | none stated | BENCH | injected; rich categoricals |
19
+ | dgov_* (5 tables) | LUH-DBS Matelda | Apache-2.0 | BENCH | real data.gov tables, injected typos (6,692 more available) |
20
+ | tt_* (8 tables) | ToughTables 2T_WD | CC-BY-4.0 | BENCH | gold-anchored entity misspellings, 370–33.5k corrections each |
21
+
22
+ ## Wild messy tables (35 — eval/wild_bench.py → docs/WILD_BENCH.md)
23
+
24
+ 24 portal tables (training/unpaired_sources.json cache: NYC/Chicago/SF/LA/Seattle/TX/WA
25
+ portals, spotify, billboard, titanic, worldcities, airlines) + 12 stage-3 additions
26
+ (training/harvest_wild.py): bx_books (mojibake), salary_survey, fec_indiv80 (PII,
27
+ headerless), acnc_charities (AU), uk_price_paid (headerless UK), irs_eo1,
28
+ glassdoor_jobs (multiline cells), paris_trees (FR), online_retail, bl_flickr_books,
29
+ open_food_facts (211 cols), ct_real_estate. Backlog: CMS doctors (API 400), NHTSA
30
+ FLAT_CMPL (multi-GB), Canada contracts (627MB).
31
+
32
+ ## Alias vocabularies (training generator material)
33
+
34
+ | vocab | size | license | regime |
35
+ |---|---|---|---|
36
+ | toughtables_aliases | 49,629 | CC-BY-4.0 | real entity misspellings (gold-anchored) |
37
+ | musicbrainz_hint_aliases | 34,017 | CC0 | community-recorded artist misspellings |
38
+ | rxnorm_aliases | 17,701 | public domain | drug name synonyms |
39
+ | ror_aliases | 73k orgs | CC0 | research orgs |
40
+ | geonames_city_aliases | 80k cities | CC-BY | city aliases |
41
+ | wikidata_company_aliases | 10.2k | CC0 | company aliases |
42
+ | onet_jobtitle_aliases | 1,016 | CC-BY-4.0 | job titles |
43
+ | nickname_aliases | 555 | Apache-2.0 | first names |
44
+ | openflights_airports | 7,698 | ODbL/DbCL | airports reference |
45
+ | libpostal_aliases | — | MIT | address abbreviations |
46
+
47
+ ## Measured conclusions that govern future widening
48
+
49
+ 1. Pre-paired corpus discovery is SATURATED (3 verified hunts) — synthesis from
50
+ vocabularies is the widening path.
51
+ 2. Pair volume / vocab training does NOT move held-out generalization (v7–v9, 4
52
+ retrains + tt-transfer test): the planner's value_counts cap (80) structurally
53
+ hides high-cardinality dirty cells. The unlock is architectural: error-suspect /
54
+ windowed profiling and cross-row entity voting.
55
+ 3. The deterministic side (grounding + ops + verifier union) carries never-seen
56
+ tables today; every op added from a measured regime (normalize_punctuation)
57
+ moved GEN; convention/encoding ops are the cheapest remaining wins.
docs/DEGENERATE_BASELINES.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Degenerate baselines + cost-weighted damage (W4.3 + W4.4)
2
+
3
+ Same 42 dirty/clean pairs as `eval/paired_bench.py`, scored with `run_real_multi.score()` (churn-neutral F1 + damage). The degenerate policies pin
4
+ the metric: no-op = floor (F1 0, damage 0), oracle = ceiling (F1 1, damage 0),
5
+ random-edit (seeded, 5% of cells) = vandalism the metric must punish. Abstain-all
6
+ is score-identical to no-op — the repair metric is flag-blind by design.
7
+
8
+ | policy | macro F1 | macro P | macro R | macro damage | fixed | damage cells |
9
+ |---|---|---|---|---|---|---|
10
+ | no-op | 0.000 | 1.000 | 0.000 | 0.0000 | 0 | 0 |
11
+ | abstain-all | 0.000 | 1.000 | 0.000 | 0.0000 | 0 | 0 |
12
+ | random-edit | 0.000 | 0.001 | 0.001 | 0.0485 | 39 | 80042 |
13
+ | oracle | 1.000 | 1.000 | 1.000 | 0.0000 | 163607 | 0 |
14
+ | shipped | 0.343 | 0.576 | 0.308 | 0.0229 | 83543 | 61679 |
15
+
16
+ ## Cost-weighted scores (Effective-Reliability style, W4.4)
17
+
18
+ score_c = fixes − c·damage_cells, micro-summed over all pairs; per-error =
19
+ score_c / 163607 total benchmark errors.
20
+
21
+ | policy | c=1 (per-error) | c=5 (per-error) | c=10 (per-error) |
22
+ |---|---|---|---|
23
+ | no-op | 0 (+0.000) | 0 (+0.000) | 0 (+0.000) |
24
+ | abstain-all | 0 (+0.000) | 0 (+0.000) | 0 (+0.000) |
25
+ | random-edit | -80003 (-0.489) | -400171 (-2.446) | -800381 (-4.892) |
26
+ | oracle | 163607 (+1.000) | 163607 (+1.000) | 163607 (+1.000) |
27
+ | shipped | 21864 (+0.134) | -224852 (-1.374) | -533247 (-3.259) |
28
+
29
+ Acceptance: oracle F1 = 1.0 on all pairs: **True** · no-op damage = 0.0 on all pairs: **True**
30
+ Repro: `uv run python -m eval.degenerate` (seed 7, edit fraction 0.05).
docs/FIELD_NOTES.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Field notes — building ScrubData small, on purpose
2
+
3
+ *Build Small Hackathon, June 2026. A ≤4B model, a Gradio Space, and two weeks of
4
+ finding out what "small but honest" actually costs.*
5
+
6
+ ## The bet
7
+
8
+ The person who most needs data cleaning — the ops coordinator with a messy CRM export
9
+ and a Monday deadline — will never write a pandas script, and shouldn't have to ship
10
+ her customer data to a frontier API either. The bet: a 4B model running locally is
11
+ enough, **if you stop asking it to edit data and start asking it to plan**.
12
+
13
+ So the model never touches a cell. It reads an aggregated profile (per-value frequency
14
+ counts — so the model sees a bounded, fixed-size summary whether the table has a hundred
15
+ rows or a million) and emits a JSON plan; deterministic pandas executes it. Every change is named, reversible, and logged. Silent edits are
16
+ impossible by construction. That decomposition turned out to be the whole project.
17
+
18
+ ## Things that broke, in order
19
+
20
+ **The fine-tune that aced the test and failed the job.** v4 hit canonicalization F1
21
+ 0.90 on held-out synthetic data — and scored exactly 0.000 on real hospital typos. It
22
+ had never seen a high-cardinality real column. Fix: derive training pairs from real
23
+ dirty/clean benchmark tables by cell alignment, keeping only *learnable*
24
+ canonicalizations (a surface form that's a string variant of its target and never a
25
+ legitimate value elsewhere). Real repair recall: 0.00 → 0.42. Synthetic data teaches
26
+ the format; real data teaches the job.
27
+
28
+ **The GGUF that lobotomized the model.** Same adapter, two exports: Q8_0 worked
29
+ perfectly, Q4_K_M degenerated into `<tool_call>` loops. Hours of template debugging
30
+ later: the quantization itself was corrupting the export. Then the bf16 path had its
31
+ own version — training converged (loss 0.16) but free-running generation *still*
32
+ emitted tool-call loops, because Qwen3's tool-calling prior dominates the first token.
33
+ The fix is two tokens long: `suppress_tokens=[151657, 151658]`.
34
+
35
+ **The model that invented cities.** Asked for canonical forms, a generative model
36
+ generates — including `guntxrsvillx → huntsville` (wrong town). Frequency clustering
37
+ can't fix this either: a lone column has no signal to vote against the error (GARF
38
+ proves this structurally). The fix came from the literature: never free-generate a
39
+ canonical. Retrieve candidates from a reference taxonomy (GeoNames, ISO), require a
40
+ similarity threshold *and* an ambiguity margin, and **abstain** when unsure. `boxz` is
41
+ equally close to `Box` and `Boaz` — so the system declines and asks. We measured the
42
+ abstention: precision rises monotonically with the threshold (90% at the default, 95%
43
+ at 0.91). Knowing when not to act turned out to be the most valuable feature.
44
+
45
+ **The eval that graded itself too kindly — twice.** Our own ablations caught two metric
46
+ artifacts: (1) convention-tolerant scoring counted bulk case-rewrites as "good
47
+ changes," inflating precision — removing case-matching *gained* +0.12 until we made
48
+ the metric churn-neutral; (2) our adversarial traps included `Boazz`, which grounding
49
+ correctly maps to the real city Boaz — the trap was punishing correct behavior. Both
50
+ fixes are reported in the paper as results, because an eval you haven't tried to break
51
+ is an eval you can't trust.
52
+
53
+ **The honest negative result.** On *injected* typos, classical frequency clustering
54
+ remains a strong baseline — by construction: injection puts the canonical in the
55
+ column, which is clustering's ideal regime. Grounding's edge is real errors, tail
56
+ entities, and not wrong-merging. We report both slices separately rather than
57
+ averaging the difference away.
58
+
59
+ **The verifier that made the model shippable.** The fine-tune's hospital numbers told
60
+ an awkward story: recall 0.475 (best we'd measured for a local model) at precision
61
+ 0.185 — it fixed errors *and* invented merges. Instead of retraining, we scored every
62
+ proposed mapping with three deterministic gates distilled from its actual failures: a
63
+ value occurring ≥3 times is data, not a typo (*errors are rare*); a repair target must
64
+ dominate its source in frequency (no mapping one typo onto another); digit-bearing
65
+ codes only repair when the letter part is near-identical (`amix-2 → ami-2` yes,
66
+ `ak_ → al_` no). The gated model plan alone: **0.993 precision at 0.287 coverage** —
67
+ 146 of 147 changes correct. Union it with the grounded heuristic and you get **0.905
68
+ precision at 0.413 coverage** on hospital's 509 real errors. Every dropped mapping
69
+ becomes a review flag, not a silent skip. That composition — verify the model's
70
+ output, never trust it — is what the app now ships as its default planner.
71
+
72
+ ## The PII turn
73
+
74
+ A friend pointed at the OpenMed project (small Apache-2.0 token classifiers; their
75
+ paper is the sister result to our thesis — small specialized beats big generic). Their
76
+ 44M PII model, trained on clinical *sentences*, turned out to transfer perfectly to
77
+ bare CSV cells: 100% on names and addresses, no prompt template needed. We put it
78
+ behind a sensitive-type allowlist and a column-level vote, added a deterministic
79
+ checksum tier (Luhn, IBAN mod-97 — math, not vibes), and made masking an executor
80
+ operation. Leak test: 0/360 residual detectable PII after masking. OOD type detection:
81
+ 5/5 with 0/7 false positives. The privacy ribbon at the top of the app — "nothing
82
+ leaves this machine" — now describes the PII handling too, not just the inference.
83
+
84
+ ## The word that broke the demo
85
+
86
+ We shipped the engine, then sent the live Space to people who actually have messy
87
+ spreadsheets and aren't data people. The most useful feedback wasn't a bug report — it
88
+ was that the word **"cleaning" didn't mean anything to them**. One tester read "clean my
89
+ Excel" as *deleting* data: *"¿Te refieres a que elimine algo de algún archivo?"* ("you
90
+ mean it removes something from the file?"). Another didn't know where to start: *"¿eso
91
+ del Excel te lo subimos ahí o cómo?"* ("the Excel thing — do we upload it there, or
92
+ how?"). The clearest explanation of the whole product turned out to be a sentence we
93
+ typed by hand in a chat reply — *"it fixes text errors: names, phones, emails, cities"* —
94
+ and that sentence was nowhere in the app.
95
+
96
+ The engine was fine. The *framing* was the failure. So we changed the product to **show**
97
+ what cleaning is instead of naming it: the hero now opens with a literal before→after
98
+ strip (`nigeia → Nigeria`, `Calfornia → California`) before any upload, the headline is
99
+ the sentence that worked in chat ("Fix the messy text in your spreadsheet"), the copy
100
+ says plainly "I never delete your data," jargon labels are gone ("with PII" → "with
101
+ sensitive data"), and a one-click "watch it run on a sample" path removes the "where do I
102
+ even start" wall. One honesty footnote from the rewrite: our first before→after example
103
+ added a `+52` country code to a phone number — which the executor doesn't actually do — so
104
+ we cut it. The demo strip can only show what the engine truly does.
105
+
106
+ n was small and informal (~3 people we know), so this isn't a usability study. But you
107
+ only need to watch one person mistake your tool for a delete button to learn the lesson:
108
+ the people who most need the tool don't share your vocabulary, and the demo has to teach
109
+ the concept before it can show the feature.
110
+
111
+ ## What we'd tell the next person
112
+
113
+ 1. **Planner/executor is the trust unlock.** Auditability isn't a feature you add;
114
+ it's a decomposition you choose.
115
+ 2. **Verify supervision by executing it.** Every training example we kept provably
116
+ recovers the clean table. Bad plans can't become labels.
117
+ 3. **Ground generation in references and budget for abstention.** A small model that
118
+ declines correctly beats a big model that guesses confidently.
119
+ 4. **Attack your own eval before reviewers do.** Both of our metric bugs were found by
120
+ ablations we almost didn't run.
121
+ 5. **Small models are enough more often than you think** — and roughly $35 of GPU
122
+ credit covers an embarrassing number of mistakes if each one teaches you something.
123
+ 6. **Test the framing on someone outside your vocabulary.** The engine can be correct and
124
+ the product still unusable if the first screen assumes a word — "cleaning" — that your
125
+ user doesn't have. Show the concept before you name the feature.
126
+
127
+ — Built with a ≤4B planner, a 44M PII classifier, checksums, and a reference gazetteer.
128
+ Total model weight: under 4.1B parameters. Total cloud spend: about $35.
docs/GITTABLES_AUDIT.md ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GitTables N=250 audit — trust contract at scale
2
+
3
+ Shipped pipeline over 239 real GitHub tables (Matelda GitTables-subsets,
4
+ Apache-2.0). IMPORTANT framing: this subset is a CLEAN LAKE (dirty == clean for
5
+ 238/239 tables), so the repair-F1 dimension is void and `macro_damage` is NOT
6
+ damage — it is an INTERVENTION-RATE upper bound (any semantic normalization the
7
+ pipeline performs counts against gold=input, including intended format parsing).
8
+ What this audit certifies: robustness (0 pipeline failures), schema validity
9
+ (239/239), and ZERO silent edits across 239 arbitrary real-world tables — the
10
+ trust contract at scale. The ~5.5% intervention rate (43 tables untouched) is
11
+ the conservative measure of how much the pipeline chooses to act on arbitrary
12
+ tables.
13
+
14
+ | metric | value |
15
+ |---|---|
16
+ | tables_audited | 239 |
17
+ | pipeline_failures | 0 |
18
+ | plan_valid | 239 |
19
+ | tables_with_silent_edits | 0 |
20
+ | tables_with_errors | 1 |
21
+ | macro_f1_on_errored | 0.0 |
22
+ | macro_damage | 0.055 |
23
+ | zero_damage_tables | 43 |
24
+ | seconds | 796.9 |
docs/PAIRED_BENCH.md ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Paired Bench — shipped system on every cell-aligned pair
2
+
3
+ Churn-neutral repairs metric + variant-class recall (VR); `seen` = source fed
4
+ the champion's training mix (flagged, not hidden).
5
+
6
+ | dataset | seen | rows×cols | errors | variant | F1 | precision | recall | VR | damage |
7
+ |---|---|---|---|---|---|---|---|---|---|
8
+ | dgov_2_10_budget_presentation_award_summary | | 16×6 | 9 | 9 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
9
+ | dgov_emergency_operating_center_tools | | 7×3 | 4 | 3 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
10
+ | dgov_illinois_obesity_by_county | | 102×5 | 17 | 17 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
11
+ | fodors_zagats | ✓ | 112×6 | 206 | 206 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0536 |
12
+ | rayyan | | 1000×11 | 948 | 171 | 0.0 | 0.0 | 0.0 | 0.0 | 0.1178 |
13
+ | zeroed_tax100k | | 20000×15 | 952 | 117 | 0.0 | 0.0 | 0.006 | 0.051 | 0.0822 |
14
+ | ed2_restaurants | | 20000×15 | 309 | 76 | 0.001 | 0.0 | 0.026 | 0.105 | 0.0718 |
15
+ | dblp_acm | | 2224×4 | 2128 | 2128 | 0.003 | 0.273 | 0.001 | 0.001 | 0.001 |
16
+ | cleanml_movie | ✓ | 9329×8 | 4779 | 8 | 0.008 | 0.019 | 0.005 | 0.0 | 0.0172 |
17
+ | dblp_scholar | | 2408×4 | 3099 | 3099 | 0.008 | 0.012 | 0.006 | 0.006 | 0.233 |
18
+ | tt_cn5wvwhh | | 8302×5 | 370 | 370 | 0.021 | 0.046 | 0.014 | 0.014 | 0.0025 |
19
+ | beers | ✓ | 2410×11 | 4362 | 693 | 0.026 | 0.042 | 0.019 | 0.117 | 0.0044 |
20
+ | dgov_mva_vehicle_sales_counts_by_month_for_ca | | 248×6 | 43 | 24 | 0.042 | 0.2 | 0.023 | 0.042 | 0.0 |
21
+ | zeroed_billionaire | | 2614×22 | 5248 | 1146 | 0.103 | 0.232 | 0.067 | 0.305 | 0.0042 |
22
+ | dgov_field_listings | | 122×20 | 317 | 250 | 0.106 | 0.133 | 0.088 | 0.112 | 0.0523 |
23
+ | flights | | 2376×7 | 4920 | 1049 | 0.164 | 0.265 | 0.119 | 0.247 | 0.0839 |
24
+ | dgov_grocery_stores_2013 | | 506×17 | 420 | 332 | 0.21 | 0.265 | 0.174 | 0.193 | 0.0192 |
25
+ | cleanml_company | ✓ | 20000×9 | 65 | 65 | 0.243 | 0.147 | 0.708 | 0.708 | 0.0015 |
26
+ | dgov_median_household_income | | 174×19 | 138 | 83 | 0.25 | 0.579 | 0.159 | 0.265 | 0.0 |
27
+ | hospital | ✓ | 1000×20 | 509 | 379 | 0.258 | 0.169 | 0.542 | 0.607 | 0.0662 |
28
+ | dgov_louisville_metro_ky_inspection_results_p | | 521×18 | 1126 | 1044 | 0.31 | 0.933 | 0.186 | 0.2 | 0.0002 |
29
+ | dgov_la_county_covid_cases | | 975×14 | 579 | 579 | 0.34 | 0.983 | 0.206 | 0.206 | 0.0 |
30
+ | dgov_allegheny_county_tobacco_vendors | | 1248×12 | 2392 | 2109 | 0.343 | 0.882 | 0.213 | 0.242 | 0.0008 |
31
+ | dgov_legislative_bridge_names | | 252×16 | 415 | 396 | 0.358 | 0.614 | 0.253 | 0.265 | 0.0091 |
32
+ | tt_co23z7go | | 15477×4 | 33542 | 33542 | 0.36 | 0.929 | 0.223 | 0.223 | 0.0004 |
33
+ | dgov_louisville_metro_ky_permitted_hotels_and | | 131×13 | 191 | 182 | 0.424 | 0.898 | 0.277 | 0.291 | 0.0007 |
34
+ | dgov_health_conditions_among_children_under_a | | 2744×16 | 2900 | 2844 | 0.426 | 0.357 | 0.528 | 0.539 | 0.0569 |
35
+ | gidcl_imdb | ✓ | 20000×6 | 13320 | 7890 | 0.438 | 0.489 | 0.396 | 0.669 | 0.0297 |
36
+ | tt_uma1dnf6 | | 8302×5 | 5080 | 5080 | 0.442 | 0.911 | 0.292 | 0.292 | 0.0026 |
37
+ | dgov_medicare_part_d_opioid_prescribing_rates | | 677×17 | 547 | 547 | 0.447 | 0.775 | 0.314 | 0.314 | 0.0026 |
38
+ | dgov_access_control | | 4928×13 | 4180 | 4161 | 0.551 | 0.933 | 0.391 | 0.392 | 0.0 |
39
+ | dgov_3_09_census_acs_post_secondary_education | | 53×17 | 82 | 82 | 0.552 | 0.941 | 0.39 | 0.39 | 0.0 |
40
+ | dgov_305b_assessed_lake_2020 | | 182×23 | 442 | 424 | 0.556 | 0.766 | 0.437 | 0.455 | 0.0139 |
41
+ | dgov_ah_provisional_diabetes_death_counts_for | | 226×16 | 142 | 141 | 0.571 | 0.951 | 0.408 | 0.411 | 0.0 |
42
+ | dgov_jefferson_county_ky_post_offices | | 32×9 | 26 | 26 | 0.651 | 0.824 | 0.538 | 0.538 | 0.0115 |
43
+ | dgov_national_obesity_by_state_1 | | 52×5 | 13 | 13 | 0.7 | 1.0 | 0.538 | 0.538 | 0.0 |
44
+ | movies_1 | ✓ | 7390×17 | 7006 | 5567 | 0.705 | 0.639 | 0.786 | 0.989 | 0.0226 |
45
+ | tt_3n6s2fcx | | 9396×3 | 9510 | 9510 | 0.955 | 0.998 | 0.916 | 0.916 | 0.0 |
46
+ | tt_2zwsmotj | | 10855×3 | 10977 | 10977 | 0.956 | 0.997 | 0.918 | 0.918 | 0.0 |
47
+ | tt_8yinkydr | | 14008×3 | 14188 | 14188 | 0.956 | 0.997 | 0.918 | 0.918 | 0.0 |
48
+ | tt_dvnkv0xu | | 15477×4 | 15676 | 15676 | 0.956 | 0.997 | 0.919 | 0.919 | 0.0 |
49
+ | tt_00e2h310 | | 12285×3 | 12433 | 12433 | 0.957 | 0.998 | 0.919 | 0.919 | 0.0 |
docs/PAPER.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ > **SUPERSEDED SCAFFOLD (2026-06-12).** The paper was reframed; current title:
2
+ > "Verified Cleaning Plans: Plan-Level Selective Prediction Turns Local LLM
3
+ > Planners into Trustworthy Table Cleaners". This file is the original outline,
4
+ > kept for history. The live paper is docs/paper/main.tex.
5
+
6
+ # ScrubData — paper scaffold & related-work map
7
+
8
+ **Working title:** *Small fine-tuned planners with execution-verified data and calibrated
9
+ abstention match larger models on tabular canonicalization.*
10
+
11
+ **One-line claim (measured):** a ≤4B fine-tune that emits a *cleaning plan* (not edited cells)
12
+ reaches `canon_f1 0.86` on alias-level canonicalization vs `0.45` for a large generic model and
13
+ `0.13` for a rule heuristic — and, with reference grounding + calibrated abstention, beats the
14
+ tool people actually use (OpenRefine) on a wide validation suite at far lower damage.
15
+
16
+ ## Contributions (the combination is the novelty — not "LLM cleans data")
17
+ 1. **Planner/executor decomposition.** The model proposes a structured JSON plan; deterministic
18
+ pandas executes it. Auditable, reversible, **no silent edits** (`observability.py`,
19
+ `trace.py`). This is the trust/monitorability contract.
20
+ 2. **Execution-self-verified synthetic SFT.** Every training example's plan is checked to
21
+ actually recover the known-clean original by *running the executor* (`training/build_dataset.py`).
22
+ A clean, citable data-generation method (drops non-recovering examples).
23
+ 3. **Reference grounding + calibrated abstention.** Canonicalization is reconciled against a
24
+ type-scoped taxonomy (GeoNames/pycountry; `reconcile.py`, `grounded.py`); the system ABSTAINS
25
+ under ambiguity instead of hallucinating a canonical (`eval/calibration.py`: risk-coverage +
26
+ ECE). Structural fix for the over-correction larger models also exhibit.
27
+ 4. **Aggregation + column-batching.** Prompt size scales with *distinct values*, not rows
28
+ (`profiler.py` value_counts + `model_planner.make_batched_planner`).
29
+
30
+ ## Related work (position against — reviewers know this field)
31
+ - **Error detection/repair:** Raha & Baran (Mahdavi et al.), HoloClean (Rekatsinas et al. 2017,
32
+ `arXiv 1702.00820`), GARF — we *use* their hospital/beers/flights/rayyan as OOD eval and cite
33
+ GARF as the frequency-only baseline our grounding beats (it cannot supply a canonical for a lone
34
+ column).
35
+ - **LLMs for data wrangling:** "Can Foundation Models Wrangle Your Data?" (Narayan et al. 2022),
36
+ Jellyfish, Table-GPT/TableLlama (`2311.09206`), RetClean (`2303.16909`). We differ by being a
37
+ *small fine-tuned planner* + grounding + abstain, not a large zero-shot value-editor.
38
+ - **Grounding / entity disambiguation:** RACOON (`2409.14556`), TURL (`2006.14806`), Belotti et al.
39
+ table-EL (`2408.06423`), MTab — motivate retrieval-then-abstain and warn against memorizing
40
+ canonicals into weights (TURL ~40% OOD collapse). See `taxonomy-grounding.md`.
41
+ - **The tool we beat:** **OpenRefine** clustering — fingerprint (key collision) + nearest-neighbor
42
+ (kNN/edit-distance), reimplemented as `scrubdata/baselines.py` for head-to-head.
43
+ - **Selective prediction:** calibrated abstention / risk-coverage (El-Yaniv & Wiener; Geifman &
44
+ El-Yaniv) — our ECE/AURC study; also the AI-safety monitorability framing.
45
+
46
+ ## Experiments
47
+ - **Headline:** canon_f1 vs large-generic vs heuristic on frozen synthetic gold (Layer 1).
48
+ - **Wide north-star (`eval/run_real_multi.py`):** double-macro (error-type × domain) F1 + damage +
49
+ abstain over Raha real-error sets **+ seeded error-injection** on 20+ harvested gov/GitHub clean
50
+ domains (`eval/inject.py`); multi-seed 95% CIs. Hospital is 1 dataset of many.
51
+ - **Money result:** grounded vs OpenRefine fingerprint & kNN on the same suite (grounded wins F1 +
52
+ damage; kNN over-merges — higher recall, low precision, high damage).
53
+ - **Calibration (`eval/calibration.py`):** risk-coverage, AURC, ECE; operating point for ≥95%
54
+ precision via the abstain threshold.
55
+ - **Ablations to add:** −grounding, −abstain, −execution-verification, −aggregation.
56
+
57
+ ## Honest limitations (the integrity reviewers reward)
58
+ - Reference *coverage* is the recall ceiling (Belotti) — uncovered entities abstain by design.
59
+ - Convention vs error: standardization (date→ISO, `%`→fraction) is product value, not damage —
60
+ the metric is case/whitespace-normalized but a format-aware variant is future work.
61
+ - ECE shows mild over-confidence (difflib-ratio scores) — temperature/Platt scaling is future work.
62
+ - Some benchmark sources are gated (CleanML/TableEG behind Dropbox/Drive; licenses noted).
63
+
64
+ ## To-do before submission
65
+ multi-seed CIs (running) · −ablations · OpenRefine table with CIs · cs.DB endorser (primary cs.DB, cross-list cs.CL+cs.LG; endorser targets = the data-cleaning authors we cite) ·
66
+ selective-prediction figure · keep the eval README's convention-vs-error honesty.
docs/SCALING_ARM.md ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # W1.c — ≤32B Zero-Label Repair Scaling Arm (multi-family, zero-shot)
2
+
3
+ First scaling measurement for the verified-union planner: vanilla (NOT fine-tuned)
4
+ 20–31B open-weights models dropped into the EXACT hospital pipeline the 4B fine-tune
5
+ gate used — batched raw planner (batch_size=4, same `scrubdata/prompt.py` contract,
6
+ temperature 0) → `verify_plan(tau=0.5)` → union with the grounded heuristic
7
+ (`mock_plan`). Scored against hospital's 509 real errors with the
8
+ `eval/precision_curve.py` repairs-only churn-neutral protocol. Protocol parity was
9
+ verified by re-scoring the captured v6 plan through the same scorer: it reproduces the
10
+ prior gate numbers exactly (gated 0.993/0.287, union 0.905/0.413).
11
+
12
+ Disclosure: ≤32B open-weights models measured via hosted inference for speed; all are
13
+ locally deployable in principle.
14
+
15
+ | model | params (B) | family | gated P @ C | union P @ C | validity | kept/dropped | runtime (s) |
16
+ |---|---|---|---|---|---|---|---|
17
+ | scrubdata-ft-v6 (Qwen3-4B fine-tune) | 4 | qwen3 (fine-tuned) | **0.993** @ 0.287 | 0.905 @ 0.413 | — | 132/38 | — (prior measurement) |
18
+ | gpt-oss:20b | 20 | openai/gpt-oss | 1.0 @ 0.000* | 0.845 @ 0.257* | 0.0 | 0/0 | 360 |
19
+ | devstral-small-2:24b | 24 | mistral/devstral | 0.943 @ 0.426 | 0.915 @ **0.485** | 1.0 | 208/87 | 135 |
20
+ | nemotron-3-nano:30b | 30 | nvidia/nemotron | 1.0 @ 0.138 | 0.877 @ 0.336 | 0.4 | 63/6 | 114 |
21
+ | gemma4:31b | 31 | google/gemma | 0.943 @ 0.426 | **0.915 @ 0.485** | 1.0 | 209/28 | 104 |
22
+
23
+ \* gpt-oss:20b is a serving-path failure, not a measured capability: the model
24
+ generated ~4.8k tokens per planning call (`done_reason=stop`) but the Ollama Cloud
25
+ proxy returned empty `content` and empty `thinking` on all 5 calls at both
26
+ num_predict=4000 and 8000 (simple prompts work) — its "gated" point is the degenerate
27
+ empty plan and its "union" point is the heuristic backstop alone. nemotron-3-nano
28
+ produced valid JSON on only 2/5 batch calls at num_predict=8000 (long-thinking
29
+ truncation); validity is part of the measurement.
30
+
31
+ **Interpretation.** Zero-shot capability at 24–31B does close the gap to the 4B
32
+ fine-tune — and slightly exceeds it — inside the same verifier harness: devstral-24B and
33
+ gemma4-31B both land at union 0.915 precision @ 0.485 coverage vs the fine-tune's
34
+ 0.905 @ 0.413, though the fine-tune remains the most precise gated planner
35
+ (0.993 vs 0.943) and the only ≤4B point, while two of the four bigger families
36
+ (gpt-oss, nemotron) fail on plan-schema validity before capability even gets
37
+ measured. Gemma4-31B is the best family on balance: same gate point as devstral but
38
+ cleaner raw plans (verifier dropped 28 entries vs devstral's 87 — vs 38 for the 4B
39
+ fine-tune) and the fastest wall-clock (104s). The union still dominates everywhere:
40
+ every model's union point adds coverage over its gated point at gate-passing
41
+ precision, and it floors even the broken planners (nemotron 0.877 @ 0.336) because
42
+ the grounded heuristic covers whatever the model misses.
43
+
44
+ Artifacts: `eval/results/scaling_arm.json` (rows + provenance),
45
+ `eval/results/scaling_<model>_hospital_raw_plan.json` (captured raw plans),
46
+ runner: `eval/scaling_arm.py`.
docs/TOOL_REFERENCE.md ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ScrubData — The Profound Tool Reference
2
+
3
+ > The single local document that explains the whole system: what it is, why every
4
+ > piece exists, where every number comes from, and what we learned building it.
5
+ > Written at the close of the research domain (2026-06-12). The paper
6
+ > (`docs/paper/main.tex`) is the citable account; THIS file is the operational one.
7
+
8
+ ---
9
+
10
+ ## 1. What ScrubData is
11
+
12
+ ScrubData is a **zero-config, zero-label, local** tabular data-cleaning system built
13
+ around one architectural commitment: **the model never touches data**.
14
+
15
+ A profiler aggregates each column into a bounded value-frequency profile; a small
16
+ (≤4B, locally-run) fine-tuned planner *proposes* a JSON cleaning plan; a
17
+ deterministic pandas executor *applies* it. The plan is the complete, inspectable,
18
+ reversible specification of every change. Three consequences define the product:
19
+
20
+ 1. **No silent edits by construction** — every changed cell traces to a named,
21
+ logged operation (verified at scale: 0 silent edits across 35 wild tables and a
22
+ 239-table GitTables trust audit).
23
+ 2. **Abstention is first-class** — anything below the confidence threshold becomes a review flag
24
+ ("YOUR CALL" card in the UI), never a quiet skip and never a guess.
25
+ 3. **Profile-not-rows scaling** — the prompt scales with *distinct values*, not
26
+ rows; a million-row table profiles like a hundred-row one, and no cell values
27
+ leave the machine.
28
+
29
+ ### The central finding (load-bearing, repeatedly measured)
30
+
31
+ **Model weights contribute approximately nothing to never-seen-table
32
+ generalization in this protocol class.** Five SFT retrains (v7–v10 + mixes, 109k
33
+ harvested real alias pairs) and a three-arm GRPO pilot (executor as verifiable
34
+ reward, including a random-reward control that reproduced the same format drift)
35
+ all failed to move held-out generalization. Every measured gain came from
36
+ **deterministic machinery gated by the plan-level verifier** (§5). Corroborated
37
+ independently by Spreadsheet-RL, arXiv:2601.05009, and arXiv:2606.02866.
38
+ Practical corollary: *to improve ScrubData, write a deterministic capability and
39
+ gate it with the verifier; do not collect more training data.*
40
+
41
+ ---
42
+
43
+ ## 2. The shipped pipeline (`scrubdata/active.py::get_planner`)
44
+
45
+ ```
46
+ ┌──────────────────────────────────────────────┐
47
+ df ──► profiler ──► │ model path (only if SCRUBDATA_MODEL is set) │
48
+ (bounded │ batched (4 cols/call) local Ollama planner │
49
+ profile incl. │ → per-batch fallback to heuristic on error │
50
+ suspects) │ → grounded (reference taxonomies, RACOON) │
51
+ │ → verify_plan(tau=SCRUBDATA_TAU, def 0.5) │
52
+ └───────────────┬──────────────────────────────┘
53
+ │ union_plans (model wins per surface;
54
+ │ inherits deterministic ops + table ops)
55
+ heuristic mock_plan ───────────┘
56
+
57
+ executor.apply_plan → (clean_df, change_log)
58
+
59
+ report.render_report · trace.log_run · observability
60
+ ```
61
+
62
+ - **No model configured** → `mock_plan` (grounded deterministic heuristic) alone.
63
+ The app always produces a plan; the model is an upgrade, never a dependency.
64
+ - **Measured operating point** (hospital, 509 real errors): union **0.905
65
+ precision @ 0.413 coverage**; gated model alone 0.993 @ 0.287; 3-seed
66
+ 0.891±0.012 @ 0.396±0.025. Precision flat 0.89–0.91 for τ∈[0.2,0.8].
67
+
68
+ Entry points: `uv run server.py` (FastAPI + UI), `app.py` (HF Space/Gradio),
69
+ `scrubdata/cli.py` (`scrubdata <file.csv> -o out.csv --report r.md --plan p.json`).
70
+
71
+ ### Environment variables
72
+
73
+ | Var | Default | Meaning |
74
+ |---|---|---|
75
+ | `SCRUBDATA_MODEL` | unset | local Ollama model id (e.g. `scrubdata-ft-v6`); unset = heuristic only |
76
+ | `SCRUBDATA_TAU` | `0.5` | per-entry verifier threshold on model mappings |
77
+ | `SCRUBDATA_HC_TAU` | `0.8` | stricter bar for heuristic suspect-mappings (no model cross-check there) |
78
+ | `SCRUBDATA_PAIR_PROFILES` | off | WS2 candidate-constrained planning (measured redundant with verifier; off by default) |
79
+ | `SCRUBDATA_PII_NER` | off | OpenMed-PII 44M NER tier on top of deterministic validators |
80
+
81
+ ---
82
+
83
+ ## 3. Module map (`scrubdata/`)
84
+
85
+ | Module | Role | Key facts |
86
+ |---|---|---|
87
+ | `profiler.py` | column → bounded profile | `VALUE_COUNTS_CAP=80` (high-card cols: top-8 only) + `suspect_values` section (the visibility fix); `truncated_values` count keeps honesty about what's hidden |
88
+ | `detect.py` | typing + issue predicates | `detect_semantic_type` (zip/ZCTA/Excel-serial guards), `date_formats_consistent` (collapses digit AND alpha runs; 90% dominant-shape), `percent_formats_consistent` (90%), `has_mojibake`, `is_missing` |
89
+ | `planner.py` | deterministic heuristic planner | `mock_plan`, `_column_operations`, `_suspect_canonicalize` (τ_hc=0.8), `detect_entity_groups` (cross-row voting detection), emits `fix_encoding` BEFORE `strip_whitespace` (order-critical), `off_convention_dates` visible-abstention flags |
90
+ | `executor.py` | the only thing that touches cells | op dispatch (§4); unknown ops are no-ops (forward-compatible); returns `(df, change_log)`; `resolve_by_majority` table op lives here |
91
+ | `verifier.py` | WS1 selective prediction | `entry_confidence` (3 hard gates, §5.0), `verify_plan` (also enforces convention gates on MODEL-emitted parse_date/parse_percent — the model path otherwise bypasses them), `union_plans` (order-preserving op inheritance via `reversed(inherit)`) |
92
+ | `reconcile.py` | reference grounding | `ReferenceIndex`, `default_index()` loads toughtables_ref (contamination-guarded: excludes the 8 benchmark tables) + MusicBrainz hints + Wikidata companies + ROR; `infer_reference_type` needs **≥20% exact entity hits** (over-fire guard); falls back to `training/harvests/` for Space/clone parity |
93
+ | `grounded.py` | RACOON wrapper | model never free-generates a canonical for a reference-typed column |
94
+ | `pair_profile.py` | suspects + WS2 candidates | `suspects_for_column` (≤25/col, bounded: 4k rare cap + cheap prefilters before SequenceMatcher — 40min→24s fix), `candidate_pairs`, `constrain_plan` |
95
+ | `model_planner.py` | Ollama backends | `make_local_ollama_planner`, `make_batched_planner(batch_size=4)`, JSON extraction |
96
+ | `prompt.py` | prompt/training contract | `_profile_for_prompt` (compact suspects), `build_chat_example` (training-data side of the same contract — change one, regenerate the other) |
97
+ | `pii.py` | PII second task | deterministic validators (Luhn, IBAN, phone) + allowlist + coverage vote; optional 44M NER; `mask/hash/pseudonymize` |
98
+ | `active.py` | THE composition | `get_planner()` — §2 |
99
+ | `cli.py` / `report.py` / `trace.py` / `observability.py` | UX + audit | CLI, markdown report, JSONL traces, monitor summary/OTel span |
100
+ | `baselines.py` | OpenRefine kNN/fingerprint reimplementations | the zero-config comparison class |
101
+ | `refdata/cities.txt` | seed gazetteer | plus everything in `training/harvests/*.jsonl` |
102
+
103
+ ---
104
+
105
+ ## 4. Operation vocabulary (the executor's closed set)
106
+
107
+ **Column ops** (`_apply_column_op`): `strip_whitespace`, `normalize_punctuation`,
108
+ `fix_encoding` (lossless cp1252/latin-1↔utf8 round-trip, mojibake-marker-reduction
109
+ gated), `normalize_disguised_nulls`, `parse_currency`, `parse_number`,
110
+ `parse_percent` (abstains on bare values — no /100 corruption),
111
+ `parse_date`, `standardize_boolean`, `standardize_phone` (7-digit → `DDD-DDDD`),
112
+ `normalize_email`, `standardize_case`, `canonicalize_categories` (mapping-driven;
113
+ the verifier's subject), `flag_pii` (log-only), `mask_pii`, `hash_pii`,
114
+ `pseudonymize_pii`. Unknown op → no-op.
115
+
116
+ **Table ops**: `drop_empty_columns`, `drop_empty_rows`, `drop_exact_duplicates`,
117
+ `resolve_by_majority` (§5.3).
118
+
119
+ Op-order invariant: **`fix_encoding` must precede whitespace/punctuation ops** —
120
+ they destroy the UTF-8 byte patterns that repair needs (grader-reproduced bug; fixed in
121
+ both heuristic emission and union inheritance).
122
+
123
+ ---
124
+
125
+ ## 5. The five deterministic capabilities (what actually generalizes)
126
+
127
+ ### 5.0 Plan-level verifier (WS1) — `verifier.entry_confidence`
128
+ Every non-grounded `canonicalize_categories` entry `raw→canon` is scored with
129
+ three HARD gates, each killing a measured hospital failure class:
130
+ - **errors are rare**: `freq(raw) ≥ 3` → 0.0 (frequent = legit data; "de kalb"×92)
131
+ - **repair to dominance only**: `freq(canon) < max(2, 2·freq(raw))` → 0.0
132
+ ("yex→yexu", typo mapped to a worse typo)
133
+ - **code discipline**: digit-bearing values repair only if letter-part similarity
134
+ ≥0.85 AND digits identical (allows `amix-2→ami-2`, blocks `ak_→al_`)
135
+ Survivors score `sim × (0.5 + 0.5·support)`; below-τ entries become review flags.
136
+
137
+ ### 5.1 Suspect surfacing (visibility) — `pair_profile.suspects_for_column`
138
+ The 80-value profile cap structurally hides high-cardinality dirty cells from ANY
139
+ planner (proved by the v8/v9 retrains: more data couldn't fix what the model
140
+ couldn't see). Every text-ish column profile now carries ≤25 `suspect_values`:
141
+ rare surfaces + evidence-backed candidates (frequency dominance, edit similarity,
142
+ reference membership). The heuristic maps suspects clearing `entry_confidence ≥
143
+ SCRUBDATA_HC_TAU=0.8`; the rest become flags.
144
+
145
+ ### 5.2 Generic entity reference — `reconcile.default_index`
146
+ Open vocabularies (ToughTables-derived ref [8 bench tables excluded], MusicBrainz
147
+ search-hint misspellings, RxNorm, Wikidata companies, ROR, GeoNames, OpenFlights,
148
+ O*NET, nicknames) as a pluggable reference type. Typing requires **≥20% exact
149
+ hits** of distinct values (fuzzy coverage alone over-fires on name-like columns —
150
+ measured). Cracked the all-unique regime: 5 ToughTables tables **0 → 0.955–0.957
151
+ F1 at 0.0000 damage** (~62k corrections) — where no in-column frequency signal
152
+ exists at all.
153
+
154
+ ### 5.3 Cross-row majority voting — `planner.detect_entity_groups` + `resolve_by_majority`
155
+ Tables repeating a real-world entity across rows (flights reported by many
156
+ sources) carry their own repair signal. Detection: compact-token key columns,
157
+ median multiplicity 3–30, ≥2 votable string columns with majority-bearing
158
+ disagreement + ≥2 distinct majorities, date-share ≤0.3 guard. Execution: resolve
159
+ thin dissenting minorities to group majority; skips missing-like keys;
160
+ min_share/min_group clamped. **False-consensus guard**: mean minority share ≥0.25
161
+ → decline (legitimate correlated updates, not reporting errors — a flat volume cap
162
+ was measured to destroy the legitimate regime and was replaced by this guard). Measured: flights
163
+ heuristic 0.044→**0.164** F1; hospital heuristic 0.092→**0.186**.
164
+
165
+ ### 5.4 Convention conservatism — `detect.*_formats_consistent` + `verify_plan`
166
+ Never re-format an internally consistent column: date/percent ops gated on
167
+ dominant-shape inconsistency (digit+alpha runs collapsed, 90% rule); zip/postal
168
+ names never typed phone/date; Excel-serial typing needs a date-suggestive name.
169
+ Suppressed minorities surface as `off_convention_dates` flags. The verifier
170
+ enforces the same gates on model plans at the verification boundary (the model
171
+ path otherwise bypasses heuristic emission gates entirely).
172
+
173
+ ---
174
+
175
+ ## 6. Evaluation (how every number regenerates)
176
+
177
+ One scoring contract — `eval/run_real_multi.py::score()` — **churn-neutral,
178
+ convention-tolerant**: sem-equal = numeric-tolerant OR strip+casefold equal; pure
179
+ case/whitespace churn counts as nothing; a fix requires acting; **damage** =
180
+ clean cells corrupted / clean cells; **silent edits** = changed columns minus
181
+ log-attributed columns (must be 0).
182
+
183
+ | Harness | Command | What it measures | Current numbers |
184
+ |---|---|---|---|
185
+ | Money table | `python -m eval.run_real_multi` | 65-set suite, 3 seeds | grounded NORTH 0.224±0.004; REAL-F1 0.225 vs OR-kNN 0.058 (HEAD 2026-06-12 regen; freeze was 0.203/0.174) |
186
+ | WS1 gate | `python -m eval.precision_curve --plan eval/results/v6_hospital_raw_plan.json --union` | precision–coverage curve | **0.905 @ 0.413** (τ=0.5) |
187
+ | Paired bench | `python -m eval.paired_bench` | 42 dirty/gold pairs | unseen-35 macro F1 **0.363** @ dmg **0.0219** |
188
+ | Wild bench | `python -m eval.wild_bench` | 35 uncurated tables, behavioral + inject-recovery | recovery 0.207; **0 silent edits** |
189
+ | Trust audit | `python -m eval.gittables_audit` | 239 GitTables clean-lake | 239/239 valid, 0 crashes, 0 silent edits |
190
+ | Generalization | `python -m eval.generalization` | held-out-source (train: hospital/beers/movies_1 · eval: flights/rayyan/ed2) | GEN-F1 0.058, VR 0.108, dmg 0.036 |
191
+ | RADAR board | `python -m eval.radar_bench` | regime boundaries by artifact type | abstains on missingness ✓; reasoning-class = frontier territory |
192
+ | Baselines | `eval/run_baran.py`, `modal run scripts/modal_jellyfish.py` | disclosed-protocol comparisons | Baran (oracle+20 labels) 0.811; Jellyfish-13B 0.074 |
193
+ | Calibration / PII | `eval.calibration`, `eval.pii_leak` | abstention quality / leak test | AURC 0.120, ECE 0.169; 0/360 residual PII |
194
+
195
+ **Eval-source discipline**: TRAIN_SOURCES["v6"]={hospital,beers,movies_1};
196
+ EVAL_SOURCES={flights,rayyan,ed2_restaurants}. Never crossed.
197
+
198
+ ---
199
+
200
+ ## 7. Model & artifacts
201
+
202
+ | Artifact | Where | Notes |
203
+ |---|---|---|
204
+ | Champion adapter | Modal volume `scrubdata-v5-adapter` `/v5_seed21` (= "v6") | survived v7–v10 challenges + GRPO |
205
+ | Merged model | `hf.co/ricalanis/scrubdata-qwen3-4b` | card carries the v2 finding |
206
+ | Q8 GGUF | `hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8` | **Q8_0 only — Q4_K_M corrupts** (Unsloth 2026.6.x); non-thinking Modelfile required (`notebooks/Modelfile`); suppress tokens 151657/151658 under transformers |
207
+ | Benchmark | `hf.co/datasets/ricalanis/wildclean` | 33 redistributable pairs + loaders.py for 9 license-gated + gittables250 + 10 vocabs + frozen results; first cleaning bench with damage + silent-edit accounting |
208
+ | Demo | `hf.co/spaces/build-small-hackathon/scrubdata` | deploy = `HfApi.upload_folder` of `git archive HEAD` — **NO GitHub auto-sync** |
209
+ | Paper | `docs/paper/main.tex` + `numbers.tex` | compile: `~/.local/bin/tectonic main.tex` (no pdflatex on this machine) |
210
+ | Vocabs | `training/harvests/*.jsonl` (15MB, 13 files) | loader falls back here for clone parity |
211
+
212
+ Modal patterns: `--detach` for anything long; results land in Modal Dicts
213
+ (`scrubdata-train-results`, `scrubdata-eval-v5-results`, `scrubdata-suite-results`).
214
+ **Budget status at domain close: ~$187 of $212 ceiling — Modal HALTED.**
215
+
216
+ ---
217
+
218
+ ## 8. Negative results ledger (measured, do not re-litigate)
219
+
220
+ 1. **v7–v10 SFT retrains**: 109k harvested alias pairs, episode mixes, suspects
221
+ contract — GEN flat/worse. Mixing harvested pairs **dilutes** executor-verified
222
+ synthetic skill (monotonic dilution law across mix ratios; mixH 0.677).
223
+ 2. **GRPO pilot, 3 arms** (main, KL-anchored v2, random-reward control): all
224
+ degrade format at 4B/LoRA/$30 scale; the control proved the drift is an RL
225
+ artifact (cf. "Spurious Rewards"). Published RLVR wins used real infra
226
+ (verl, 4×H100×40h). Episodes corpus (600, `training/build_grpo_episodes.py`) +
227
+ hand-rolled loop (`scripts/modal_grpo.py`) committed for a future attempt.
228
+ 3. **Uniform verification of existing low-card mappings** (A1 per-class
229
+ thresholds): 0.905→0.890 — reverted.
230
+ 4. **Strict entity-typing thresholds** (0.90/0.05): cost more than they bought — reverted.
231
+ 5. **WS2 candidate constraining composed with verifier**: 0.876 @ 0.387 < union at
232
+ same τ — redundant gating of the same failure class; available, off by default.
233
+ 6. **Flat volume cap on cross-row voting**: destroyed the legitimate
234
+ dense-disagreement regime — replaced by the false-consensus guard.
235
+ 7. **Frozen-gold synthetic yardstick predates the suspects prompt contract** —
236
+ regenerate gold before ever quoting synthetic canon_f1 again.
237
+
238
+ ## 9. Known-open (graded non-blocking)
239
+
240
+ `_parse_date` per-value dayfirst; i18n name guards; mojibake fixpoint /
241
+ sequence-plausibility; backlog sources: CMS API, NHTSA, Canada contracts, Matelda
242
+ ~6,670 pairs, GLEIF/USDA vocabs, WDVC-16. Reasoning-class artifacts (RADAR) are
243
+ explicitly out of protocol class — frontier-model territory.
244
+
245
+ ## 10. Where deeper detail lives
246
+
247
+ `PRODUCT.md` (trust contract) · `docs/SOTA.md` + `docs/ROADMAP_SOTA2.md`
248
+ (position + research map) · `docs/CAPABILITY_GRADES.md` (12-agent adversarial
249
+ grading + must-fix ledger) · `docs/WILD_BENCH.md` / `docs/PAIRED_BENCH.md` /
250
+ `docs/GITTABLES_AUDIT.md` / `docs/DATASETS.md` (per-bench detail + licenses) ·
251
+ `docs/NIGHT_LOG.md` (stage-3 timeline) · `project-memory/` (agent memory snapshot).
docs/WILD_BENCH.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Wild Bench — can the shipped system clean real-world tables?
2
+
3
+ Behavioral audit + seeded inject-recovery per dataset (eval/wild_bench.py).
4
+
5
+ | dataset | domain | rows×cols | valid | changes | flags | PII | silent | typo | ocr | case | ws | mean |
6
+ |---|---|---|---|---|---|---|---|---|---|---|---|---|
7
+ | airlines | aviation | 56×8 | ✓ | 413 | 1 | 1 | 0 | — | — | — | — | — |
8
+ | billboard | music-billboard | 317×83 | ✓ | 36222 | 3 | 2 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
9
+ | acnc_charities | nonprofits-au | 800×69 | ✓ | 43268 | 4 | 1 | 0 | 0.00 | 0.00 | 0.01 | 0.01 | 0.01 |
10
+ | open_food_facts | food-products | 800×211 | ✓ | 27115 | 34 | 5 | 0 | 0.02 | 0.02 | 0.02 | 0.03 | 0.02 |
11
+ | biz_sf | sf-business | 800×38 | ✓ | 8060 | 12 | 1 | 0 | 0.02 | 0.05 | 0.02 | 0.07 | 0.04 |
12
+ | irs_eo1 | nonprofits-us | 800×28 | ✓ | 16953 | 5 | 3 | 0 | 0.04 | 0.03 | 0.03 | 0.15 | 0.06 |
13
+ | permits_nyc | construction | 800×60 | ✓ | 16762 | 25 | 3 | 0 | 0.03 | 0.04 | 0.04 | 0.13 | 0.06 |
14
+ | pawnbrokers_nyc | business | 800×31 | ✓ | 8494 | 8 | 2 | 0 | 0.06 | 0.08 | 0.05 | 0.11 | 0.08 |
15
+ | proptax_sf | real-estate | 800×46 | ✓ | 9302 | 3 | 3 | 0 | 0.06 | 0.06 | 0.07 | 0.12 | 0.08 |
16
+ | biz_chicago | business-licenses | 800×37 | ✓ | 12808 | 9 | 2 | 0 | 0.05 | 0.06 | 0.06 | 0.15 | 0.08 |
17
+ | permits_seattle | seattle-permits | 800×40 | ✓ | 6878 | 9 | 2 | 0 | 0.08 | 0.13 | 0.09 | 0.14 | 0.11 |
18
+ | restaurants_nyc | restaurants | 800×27 | ✓ | 7742 | 6 | 4 | 0 | 0.07 | 0.08 | 0.09 | 0.20 | 0.11 |
19
+ | titanic | passengers | 800×12 | ✓ | 5722 | 1 | 0 | 0 | 0.00 | 0.00 | 0.09 | 0.40 | 0.12 |
20
+ | biz_la | la-business | 800×16 | ✓ | 2726 | 9 | 3 | 0 | 0.15 | 0.09 | 0.10 | 0.21 | 0.14 |
21
+ | schools_nyc | education | 800×41 | ✓ | 14387 | 7 | 5 | 0 | 0.08 | 0.14 | 0.12 | 0.22 | 0.14 |
22
+ | online_retail | ecommerce-uk | 800×8 | ✓ | 3413 | 1 | 0 | 0 | 0.26 | 0.01 | 0.01 | 0.30 | 0.14 |
23
+ | film_nyc | film | 800×14 | ✓ | 3049 | 3 | 0 | 0 | 0.14 | 0.16 | 0.11 | 0.23 | 0.16 |
24
+ | salary_survey | survey | 800×18 | ✓ | 4142 | 5 | 0 | 0 | 0.12 | 0.20 | 0.13 | 0.26 | 0.18 |
25
+ | restaurants_sf | sf-restaurants | 800×22 | ✓ | 6002 | 6 | 2 | 0 | 0.15 | 0.15 | 0.16 | 0.26 | 0.18 |
26
+ | alcohol_tx | alcohol-bars | 800×24 | ✓ | 8518 | 9 | 1 | 0 | 0.14 | 0.09 | 0.17 | 0.38 | 0.20 |
27
+ | contractors_chi | contractors | 800×116 | ✓ | 20213 | 22 | 2 | 0 | 0.17 | 0.20 | 0.16 | 0.33 | 0.21 |
28
+ | fhv_nyc | transport | 800×23 | ✓ | 3789 | 4 | 2 | 0 | 0.10 | 0.30 | 0.14 | 0.36 | 0.23 |
29
+ | uk_price_paid | real-estate-uk | 800×16 | ✓ | 1662 | 8 | 0 | 0 | 0.14 | 0.17 | 0.26 | 0.42 | 0.25 |
30
+ | food_chicago | food-inspections | 800×17 | ✓ | 2790 | 6 | 0 | 0 | 0.17 | 0.25 | 0.23 | 0.38 | 0.26 |
31
+ | bx_books | books | 800×8 | ✓ | 1650 | 3 | 1 | 0 | 0.22 | 0.22 | 0.16 | 0.51 | 0.28 |
32
+ | bl_flickr_books | library | 800×15 | ✓ | 1769 | 6 | 1 | 0 | 0.19 | 0.28 | 0.22 | 0.43 | 0.28 |
33
+ | svc311_nyc | complaints | 800×44 | ✓ | 6299 | 16 | 2 | 0 | 0.23 | 0.30 | 0.23 | 0.37 | 0.28 |
34
+ | spotify | music | 800×23 | ✓ | 4669 | 3 | 1 | 0 | 0.20 | 0.28 | 0.30 | 0.36 | 0.28 |
35
+ | glassdoor_jobs | job-listings | 800×14 | ✓ | 1713 | 6 | 0 | 0 | 0.20 | 0.29 | 0.22 | 0.43 | 0.29 |
36
+ | ct_real_estate | real-estate-us | 800×14 | ✓ | 4840 | 4 | 0 | 0 | 0.23 | 0.29 | 0.24 | 0.40 | 0.29 |
37
+ | worldcities | geography | 800×4 | ✓ | 914 | 2 | 0 | 0 | 0.41 | 0.11 | 0.22 | 0.69 | 0.36 |
38
+ | fec_indiv80 | political-finance | 800×21 | ✓ | 4375 | 4 | 2 | 0 | 0.20 | 0.24 | 0.35 | 0.87 | 0.41 |
39
+ | payroll_nyc | jobs | 800×17 | ✓ | 4587 | 3 | 2 | 0 | 0.45 | 0.56 | 0.42 | 0.73 | 0.54 |
40
+ | paris_trees | urban-fr | 800×16 | ✓ | 3305 | 5 | 1 | 0 | 0.43 | 0.54 | 0.55 | 0.73 | 0.56 |
41
+ | ev_wa | vehicles | 800×16 | ✓ | 4085 | 5 | 2 | 0 | 0.50 | 0.56 | 0.48 | 0.91 | 0.61 |
docs/assets/space_landing.png ADDED

Git LFS Details

  • SHA256: 144649ae9a9d4546534d4a890239d4c8fb0ea2c46f8bece9fd577f91ce1685f4
  • Pointer size: 130 Bytes
  • Size of remote file: 72 kB
docs/assets/space_results.png ADDED

Git LFS Details

  • SHA256: 38c350045de7113f3a71dce1db32ba305003bef2eb1210af6ca2c4fa5ec19ae5
  • Pointer size: 131 Bytes
  • Size of remote file: 368 kB
docs/paper/fig_label_curve.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70a409349a89aa8681a6b0a4f47a68405a9391fad562bfb4b5a0de9ec573ab74
3
+ size 19327
docs/paper/fig_label_curve.png ADDED

Git LFS Details

  • SHA256: 1007f2590c88e79b80d24b0181c448c4dbc3f11fe69ec96be7fd5c945f8c8102
  • Pointer size: 130 Bytes
  • Size of remote file: 77.9 kB
docs/paper/fig_precision_coverage.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4d65c23406f21bcb82e8054cecc95d40ceb41cd08096726b85db5430cdae4a2
3
+ size 19440
docs/paper/fig_precision_coverage.png ADDED

Git LFS Details

  • SHA256: 02ecda628a984d5a27f7e270daf435156afaae6408f8c429f0f33d26fbaa9916
  • Pointer size: 130 Bytes
  • Size of remote file: 79.5 kB
docs/paper/fig_risk_coverage.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e11e0af912e0ce66da8c3f407732d62afe4c222fbc2af8e541467d4bf5f73bce
3
+ size 18227
docs/paper/fig_risk_coverage.png ADDED

Git LFS Details

  • SHA256: 9a0e46fd7f23d4224fac46a8045fc30040026bc774da0a58b4eb0f3a4ed9d6d6
  • Pointer size: 130 Bytes
  • Size of remote file: 59.2 kB
docs/paper/main.aux ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \relax
2
+ \@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{}\protected@file@percent }
3
+ \citation{raha}
4
+ \citation{holoclean}
5
+ \citation{garf}
6
+ \citation{wrangle}
7
+ \citation{jellyfish}
8
+ \citation{tablegpt}
9
+ \citation{retclean}
10
+ \citation{turl}
11
+ \citation{tablellama}
12
+ \citation{belotti}
13
+ \citation{racoon}
14
+ \citation{mtab}
15
+ \@writefile{toc}{\contentsline {section}{\numberline {2}Related Work}{2}{}\protected@file@percent }
16
+ \newlabel{sec:related}{{2}{2}}
17
+ \citation{selective}
18
+ \citation{openmed}
19
+ \@writefile{toc}{\contentsline {section}{\numberline {3}Method}{3}{}\protected@file@percent }
20
+ \newlabel{sec:method}{{3}{3}}
21
+ \@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Planner / executor decomposition}{3}{}\protected@file@percent }
22
+ \@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Execution-verified synthetic supervision}{3}{}\protected@file@percent }
23
+ \newlabel{sec:sft}{{3.2}{3}}
24
+ \@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Reference-grounded canonicalization with abstention}{3}{}\protected@file@percent }
25
+ \newlabel{sec:grounding}{{3.3}{3}}
26
+ \@writefile{toc}{\contentsline {subsection}{\numberline {3.4}PII as a second task instance}{4}{}\protected@file@percent }
27
+ \newlabel{sec:pii}{{3.4}{4}}
28
+ \@writefile{toc}{\contentsline {section}{\numberline {4}Evaluation Design}{4}{}\protected@file@percent }
29
+ \newlabel{sec:eval}{{4}{4}}
30
+ \@writefile{toc}{\contentsline {section}{\numberline {5}Results}{4}{}\protected@file@percent }
31
+ \newlabel{sec:results}{{5}{4}}
32
+ \@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Small fine-tuned planner vs.\ large generic model}{4}{}\protected@file@percent }
33
+ \@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Wide-suite comparison, 3 injection seeds, churn-neutral metric. NORTH is the double-macro harmonic mean; REAL-F1 is the real-error slice. (Filled from the final run.)}}{5}{}\protected@file@percent }
34
+ \newlabel{tab:money}{{1}{5}}
35
+ \@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Grounding vs.\ clustering}{5}{}\protected@file@percent }
36
+ \@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Ablations}{5}{}\protected@file@percent }
37
+ \@writefile{toc}{\contentsline {subsection}{\numberline {5.4}Calibration of abstention}{5}{}\protected@file@percent }
38
+ \newlabel{sec:calibration}{{5.4}{5}}
39
+ \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Risk--coverage for grounded city reconciliation (650 probes). Operating points annotated; the confidence supports thresholded abstention.}}{6}{}\protected@file@percent }
40
+ \newlabel{fig:rc}{{1}{6}}
41
+ \@writefile{toc}{\contentsline {subsection}{\numberline {5.5}PII transfer and detection}{6}{}\protected@file@percent }
42
+ \@writefile{toc}{\contentsline {section}{\numberline {6}Limitations}{6}{}\protected@file@percent }
43
+ \@writefile{toc}{\contentsline {section}{\numberline {7}Conclusion}{6}{}\protected@file@percent }
44
+ \bibcite{raha}{{1}{}{{}}{{}}}
45
+ \bibcite{holoclean}{{2}{}{{}}{{}}}
46
+ \bibcite{garf}{{3}{}{{}}{{}}}
47
+ \bibcite{wrangle}{{4}{}{{}}{{}}}
48
+ \bibcite{jellyfish}{{5}{}{{}}{{}}}
49
+ \bibcite{tablegpt}{{6}{}{{}}{{}}}
50
+ \bibcite{retclean}{{7}{}{{}}{{}}}
51
+ \bibcite{turl}{{8}{}{{}}{{}}}
52
+ \bibcite{tablellama}{{9}{}{{}}{{}}}
53
+ \bibcite{belotti}{{10}{}{{}}{{}}}
54
+ \bibcite{racoon}{{11}{}{{}}{{}}}
55
+ \bibcite{mtab}{{12}{}{{}}{{}}}
56
+ \bibcite{selective}{{13}{}{{}}{{}}}
57
+ \bibcite{openmed}{{14}{}{{}}{{}}}
58
+ \providecommand\NAT@force@numbers{}\NAT@force@numbers
59
+ \gdef \@abspage@last{7}
docs/paper/main.log ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **
2
+ (main.tex
3
+ LaTeX2e <2021-11-15> patch level 1
4
+ L3 programming layer <2022-02-24> (article.cls
5
+ Document Class: article 2021/10/04 v1.4n Standard LaTeX document class
6
+ (size11.clo
7
+ File: size11.clo 2021/10/04 v1.4n Standard LaTeX file (size option)
8
+ )
9
+ \c@part=\count181
10
+ \c@section=\count182
11
+ \c@subsection=\count183
12
+ \c@subsubsection=\count184
13
+ \c@paragraph=\count185
14
+ \c@subparagraph=\count186
15
+ \c@figure=\count187
16
+ \c@table=\count188
17
+ \abovecaptionskip=\skip47
18
+ \belowcaptionskip=\skip48
19
+ \bibindent=\dimen138
20
+ ) (geometry.sty
21
+ Package: geometry 2020/01/02 v5.9 Page Geometry
22
+ (keyval.sty
23
+ Package: keyval 2014/10/28 v1.15 key=value parser (DPC)
24
+ \KV@toks@=\toks16
25
+ ) (ifvtex.sty
26
+ Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead.
27
+ (iftex.sty
28
+ Package: iftex 2022/02/03 v1.0f TeX engine tests
29
+ ))
30
+ \Gm@cnth=\count189
31
+ \Gm@cntv=\count190
32
+ \c@Gm@tempcnt=\count191
33
+ \Gm@bindingoffset=\dimen139
34
+ \Gm@wd@mp=\dimen140
35
+ \Gm@odd@mp=\dimen141
36
+ \Gm@even@mp=\dimen142
37
+ \Gm@layoutwidth=\dimen143
38
+ \Gm@layoutheight=\dimen144
39
+ \Gm@layouthoffset=\dimen145
40
+ \Gm@layoutvoffset=\dimen146
41
+ \Gm@dimlist=\toks17
42
+ ) (amsmath.sty
43
+ Package: amsmath 2021/10/15 v2.17l AMS math features
44
+ \@mathmargin=\skip49
45
+ For additional information on amsmath, use the `?' option.
46
+ (amstext.sty
47
+ Package: amstext 2021/08/26 v2.01 AMS text
48
+ (amsgen.sty
49
+ File: amsgen.sty 1999/11/30 v2.0 generic functions
50
+ \@emptytoks=\toks18
51
+ \ex@=\dimen147
52
+ )) (amsbsy.sty
53
+ Package: amsbsy 1999/11/29 v1.2d Bold Symbols
54
+ \pmbraise@=\dimen148
55
+ ) (amsopn.sty
56
+ Package: amsopn 2021/08/26 v2.02 operator names
57
+ )
58
+ \inf@bad=\count192
59
+ LaTeX Info: Redefining \frac on input line 234.
60
+ \uproot@=\count193
61
+ \leftroot@=\count194
62
+ LaTeX Info: Redefining \overline on input line 399.
63
+ \classnum@=\count195
64
+ \DOTSCASE@=\count196
65
+ LaTeX Info: Redefining \ldots on input line 496.
66
+ LaTeX Info: Redefining \dots on input line 499.
67
+ LaTeX Info: Redefining \cdots on input line 620.
68
+ \Mathstrutbox@=\box50
69
+ \strutbox@=\box51
70
+ \big@size=\dimen149
71
+ LaTeX Font Info: Redeclaring font encoding OML on input line 743.
72
+ LaTeX Font Info: Redeclaring font encoding OMS on input line 744.
73
+ \macc@depth=\count197
74
+ \c@MaxMatrixCols=\count198
75
+ \dotsspace@=\muskip16
76
+ \c@parentequation=\count199
77
+ \dspbrk@lvl=\count266
78
+ \tag@help=\toks19
79
+ \row@=\count267
80
+ \column@=\count268
81
+ \maxfields@=\count269
82
+ \andhelp@=\toks20
83
+ \eqnshift@=\dimen150
84
+ \alignsep@=\dimen151
85
+ \tagshift@=\dimen152
86
+ \tagwidth@=\dimen153
87
+ \totwidth@=\dimen154
88
+ \lineht@=\dimen155
89
+ \@envbody=\toks21
90
+ \multlinegap=\skip50
91
+ \multlinetaggap=\skip51
92
+ \mathdisplay@stack=\toks22
93
+ LaTeX Info: Redefining \[ on input line 2938.
94
+ LaTeX Info: Redefining \] on input line 2939.
95
+ ) (amssymb.sty
96
+ Package: amssymb 2013/01/14 v3.01 AMS font symbols
97
+
98
+ (amsfonts.sty
99
+ Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support
100
+ \symAMSa=\mathgroup4
101
+ \symAMSb=\mathgroup5
102
+ LaTeX Font Info: Redeclaring math symbol \hbar on input line 98.
103
+ LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold'
104
+ (Font) U/euf/m/n --> U/euf/b/n on input line 106.
105
+ )) (booktabs.sty
106
+ Package: booktabs 2020/01/12 v1.61803398 Publication quality tables
107
+ \heavyrulewidth=\dimen156
108
+ \lightrulewidth=\dimen157
109
+ \cmidrulewidth=\dimen158
110
+ \belowrulesep=\dimen159
111
+ \belowbottomsep=\dimen160
112
+ \aboverulesep=\dimen161
113
+ \abovetopsep=\dimen162
114
+ \cmidrulesep=\dimen163
115
+ \cmidrulekern=\dimen164
116
+ \defaultaddspace=\dimen165
117
+ \@cmidla=\count270
118
+ \@cmidlb=\count271
119
+ \@aboverulesep=\dimen166
120
+ \@belowrulesep=\dimen167
121
+ \@thisruleclass=\count272
122
+ \@lastruleclass=\count273
123
+ \@thisrulewidth=\dimen168
124
+ ) (graphicx.sty
125
+ Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR)
126
+ (graphics.sty
127
+ Package: graphics 2021/03/04 v1.4d Standard LaTeX Graphics (DPC,SPQR)
128
+ (trig.sty
129
+ Package: trig 2021/08/11 v1.11 sin cos tan (DPC)
130
+ )
131
+ (graphics.cfg
132
+ File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration
133
+ )
134
+ Package graphics Info: Driver file: xetex.def on input line 107.
135
+ (xetex.def
136
+ File: xetex.def 2021/03/18 v5.0k Graphics/color driver for xetex
137
+ ))
138
+ \Gin@req@height=\dimen169
139
+ \Gin@req@width=\dimen170
140
+ ) (url.sty
141
+ \Urlmuskip=\muskip17
142
+ Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc.
143
+ ) (xcolor.sty
144
+ Package: xcolor 2021/10/31 v2.13 LaTeX color extensions (UK)
145
+ (color.cfg
146
+ File: color.cfg 2016/01/02 v1.6 sample color configuration
147
+ )
148
+ Package xcolor Info: Driver file: xetex.def on input line 227.
149
+ Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1352.
150
+ Package xcolor Info: Model `RGB' extended on input line 1368.
151
+ Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1370.
152
+ Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1371.
153
+ Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1372.
154
+ Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1373.
155
+ Package xcolor Info: Model `Gray' substituted by `gray' on input line 1374.
156
+ Package xcolor Info: Model `wave' substituted by `hsb' on input line 1375.
157
+ ) (natbib.sty
158
+ Package: natbib 2010/09/13 8.31b (PWD, AO)
159
+ \bibhang=\skip52
160
+ \bibsep=\skip53
161
+ LaTeX Info: Redefining \cite on input line 694.
162
+ \c@NAT@ctr=\count274
163
+ )
164
+ (numbers) (l3backend-xetex.def
165
+ File: l3backend-xetex.def 2022-02-07 L3 backend support: XeTeX
166
+ \c__kernel_sys_dvipdfmx_version_int=\count275
167
+ \l__color_backend_stack_int=\count276
168
+ \g__color_backend_stack_int=\count277
169
+ \g__graphics_track_int=\count278
170
+ \l__pdf_internal_box=\box52
171
+ \g__pdf_backend_object_int=\count279
172
+ \g__pdf_backend_annotation_int=\count280
173
+ \g__pdf_backend_link_int=\count281
174
+ ) (main.aux)
175
+ \openout1 = `main.aux'.
176
+
177
+ LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 16.
178
+ LaTeX Font Info: ... okay on input line 16.
179
+ LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 16.
180
+ LaTeX Font Info: ... okay on input line 16.
181
+ LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 16.
182
+ LaTeX Font Info: ... okay on input line 16.
183
+ LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 16.
184
+ LaTeX Font Info: ... okay on input line 16.
185
+ LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 16.
186
+ LaTeX Font Info: Trying to load font information for TS1+cmr on input line 1
187
+ 6.
188
+ (ts1cmr.fd
189
+ File: ts1cmr.fd 2019/12/16 v2.5j Standard LaTeX font definitions
190
+ )
191
+ LaTeX Font Info: ... okay on input line 16.
192
+ LaTeX Font Info: Checking defaults for TU/lmr/m/n on input line 16.
193
+ LaTeX Font Info: ... okay on input line 16.
194
+ LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 16.
195
+ LaTeX Font Info: ... okay on input line 16.
196
+ LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 16.
197
+ LaTeX Font Info: ... okay on input line 16.
198
+
199
+ *geometry* driver: auto-detecting
200
+ *geometry* detected driver: xetex
201
+ *geometry* verbose mode - [ preamble ] result:
202
+ * driver: xetex
203
+ * paper: <default>
204
+ * layout: <same size as paper>
205
+ * layoutoffset:(h,v)=(0.0pt,0.0pt)
206
+ * modes:
207
+ * h-part:(L,W,R)=(72.26999pt, 469.75502pt, 72.26999pt)
208
+ * v-part:(T,H,B)=(72.26999pt, 650.43001pt, 72.26999pt)
209
+ * \paperwidth=614.295pt
210
+ * \paperheight=794.96999pt
211
+ * \textwidth=469.75502pt
212
+ * \textheight=650.43001pt
213
+ * \oddsidemargin=0.0pt
214
+ * \evensidemargin=0.0pt
215
+ * \topmargin=-37.0pt
216
+ * \headheight=12.0pt
217
+ * \headsep=25.0pt
218
+ * \topskip=11.0pt
219
+ * \footskip=30.0pt
220
+ * \marginparwidth=59.0pt
221
+ * \marginparsep=10.0pt
222
+ * \columnsep=10.0pt
223
+ * \skip\footins=10.0pt plus 4.0pt minus 2.0pt
224
+ * \hoffset=0.0pt
225
+ * \voffset=0.0pt
226
+ * \mag=1000
227
+ * \@twocolumnfalse
228
+ * \@twosidefalse
229
+ * \@mparswitchfalse
230
+ * \@reversemarginfalse
231
+ * (1in=72.27pt=25.4mm, 1cm=28.453pt)
232
+
233
+ LaTeX Font Info: Trying to load font information for U+msa on input line 17.
234
+
235
+ (umsa.fd
236
+ File: umsa.fd 2013/01/14 v3.01 AMS symbols A
237
+ )
238
+ LaTeX Font Info: Trying to load font information for U+msb on input line 17.
239
+
240
+ (umsb.fd
241
+ File: umsb.fd 2013/01/14 v3.01 AMS symbols B
242
+ ) [1
243
+
244
+ ]
245
+
246
+ LaTeX Font Warning: Font shape `TU/lmr/bx/sc' undefined
247
+ (Font) using `TU/lmr/bx/n' instead on input line 92.
248
+
249
+ [2] [3] [4] [5] [6]
250
+ File: fig_precision_coverage.pdf Graphic file (type pdf)
251
+ <use fig_precision_coverage.pdf>
252
+ [7] [8] [9] [10] [11]
253
+ File: fig_label_curve.pdf Graphic file (type pdf)
254
+ <use fig_label_curve.pdf>
255
+ [12]
256
+ File: fig_risk_coverage.pdf Graphic file (type pdf)
257
+ <use fig_risk_coverage.pdf>
258
+ [13]
259
+ Underfull \hbox (badness 10000) in paragraph at lines 842--852
260
+ \TU/lmr/m/n/10.95 The model weights are public: $[][][][][] [] [] [] [][][][][]
261
+ [][][][][][] [] [][] [] [][][][][][][][][] []
262
+ []
263
+
264
+ [14] [15] [16] [17] (main.aux)
265
+
266
+ LaTeX Font Warning: Some font shapes were not available, defaults substituted.
267
+
268
+ )
269
+ Output written on main.xdv (17 pages, 553908 bytes).
docs/paper/main.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afcb564ae43329b0a7174b676446fe1204146968d9ed9a22426ab82454039e70
3
+ size 201091
docs/paper/main.tex ADDED
@@ -0,0 +1,1021 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \documentclass[11pt]{article}
2
+ \usepackage[utf8]{inputenc} % no-op on TeXLive >= 2018 (arXiv pdflatex); explicit for safety
3
+ \usepackage[margin=1in]{geometry}
4
+ \usepackage{amsmath,amssymb}
5
+ \usepackage{booktabs}
6
+ \usepackage{graphicx}
7
+ \usepackage{url}
8
+ \usepackage{xcolor}
9
+ \usepackage[numbers]{natbib}
10
+ \input{numbers}
11
+
12
+ \title{Verified Cleaning Plans: Plan-Level Selective Prediction Turns Local LLM
13
+ Planners into Trustworthy Table Cleaners}
14
+ \author{Ricardo Alanis\\ \small{\texttt{ricardo.alanis@gmail.com}}}
15
+ \date{June 2026}
16
+
17
+ \begin{document}
18
+ \maketitle
19
+
20
+ \begin{abstract}
21
+ Cleaning messy tabular data---particularly \emph{canonicalization}, the merging of
22
+ inconsistent surface forms such as \texttt{USA}/\texttt{U.S.A}/\texttt{united states}
23
+ into one canonical value---resists rule-based automation and is routinely done by hand.
24
+ We present ScrubData, an architecture built around a trust contract. A local LLM
25
+ \emph{planner} reads an aggregated column profile (per-value frequency counts,
26
+ invariant to row count) and \emph{proposes} a structured JSON cleaning plan; a
27
+ deterministic executor \emph{applies} it, making every change auditable and reversible;
28
+ and \emph{plan-level selective prediction} --- a deterministic verifier that scores
29
+ every proposed mapping and drops low-confidence entries to review flags --- extends
30
+ abstention from cell-level confidence to the plan itself. The verified union of the
31
+ gated model plan with a reference-grounded heuristic is the architecture's operating
32
+ point: a zero-configuration, zero-label system that repairs 41\% of the hospital
33
+ benchmark's \hospErrors{} real errors at \unionGatePrec{} precision (strongest of three training
34
+ seeds; 3-seed mean \unionGateThreeSeedPrec{} at \unionGateThreeSeedCov{} coverage,
35
+ $\pm$ = 95\% CI),
36
+ with every declined merge surfaced for review. Four deterministic capabilities ---
37
+ profile-level \emph{suspect surfacing} for high-cardinality columns, reconciliation
38
+ against a pluggable \emph{entity reference} built from open vocabularies,
39
+ \emph{cross-row majority voting} over repeated-entity groups, and
40
+ \emph{convention-conservatism} gates --- carry the system to never-seen tables:
41
+ macro F1 \unseenMacroF{} at \unseenMacroDamage{} damage over the 35 unseen-source
42
+ pairs of a \nPairs-pair benchmark, and \emph{zero silent edits} across \nWild{} wild
43
+ tables plus a \nTrust-table trust audit, released together as the \textsc{WildClean}
44
+ benchmark.
45
+
46
+ Finally, we report where the capability lives. \emph{Execution-verified} synthetic
47
+ supervision --- a training example is kept only if executing its plan provably
48
+ recovers the known-clean table --- buys the 4B fine-tune real in-distribution skill
49
+ and the most precise gated planner at usable coverage (\modelGatePrec{} precision at
50
+ \modelGateCov{} coverage); but five further retrains and a three-arm GRPO pilot leave
51
+ held-out generalization statistically bounded (TOST against a pre-registered margin),
52
+ while two of three zero-shot 24--31B open-weights planners (devstral-24B, gemma4-31B)
53
+ dropped into the \emph{identical} harness exceed the fine-tune's operating point
54
+ (\scalePrecBig{} precision at \scaleCovBig{} coverage) with no task training. The
55
+ architecture is planner-agnostic: it converts capability gains into trustworthy
56
+ operating points without retraining. The shipped system runs entirely locally on commodity hardware;
57
+ no data leaves the machine (the scaling-arm planners were measured via hosted
58
+ endpoints; all are locally deployable open weights).
59
+ \end{abstract}
60
+
61
+ \section{Introduction}
62
+
63
+ A large share of practical data work is cleaning: a sales export where the same country
64
+ is spelled four ways, a hospital roster where \texttt{birminghxm} should be
65
+ \texttt{birmingham}, a CRM dump with mixed date formats and duplicated contacts. The
66
+ fuzzy half of this work---recognizing that distinct surface forms denote the same
67
+ entity---is exactly what rules do poorly and humans do slowly.
68
+
69
+ Large language models can do this fuzzy matching, but deploying them as cell editors has
70
+ three problems. First, \emph{trust}: a model that edits cells directly can silently
71
+ corrupt data, and its errors are unauditable. Second, \emph{cost and privacy}: shipping
72
+ every row of a private table to a hosted frontier model is expensive and often
73
+ unacceptable. Third, \emph{hallucination}: asked for a canonical form, a generative model
74
+ will invent one, and on tail entities it will invent wrong ones.
75
+
76
+ ScrubData addresses all three with an architecture in which the model never touches
77
+ data. A profiler aggregates each column into a value-frequency distribution; a small
78
+ local model reads the profile and \emph{proposes} a JSON cleaning plan; a deterministic
79
+ pandas executor \emph{applies} it. The plan is the complete, inspectable, reversible
80
+ specification of every change---there are no silent edits by construction
81
+ (\S\ref{sec:method}). Because the prompt scales with the number of \emph{distinct}
82
+ values rather than rows, a million-row table profiles like a hundred-row one.
83
+
84
+ This paper makes five contributions:
85
+ \begin{enumerate}
86
+ \item \textbf{A planner/executor decomposition with plan-level selective prediction}:
87
+ the model proposes, a deterministic engine executes with full lineage, and a
88
+ deterministic verifier gates every proposed mapping, extending abstention to the plan
89
+ itself. The verified union of the gated model plan with a reference-grounded
90
+ heuristic repairs 41\% of hospital's \hospErrors{} real errors at \unionGatePrec{}
91
+ precision with zero configuration and zero labels (\S\ref{sec:method},
92
+ \S\ref{sec:verifier}, \S\ref{sec:ws1results}).
93
+ \item \textbf{\textsc{WildClean} and an un-gameable evaluation}: a 65-dataset suite
94
+ (real-error benchmarks plus seeded error injection over 15 harvested open-data
95
+ domains) scored with a churn-neutral, convention-tolerant metric that cannot be
96
+ inflated by mass rewriting, with damage and silent edits scored alongside repair F1,
97
+ degenerate baselines pinning the metric's floor and ceiling, and the scorer itself
98
+ validated against 30 adversarial known-by-construction cases (\S\ref{sec:eval},
99
+ \S\ref{sec:degenerate}).
100
+ \item \textbf{Four deterministic capabilities that carry never-seen-table
101
+ generalization}: bounded suspect surfacing for high-cardinality columns, generic
102
+ entity-reference reconciliation with an exact-hit typing floor, cross-row majority
103
+ voting with a false-consensus guard, and convention-conservatism gates --- each
104
+ motivated by a measured failure regime and gated by the verifier
105
+ (\S\ref{sec:capabilities}, \S\ref{sec:wild}).
106
+ \item \textbf{Execution-verified synthetic supervision}, the training method behind
107
+ the 4B planner instantiation: every training example is validated by running the
108
+ executor on the (dirty table, plan) pair and checking that the known-clean table is
109
+ recovered; non-recovering examples are discarded (\S\ref{sec:sft}).
110
+ \item \textbf{A unified finding on where capability lives in this architecture}: five
111
+ further supervised fine-tunes and a three-arm GRPO pilot with the executor as a
112
+ verifiable reward leave held-out generalization statistically bounded (TOST), while
113
+ two of three zero-shot 24--31B planners dropped into the same harness exceed the
114
+ fine-tune's operating point --- deterministic machinery plus plan-level verification carry the
115
+ generalization that exists, and raw planner capability, not task fine-tuning, scales
116
+ it (\S\ref{sec:negative}, \S\ref{sec:scaling}).
117
+ \end{enumerate}
118
+
119
+ We deliberately report a negative-flavored finding alongside the positive ones: on
120
+ \emph{injected} typos, classical frequency clustering remains a strong baseline---by
121
+ construction, injection places the canonical form in the column, which is clustering's
122
+ ideal regime. The advantage of grounding is concentrated where it matters: real errors,
123
+ tail entities absent from the column, and adversarial near-misses where acting at all is
124
+ wrong (\S\ref{sec:results}).
125
+
126
+ \section{Related Work}
127
+ \label{sec:related}
128
+
129
+ \textbf{Error detection and repair.} Raha and Baran~\cite{raha} established
130
+ configuration-free error detection and correction benchmarks (hospital, beers, flights,
131
+ rayyan), which we adopt as out-of-distribution evaluation. HoloClean~\cite{holoclean}
132
+ combines integrity constraints, external reference data, and statistics in probabilistic
133
+ repair, demonstrating that external signals can veto statistically plausible but wrong
134
+ repairs---an insight our reference-veto inherits. GARF~\cite{garf} learns repair rules
135
+ self-supervised from the data itself; it also demonstrates the structural limit we
136
+ observe for frequency-only methods: a lone categorical column offers no co-occurring
137
+ signal to vote against an error.
138
+
139
+ \textbf{The 2025--26 landscape.} Post-Cocoon work concentrates on zero-label
140
+ \emph{detection}: ZeroED~\cite{zeroed} (cloud-LLM cluster labeling, hospital
141
+ detection F1 0.81, collapsing to 0.27 on smaller models), ForestED~\cite{forested}
142
+ (LLM-induced decision trees, 0.756), and Auto-Test~\cite{autotest} (corpus-mined
143
+ semantic-domain constraints, no LLM at inference) --- none performs zero-label
144
+ \emph{repair}. GIDCL~\cite{gidcl} sets the labeled-class repair ceiling
145
+ (hospital \gidclHosp{} with 20 labels and a LoRA trained per cleaned table);
146
+ Cocoon~\cite{cocoon} remains an unreproduced preprint (15 citing papers, none a
147
+ reproduction). Two concurrent results corroborate facets of this paper's central
148
+ negative finding that machinery, not weights, carries cleaning generalization: a
149
+ study showing even frontier models cannot correct table distortions without
150
+ explicit priors~\cite{distort}, and a large multi-agent-debate evaluation in which
151
+ LLM self-critique \emph{degrades} repair and only an adversarially separate,
152
+ execution-grounded critic helps~\cite{debate} --- the architecture our verifier
153
+ instantiates. Spreadsheet-RL~\cite{spreadsheetrl} reports the complementary
154
+ positive case: with full-scale RL infrastructure and execution-verified rewards,
155
+ a 4B model's spreadsheet-manipulation skill \emph{does} move (12.0\%
156
+ $\rightarrow$ 23.4\%) --- consistent with our reading that the gap between our
157
+ \$30 pilot and such results is infrastructure scale, a boundary we state rather
158
+ than blur (\S\ref{sec:negative}).
159
+
160
+ \textbf{LLMs for data wrangling.} Narayan et al.~\cite{wrangle} showed frontier
161
+ foundation models handle entity matching and imputation few-shot;
162
+ Jellyfish~\cite{jellyfish} and Table-GPT~\cite{tablegpt} fine-tune mid-size models for
163
+ data tasks. RetClean~\cite{retclean} is closest in spirit: retrieval from data lakes
164
+ grounds cell repair, with the key empirical split that parametric knowledge suffices on
165
+ world-known head values but collapses on the tail---motivating retrieval. Our work
166
+ differs in the planner/executor decomposition (the model emits no cell values, only
167
+ plans), in execution-verified supervision, and in the calibrated-abstention contract.
168
+
169
+ \textbf{Entity linking over tables.} TURL~\cite{turl} and TableLlama~\cite{tablellama}
170
+ inject candidate entities into table understanding; Belotti et al.~\cite{belotti}
171
+ show retriever coverage is the accuracy ceiling for table entity disambiguation and that
172
+ long candidate lists hurt smaller models. RACOON~\cite{racoon} shows inference-time KG
173
+ retrieval lifts a frozen model substantially, supporting our choice to ground at
174
+ inference rather than bake aliases into weights (TURL's out-of-domain collapse is the
175
+ cautionary result). MTab~\cite{mtab} established type-constrained matching with
176
+ abstention in semantic table annotation.
177
+
178
+ \textbf{Clustering-based cleaning tools.} The de-facto practitioner baseline is
179
+ OpenRefine: key-collision (fingerprint) clustering plus a nearest-neighbor mode; we
180
+ reimplement both faithfully, including blocking, and compare head-to-head.
181
+
182
+ \textbf{Selective prediction.} Risk--coverage analysis and calibration
183
+ metrics~\cite{selective} formalize ``knowing when not to act''; to our knowledge their
184
+ application to data-cleaning merge decisions is new.
185
+
186
+ \textbf{Small specialized models.} OpenMed~\cite{openmed} fine-tunes sub-500M encoders
187
+ to state-of-the-art biomedical NER, the sister result to our thesis that small
188
+ specialized models beat large generic ones on narrow structured tasks; we adopt their
189
+ released PII token classifiers for column typing (\S\ref{sec:pii}).
190
+
191
+ \section{Method}
192
+ \label{sec:method}
193
+
194
+ \subsection{Planner / executor decomposition}
195
+ A \emph{profiler} reduces each column to a typed summary: detected semantic type, missing
196
+ counts, issue flags, and a value--frequency distribution capped at 80 distinct values
197
+ (high-cardinality columns are summarized by their head). The \emph{planner}---either a
198
+ deterministic heuristic or our fine-tuned 4B model---maps the profile (plus three sample
199
+ rows) to a JSON plan: a list of per-column operations drawn from a closed vocabulary
200
+ (\texttt{canonicalize\_categories} with an explicit mapping, \texttt{parse\_date},
201
+ \texttt{standardize\_phone}, \texttt{mask\_pii}, \ldots), table operations, and review
202
+ flags. The \emph{executor} applies the plan with pure pandas transforms. The plan is the
203
+ only channel through which data changes: every diff is attributable to a named operation
204
+ with a rationale, the original table is never mutated, and abstentions are first-class
205
+ plan objects. We export per-run decision summaries as OpenTelemetry GenAI spans.
206
+
207
+ \subsection{Execution-verified synthetic supervision}
208
+ \label{sec:sft}
209
+ Training pairs are generated by corrupting clean synthetic tables with realistic noise
210
+ (casing, aliases, single-character typos with Zipf-distributed long-tail categorical
211
+ columns of 30--80 distinct values) while recording the ground-truth plan. The defining
212
+ step is \emph{verification by execution}: a candidate example is kept only if
213
+ $\textsc{Execute}(\text{dirty}, \text{plan}) = \text{clean}$ cell-for-cell. This closes
214
+ the loop between supervision and semantics---a plan that would not actually clean the
215
+ table can never become a training label. We augment with real supervision derived from
216
+ paired dirty/clean benchmarks by aligning cells and keeping only \emph{learnable}
217
+ canonicalizations (a surface form that is a string variant of its target and never a
218
+ legitimate value elsewhere), which excludes unlearnable per-cell corrections such as
219
+ divergent flight times. The fine-tune is QLoRA (rank 32) over Qwen3-4B-Instruct in
220
+ bf16; one practical finding is that the base model's tool-calling prior dominates
221
+ free-running generation even after convergent fine-tuning (loss 0.16) and must be
222
+ suppressed at decode time by banning the two tool-call tokens.
223
+
224
+ \subsection{Reference-grounded canonicalization with abstention}
225
+ \label{sec:grounding}
226
+ For columns whose values reconcile to a known concept type (countries, administrative
227
+ regions, cities), canonical forms are never generated: a fuzzy retriever (normalized
228
+ edit similarity with first-character blocking and length prefilters) matches each
229
+ distinct value against the type-scoped reference (ISO/pycountry; GeoNames cities500,
230
+ 196k entries). A value maps to a canonical only if (i) similarity clears a threshold
231
+ $\tau{=}0.84$, and (ii) the best--second-best margin clears $0.03$ (ambiguity veto: a value
227
+ equally close to \texttt{Box} and \texttt{Boaz} abstains); the accepted canonical is then
228
+ cast to the column's observed case convention. Near-misses ($0.70{\le}s{<}\tau$) are
234
+ surfaced as review flags. The same wrapper grounds the \emph{model} planner: for
235
+ reference-typed columns the model's free-generated mapping is replaced by the grounded
236
+ one, so the model can add coverage but never invent a canonical for a grounded type.
237
+
238
+ \subsection{Plan-level selective prediction: the verified union planner}
239
+ \label{sec:verifier}
240
+ Grounding constrains reference-typed columns, but the planner's \emph{free}
241
+ canonicalization mappings on non-grounded columns remain unguarded---and they are where
242
+ real-data precision dies (the fine-tune's raw hospital plan: \hospModelPrecVSix{}
243
+ precision at \hospModelRecallVSix{} recall). Rather than retrain, we extend abstention
244
+ to the plan itself. A deterministic \emph{verifier} scores every proposed mapping entry
245
+ $\mathit{raw}{\to}\mathit{canon}$ with contract-preserving evidence (no cell values emitted, no gold
246
+ access): three hard gates distilled from the model's measured failure classes---a value
247
+ occurring ${\ge}3$ times is data, not a typo (\emph{errors are rare}); the target must
248
+ be a frequent column value clearly dominating the source (no mapping one typo onto
249
+ another); digit-bearing codes repair only when the letter part is near-identical---then
250
+ a confidence combining edit similarity with frequency support. Entries below a
251
+ threshold $\tau$ (the verifier threshold, distinct from the retrieval threshold of \S\ref{sec:grounding}) are dropped to review flags; abstention stays first-class. Sweeping
252
+ $\tau$ yields a plan-level precision--coverage curve. The shipped composition,
253
+ the \emph{verified union planner}, is the verifier-gated model plan ($\tau{=}0.5$)
254
+ unioned with the grounded heuristic's mappings (the model wins per surface form);
255
+ the same code path is the product default.
256
+
257
+ \subsection{Visibility and consensus: four deterministic capabilities}
258
+ \label{sec:capabilities}
259
+ Four further mechanisms, each motivated by a measured failure regime on never-seen
260
+ tables, complete the deterministic machinery. \textbf{(a) Suspect surfacing.} The
261
+ profile's value-frequency view is capped, so high-cardinality columns hide their
262
+ dirty cells from any planner. Every column profile now carries a bounded
263
+ \texttt{suspect\_values} section: rare anomalous surfaces with evidence-backed
264
+ repair candidates (frequency dominance, edit similarity, reference membership).
265
+ The heuristic planner repairs from suspects under a strict verifier bar
266
+ ($\tau_{hc}{=}0.8$) and flags the rest. \textbf{(b) Generic entity reference.}
267
+ Open vocabularies (SemTab ToughTables aliases --- derived excluding our benchmark
268
+ tables; MusicBrainz search-hint misspellings; RxNorm; Wikidata; ROR) register as a
269
+ pluggable reference type. Because the reference is broad, entity-typing a column
270
+ additionally requires that ${\ge}20\%$ of its distinct values match the reference
271
+ \emph{exactly} --- fuzzy coverage alone over-fires on name-like columns (measured).
272
+ This resolves the regime where every surface in a column is unique (no in-column
273
+ frequency signal exists at all): five such benchmark tables go from 0.0 to
274
+ \ttFOne{} F1 at \emph{zero} damage. \textbf{(c) Cross-row majority voting.} Tables
275
+ that repeat a real-world entity across rows (a flight reported by many sources)
276
+ carry their own repair signal. A detection step finds compact-token key columns
277
+ with small groups (median multiplicity 3--30) and columns whose groups show
278
+ \emph{majority-bearing} disagreement with per-group information; a table-level
279
+ operation then resolves thin dissenting minorities to the group majority. A
280
+ \emph{false-consensus} guard declines when minority shares look like legitimate
281
+ correlated updates rather than reporting errors (mean minority share ${\ge}0.25$)
282
+ --- a flat volume cap was measured to destroy the legitimate dense-disagreement
283
+ regime and was replaced by this guard. \textbf{(d) Convention conservatism.} The planner never
284
+ re-formats an internally consistent column: date and percent ops are gated on
285
+ dominant-shape inconsistency (digit and alpha runs collapsed; 90\% rule),
286
+ ZIP/postal-named columns are never typed as phones or dates, and Excel-serial
287
+ date typing requires a date-suggestive column name. Suppressed minority values
288
+ surface as review flags --- abstention is visible, never silent. The verifier
289
+ enforces the same gates on model-emitted plans at the verification boundary.
290
+
291
+ \subsection{PII as a second task instance}
292
+ \label{sec:pii}
293
+ The identical contract covers PII: a deterministic tier types columns by checksum and
294
+ pattern validators (Luhn, IBAN mod-97, SSN/email/phone) over distinct values; an
295
+ optional 44M OpenMed-PII token classifier~\cite{openmed} extends coverage to names and
296
+ addresses, gated by a sensitive-type allowlist and a column-level coverage vote; and
297
+ masking, salted hashing, and join-stable pseudonymization are deterministic executor
298
+ operations. Measured briefly: the classifier, though trained on sentence-level
299
+ clinical text, transfers to bare cell values --- \piiNameBare{} detection on
300
+ person-name cells and \piiAddrBare{} on address cells ($n{=}40$ sampled cells each);
301
+ the validator tier, evaluated out-of-distribution on per-type columns from the Gretel
302
+ PII test split, types 5/5 covered PII types correctly with 0/7 false positives on
303
+ negative columns drawn from real open data; and after deterministic masking,
304
+ re-running all validators over the output finds \piiLeakRate{} residual PII ---
305
+ residual PII \emph{detectable by our validators}, a circularity we note explicitly:
306
+ the leak test can only see what the validator tier sees.
307
+
308
+ \section{Evaluation Design}
309
+ \label{sec:eval}
310
+ \textbf{Suite.} Five real-error benchmarks (Raha) plus seeded error injection
311
+ (typo/OCR/case/whitespace) over 15 harvested open-data domains (NYC, Chicago, SF, LA,
312
+ Seattle, Texas, WA portals; GitHub), for $\approx$65 datasets per seed. We aggregate as a
313
+ \emph{double macro}---mean over error types of mean over datasets, harmonically combined
314
+ with the domain macro---so no single table or error type dominates:
315
+ \begin{equation*}
316
+ \textsc{north} \;=\; \operatorname{HM}\Biggl(
317
+ \underbrace{\frac{1}{|T|}\sum_{t \in T}\frac{1}{|D_t|}\sum_{d \in D_t} F_1(d)}_{\text{error-type macro}},\;
318
+ \underbrace{\frac{1}{|G|}\sum_{g \in G}\frac{1}{|D_g|}\sum_{d \in D_g} F_1(d)}_{\text{domain macro}}
319
+ \Biggr),
320
+ \end{equation*}
321
+ where $T$ is the set of error types, $G$ the set of data domains, $D_t$ (resp.\ $D_g$)
322
+ the datasets carrying error type $t$ (resp.\ domain $g$), and $\operatorname{HM}$ the harmonic
323
+ mean.
324
+
325
+ \textbf{Churn-neutral metric.} A cell change that is case/whitespace-equivalent to the
326
+ input but does not restore the gold value is counted as nothing: not a fix, not a change, not
327
+ damage. Without this, mass case-rewriting inflates precision (we observed $+0.12$
328
+ \textsc{north} from \emph{removing} case matching before the correction); with it, fixing a
329
+ case-injected error requires actually acting. We additionally report
330
+ \emph{damage}---the rate of semantically corrupting clean cells---and an adversarial
331
+ \emph{abstain slice} whose traps are garbage strings (not single-edit variants of any
332
+ reference entity; an earlier trap set mis-scored grounding for correctly mapping
333
+ \texttt{Boazz}$\to$\texttt{Boaz}). We report both repairs of these metric artifacts as
334
+ evidence that gameability must be tested, not assumed.
335
+
336
+ \textbf{Real vs.\ injected.} Injected typos are in-distribution for frequency
337
+ clustering by construction (the canonical is present and dominant in the column), so we
338
+ report the real-error and injected slices separately. A TableEG-style audit
339
+ quantifies the gap (\texttt{eval/inject\_validity.py}): the injector covers three
340
+ of nine error classes (Jensen--Shannon divergence 0.526 bits from the pooled real
341
+ distribution over 163{,}607 real errors), and injected-only evaluation would
342
+ invert the fingerprint-clustering ranking --- exactly the overstatement the
343
+ separate-slice reporting prevents.
344
+
345
+ \textbf{Scorer validation.} Following GroUSE-style evaluator
346
+ testing~\cite{grouse}, the scorer itself is validated against 30 adversarial
347
+ known-by-construction cases: a no-op plan must score 0 fixes and 0 damage, an
348
+ oracle plan exactly 1.0, vandalizing $k$ of $m$ clean cells must score damage
349
+ $k/m$ at precision 0, pure churn (case/whitespace rewrites that do not restore
350
+ gold) must count as nothing although a naive scorer would count it, fixes must
351
+ require actually acting, and silent edits must trip the audit. All 30 pass
352
+ against the shipped scorer unmodified. We additionally cross-score every system
353
+ under the \emph{original} Raha/Baran cell-repair protocol side by side with ours
354
+ (\texttt{eval/cross\_scoring.py}): rankings agree at Kendall $\tau_b{=}1.0$ on
355
+ three of five datasets, and the disagreements cut both ways --- raw string
356
+ equality denies credit for numerically-correct serialization restorations (our
357
+ movies\_1 repairs), while churn-neutrality charges Baran for load-time
358
+ normalizer rewrites its own protocol hides (hospital precision
359
+ $0.908\!\to\!0.783$). Neither metric family flatters us uniformly, and our Baran
360
+ reproduction calibrates against its published Table~3 within $+0.02$ on three of
361
+ the four shared datasets.
362
+
363
+ \textbf{Contamination.} The Raha-suite benchmarks have been public on GitHub since
364
+ 2019 and sit inside every modern base model's training window; we treat them as
365
+ potentially contaminated and split our claims accordingly. A verbatim-completion
366
+ probe makes the concern concrete: prompted with five fields of a gold hospital row,
367
+ a frontier-class model reproduces \textbf{25\%} of the held-out cells exactly
368
+ (30/120 cells over 30 rows, exact-substring match), versus \textbf{0\%} (0/120) on a
369
+ date-stamped post-training-cutoff wild harvest under the identical protocol
370
+ (\texttt{eval/contamination\_probe.py}). The rate is an upper bound on memorization
371
+ --- some completions are guessable from the given fields --- but it is not zero, so
372
+ results on legacy-public benchmarks (including the all-hospital
373
+ Table~\ref{tab:scaling}, whose zero-shot planners may partially benefit from
374
+ memorized gold) carry this caveat, while the architecture's trust claims
375
+ (zero silent edits, damage accounting, abstention) rest on the date-stamped wild
376
+ and GitTables slices, where the probe finds nothing to complete.
377
+
378
+ \section{Results}
379
+ \label{sec:results}
380
+
381
+ \subsection{Plan-level selective prediction on real errors}
382
+ \label{sec:ws1results}
383
+ On hospital's \hospErrors{} real errors, the verifier transforms the fine-tune from
384
+ unshippable to precise (Figure~\ref{fig:pc}): the raw model plan repairs
385
+ \hospModelRecallVSix{} of errors at \hospModelPrecVSix{} precision; gated at $\tau{=}0.5$ it
386
+ reaches \modelGatePrec{} precision at \modelGateCov{} coverage (146 of 147 committed
387
+ changes correct). The union with the grounded heuristic buys coverage back:
388
+ \textbf{\unionGatePrec{} precision at \unionGateCov{} coverage} (\unionChanged{}
389
+ changes, \unionFixed{} correct). This turns the system's promise into a measured
390
+ sentence: \emph{zero-configuration and zero labels, repair 41\% of real errors at
391
+ ${\ge}0.90$ precision, with every declined merge surfaced for review}. For context,
392
+ Baran given oracle error positions and 20 gold-labeled tuples per dataset reaches
393
+ \realFBaran{} F1 on the same slice (\S\ref{sec:ws4})---selective prediction does not
394
+ close a supervised gap, but it makes the zero-label operating point trustworthy, which
395
+ is the regime our user occupies. Precision is flat ($0.89$--$0.91$) for
396
+ $\tau\in[0.2,0.8]$, so the operating point is not threshold-brittle, and the result is
397
+ seed-robust: across three training seeds of the same data recipe the union operating
398
+ point is \unionGateThreeSeedPrec{} precision at \unionGateThreeSeedCov{} coverage
399
+ (the shipped adapter is the strongest seed), with every seed clearing the
400
+ $0.70$-precision/$0.30$-coverage bar decisively. All 3-seed intervals in this paper
401
+ are normal-approximation 95\% CIs ($1.96\,\sigma/\sqrt{3}$); the $t$-based
402
+ interval at $n{=}3$ is ${\sim}2.7\times$ wider ($\pm 0.031$ here) and every
403
+ qualitative claim survives it --- the weakest seed alone clears the bar.
404
+
405
+ \textbf{Candidate-constrained planning (negative result).} We also tested constraining
406
+ the planner's \emph{inputs}: the profiler emits evidence-backed (variant$\,\to\,$
407
+ canonical) candidate pairs (frequency dominance, edit similarity, reference membership)
408
+ and the model may only select among them, with a deterministic check dropping
409
+ off-candidate mappings to review flags. As a standalone guard it is strong---the raw
410
+ plan's precision rises from \hospModelPrecVSix{} to \pairsRawPrec{} with no verifier at
411
+ all---but composed with the verifier and union it reaches \pairsUnionPrec{} precision
412
+ at \pairsUnionCov{} coverage, slightly \emph{below} the unconstrained pipeline at the
413
+ same $\tau$: the candidate cap (top-3 per surface) removes some correct repairs the
414
+ verifier would have kept, and the two mechanisms gate the same failure class. We ship
415
+ the verifier and keep candidate constraining available but off by default, reporting
416
+ this as a measured redundancy rather than a stacked win.
417
+
418
+ \begin{figure}[t]
419
+ \centering
420
+ \includegraphics[width=0.62\linewidth]{fig_precision_coverage}
421
+ \caption{Plan-level precision--coverage on hospital (509 real errors), sweeping the
422
+ verifier threshold $\tau$. The union planner dominates the raw model plan; the shipped
423
+ operating point ($\tau{=}0.5$) is annotated.}
424
+ \label{fig:pc}
425
+ \end{figure}
426
+
427
+ \subsection{The 4B fine-tune as one planner instantiation}
428
+ On frozen synthetic gold, the fine-tuned 4B planner reaches canonicalization micro-F1
429
+ \canonFMultiSeed{} --- versus \canonFBig{} for a much larger zero-shot generalist
430
+ prompted identically and \canonFHeur{} for the rule heuristic (best single run
431
+ \canonFOursBest; operation-F1 \opFOurs, JSON validity \jsonValidOurs). On real hospital
432
+ typos the synthetic-only fine-tune scores 0.000 repair recall; adding 20\%
433
+ real-derived supervision lifts it to \hospModelRecall, and a data-scaling iteration
434
+ (tripling the real-derived share from three paired benchmarks) reaches
435
+ \hospModelRecallVSix{} recall at \hospModelPrecVSix{} precision---approaching the
436
+ \frontierZeroShotRecall{} of a frontier-scale zero-shot model. The scaling gain is seed-robust: $+0.09$
437
+ canonicalization F1 over the base mix under identical protocol, with non-overlapping
438
+ 3-seed confidence intervals. Real, execution-verified pairs are what transfer:
439
+ the same iteration found frequency-derived and algorithm-cleaned labels both
440
+ \emph{reduce} quality, consistent with our grounding thesis.
441
+
442
+ \subsection{Grounding vs.\ clustering}
443
+ With the errors-are-rare frequency gates now in both paths, grounding and frequency
444
+ clustering are comparable on hospital alone (repairs-only, churn-neutral:
445
+ \hospPrecGrounded{} precision at \hospRecallGrounded{} recall grounded vs
446
+ \hospPrecFreq{} at \hospRecallFreq{} clustering---hospital's dominant errors are
447
+ in-column typos, clustering's best case). Grounding's margin appears where references
448
+ matter: across the five-benchmark real-error macro it reaches \ablFullRealF{} versus
449
+ \ablNoGroundRealF{} for the frequency-clustering ablation ($+29\%$), and it carries
450
+ the behavioral guarantees below.
451
+ On the full suite against OpenRefine (Table~\ref{tab:money}), the result splits
452
+ cleanly by regime, and we report both. On the \emph{real-error} slice---the regime the
453
+ tool exists for---grounded cleaning reaches REAL-F1 \realFGrounded{}, $3.9\times$
454
+ OpenRefine kNN (\realFORKnn) and $5.7\times$ fingerprint (\realFORFp), with seed CIs of
455
+ $\pm$\northGroundedCI. Provenance: the grounded and OpenRefine rows of
456
+ Table~\ref{tab:money} are regenerated at the current system head (2026-06-12,
457
+ post-capability, scorer fix in); the dagger rows keep their original capture
458
+ provenance. The June-10 freeze system measured REAL-F1 \realFGroundedFreeze{} on the
459
+ same protocol --- the $+0.05$ difference is the measured contribution of the four
460
+ deterministic capabilities (\S\ref{sec:capabilities}) on the real-error slice. On the \emph{injected} slice, fingerprint clustering wins
461
+ (\injFORFp{} vs \injFGrounded) at near-zero damage: our case/whitespace injectors are
462
+ exactly the perturbations key-collision normalizes away, so this is its home game and
463
+ we say so. kNN clustering---the method that, like us, attempts typo repair---loses on
464
+ both slices while incurring the highest damage among baselines (\damageORKnn), the
465
+ no-reference over-merging failure the grounding was built to prevent. The shipped
466
+ verified-union system's suite row (REAL-F1 \modelRealF, damage \modelDamage) shows the
467
+ grounding wrapper and heuristic union carry entity canonicalization on these datasets ---
468
+ the model's contribution concentrates on the synthetic regime and hospital repair
469
+ (\S\ref{sec:ws1results}), and the verifier cuts its suite damage to \modelDamage,
470
+ $6\times$ below the grounded heuristic's \damageGrounded{} (HEAD damage vs the union
471
+ row's freeze-time capture --- a disclosed basis mix). Within our own ablations
472
+ (June-10 freeze basis throughout), removing grounding cedes $22\%$ of real-error
473
+ F1 (\ablNoGroundRealF{} vs \ablFullRealF) and forfeits the behavioral guarantees:
474
+ perfect abstention on adversarial traps (\ablFullAbstain) versus
475
+ \ablNoAbstainAbstain{} without abstention, and reference-vetoed wrong merges (e.g.\
476
+ \texttt{guntxrsvillx}$\to$\texttt{huntsville}).
477
+
478
+ \begin{table}[t]
479
+ \centering
480
+ \caption{Wide-suite comparison, 3 injection seeds, churn-neutral metric. NORTH is the
481
+ double-macro harmonic mean; REAL-F1 is the real-error slice. Regenerated at the
482
+ current system head (2026-06-12); the June-10 freeze system measured
483
+ \realFGroundedFreeze{} REAL-F1 / \northGroundedFreeze{} NORTH on the same protocol.}
484
+ \label{tab:money}
485
+ \begin{tabular}{lcccccc}
486
+ \toprule
487
+ System & NORTH & $\pm$95\%CI & REAL-F1 & INJ-F1 & damage & abstain \\
488
+ \midrule
489
+ Grounded (ours) & \northGrounded & \northGroundedCI & \textbf{\realFGrounded} & \injFGrounded & \damageGrounded & \ablFullAbstain \\
490
+ OpenRefine fingerprint & \northORFp & 0.000 & \realFORFp & \injFORFp & \damageORFp & 1.000 \\
491
+ OpenRefine kNN & \northORKnn & 0.002 & \realFORKnn & 0.148 & \damageORKnn & 1.000 \\
492
+ Verified union 4B (shipped)$^{\dagger}$ & -- & -- & \modelRealF & -- & \modelDamage & \modelAbstain \\
493
+ \midrule
494
+ Baran (oracle det.\ + 20 labels)$^{\ddagger}$ & -- & -- & \realFBaran & -- & \damageBaran & -- \\
495
+ Jellyfish-13B (ED+DI)$^{\ddagger}$ & -- & -- & \realFJelly & -- & \damageJelly & -- \\
496
+ \bottomrule
497
+ \end{tabular}
498
+
499
+ \smallskip
500
+ {\small $^{\dagger}$single seed, REAL + typo-injected slice only (GPU cost); other rows
501
+ are 3-seed means. $^{\ddagger}$real slice only, disclosed protocol asymmetries
502
+ (\S\ref{sec:ws4}): Baran uses oracle error positions + gold labels; Jellyfish is our
503
+ detect-then-impute composition with seen-data caveats.}
504
+ \end{table}
505
+
506
+ \subsection{Generalization to never-seen tables}
507
+ \label{sec:wild}
508
+ The freeze-version system above was then pointed at data it had never seen, under
509
+ three new harnesses (all released with this paper as the \textsc{WildClean} bundle).
510
+ \textbf{(1) Paired bench}: \nPairs{} dirty/gold pairs spanning the Raha suite, SemTab
511
+ ToughTables, government open-data typo corpora, entity-matching tables, and
512
+ LLM-cleaning evaluation sets. On the 35 pairs from sources absent from training ---
513
+ a count that coincidentally equals, but is distinct from, the \nWild{} gold-free wild
514
+ tables of harness~(2) below --- the
515
+ post-freeze system scores \textbf{macro F1 \unseenMacroF{} at damage
516
+ \unseenMacroDamage}. The largest single contribution is the regime
517
+ \S\ref{sec:capabilities}(b) unlocks: on five all-unique entity tables where no
518
+ in-column frequency signal exists, F1 moves from $0.0$ to \ttFOne{} at zero damage.
519
+ Cross-row voting (\S\ref{sec:capabilities}c) is the second: flights---many sources
520
+ reporting the same flight---goes from \flightsBaseF{} to \flightsVoteF{} F1
521
+ heuristic-only, and the heuristic hospital path doubles from \hospBaseHeur{} to
522
+ \hospVoteHeur{}. The hospital union gate is invariant under all of this
523
+ (\unionGatePrec{} at \unionGateCov). \textbf{(2) Wild bench}: \nWild{} uncurated
524
+ in-the-wild tables (open-data portals, GitHub, Kaggle) with no gold; we score seeded
525
+ inject--recovery on each table's own data (mean recovery \wildRecovery{} over the 34
526
+ tables with inject scores; one table has none) plus a
527
+ behavioral audit: every run yields a valid plan, every changed cell is attributable
528
+ to a logged operation --- \textbf{zero silent edits across all \nWild{} tables}.
529
+ \textbf{(3) Trust audit at scale}: \nTrust{} GitTables tables, same property ---
530
+ \nTrust{}/\nTrust{} valid plans, zero crashes, zero silent edits. The held-out-source
531
+ generalization metric (train and evaluation drawn from disjoint benchmark sources)
532
+ remains low in absolute terms (GEN-F1 \genFTwo{}, variant-recall \genVRTwo{}, damage
533
+ \genDamageTwo): cleaning unfamiliar tables is far from solved, and we report the
534
+ number to anchor the next section's claim about \emph{where} the capability that does
535
+ exist actually lives.
536
+
537
+ \subsection{Where capability lives: a bounded null for fine-tuning}
538
+ \label{sec:negative}
539
+ Every attempt to move never-seen-table performance through the model weights failed;
540
+ every gain in \S\ref{sec:wild} came from deterministic machinery plus the verifier.
541
+ Five further supervised fine-tunes --- adding 109k harvested real-world alias pairs
542
+ (ToughTables-derived, MusicBrainz search hints, RxNorm, OpenFlights), error-dense
543
+ episode mixes, and a suspects-contract retrain --- left held-out GEN-F1
544
+ \emph{statistically bounded}: every retrain's delta is positive but negligible (mean
545
+ $+0.003$), never approaching the pre-registered $\delta{=}0.05$. ``Bounded'' is a
546
+ tested equivalence claim, not an eyeballed one~\cite{lakens}: across the five-retrain
547
+ series the mean held-out GEN-F1 delta (retrain minus champion) is $+0.0028$
548
+ (90\% bootstrap CI $[+0.0008, +0.0049]$, strictly positive;
549
+ 10{,}000 resamples, seed 42; per-dataset granularity, $n{=}15$ over 3 held-out
550
+ sources $\times$ 5 retrains --- per-pair deltas do not exist for the retrain
551
+ series, so within-retrain deltas are clustered and we add a retrain-level
552
+ robustness check, $n{=}5$ macro deltas), and TOST rejects effects beyond the
553
+ pre-registered SESOI of $\pm 0.05$ ($p = 8.0\times10^{-16}$; retrain-level check
554
+ $p = 8.3\times10^{-8}$). One disclosure sharpens the clustering caveat: two
555
+ retrains' held-out rows are \emph{bit-identical} --- mechanically verified as
556
+ verifier-collapse, not a data error (their raw plans share zero mapping entries,
557
+ 9 vs.\ 82 on flights, yet the verifier kills all of both, so each union
558
+ degenerates to the same deterministic plan;
559
+ \texttt{eval/results/equivalence\_coincidence.json}) --- so the $n{=}15$ rows
560
+ carry fewer independent observations than their count suggests, which is exactly
561
+ why the $n{=}5$ retrain-level test is the one we lean on. The collapse itself is
562
+ the finding in miniature: different weights, same held-out behavior, because the
563
+ verifier and the deterministic machinery decide what survives. Two reconciliations make the claim auditable. First, the
564
+ basis: the equivalence series is scored against the champion's absolute GEN-F1 of
565
+ \genChampionBasis{}, while the \genFTwo{} of \S\ref{sec:wild} is the \emph{shipped
566
+ system} at the post-freeze HEAD with all deterministic capabilities --- the
567
+ equivalence series scores each retrain's model-union path at its own capture time,
568
+ so the two figures share a metric but not a basis. Second, the SESOI: weight
569
+ interventions move GEN-F1 by at most $0.005$, while the deterministic machinery of
570
+ \S\ref{sec:capabilities} moved the unseen-pair macro from $0.10$ to \unseenMacroF{}
571
+ --- $\delta{=}0.05$ sits an order of magnitude above the measured weight effect and
572
+ well below the machinery effect, which is exactly the boundary the test is meant to
573
+ police. Mixing harvested pairs into the training blend
574
+ \emph{diluted} the synthetic skill the executor verifies (a monotonic dilution law
575
+ across mix ratios). A GRPO pilot using the executor as a verifiable reward (the
576
+ direction RLVR table work~\cite{tabler1} motivates) was negative in all three arms at
577
+ 4B/LoRA scale: the main arm and a KL-anchored variant degraded plan-format validity,
578
+ and a random-reward control arm reproduced the same drift, identifying it as an RL
579
+ artifact rather than signal~\cite{spurious}. We state this as a \emph{bounded} null,
580
+ not a universal one: at 4B/LoRA scale, under our propose/execute protocol and
581
+ training budgets, no weight intervention we ran produced measurable movement in
582
+ never-seen-table repair --- profiling visibility, reference grounding, cross-row
583
+ consensus, convention conservatism, and plan-level verification carry the capability
584
+ that exists. The bound is explicit: results with full-scale RL infrastructure
585
+ (execution-verified rewards on multi-GPU RLVR stacks~\cite{spreadsheetrl,tabler1})
586
+ show task skill moving at the same parameter scale, so our claim is about what
587
+ SFT-and-pilot-RL buy in this protocol class, not about reinforcement learning in
588
+ general. A second explicit bound: every weight experiment here uses the Qwen3
589
+ family --- and the very work we cite to explain the control arm's drift documents
590
+ that random-reward GRPO effects are themselves family-sensitive~\cite{spurious}
591
+ --- so the null is stated for Qwen3-class models pending a cross-family
592
+ replication. Concurrent evaluations corroborate the mechanism from independent
593
+ directions~\cite{distort,debate}. The practical corollary is unusual but actionable:
594
+ a contributor who wants to improve a system like this should write a deterministic
595
+ capability and gate it with the verifier, not collect more training data.
596
+
597
+ The null extends to test-time compute --- with one instructive exception that
598
+ \emph{confirms} the architecture claim. Self-consistency \emph{voting} over
599
+ $N{=}16$ temperature-0.7 samples (cell-edit-level majority, run through the
600
+ identical verifier--union pipeline) yields 0.906 precision at 0.454 coverage
601
+ versus 0.9055 at 0.4519 for matched greedy decoding on the same local runtime ---
602
+ a null at matched precision, the visibility law from the test-time side: voting
603
+ cannot surface repairs the profile does not expose, and it actively discards
604
+ verified-recoverable coverage. But pooling \emph{every} mapping from all 16
605
+ samples and letting the verifier filter the union gives the best operating point
606
+ we measure for the 4B: \textbf{0.911 precision at 0.483 coverage} ($+0.6$ points
607
+ precision, $+7.1$ points coverage over the shipped gate; an independent $N{=}8$
608
+ replication reproduces the \emph{voted} point to $\pm 0.0003$ precision /
609
+ $\pm 0.002$ coverage, and the greedy anchor exactly). The lesson is the paper's thesis in miniature: sampling
610
+ helps only as a \emph{candidate generator}; consensus adds nothing the verifier
611
+ does not already provide --- pool candidates, verify, do not vote. Separately,
612
+ the local capture path itself (Q8 quantization with grammar-constrained decoding)
613
+ is worth $+3.9$ points of coverage over the original Modal capture at equal
614
+ precision.
615
+
616
+ \subsection{Zero-label capability scaling: the verifier harness is planner-agnostic}
617
+ \label{sec:scaling}
618
+ The negative result bounds what fine-tuning small weights buys; it says nothing
619
+ about raw capability. To separate the two we dropped zero-shot, $\leq$32B
620
+ open-weights planners --- with \emph{no} task training --- into the identical
621
+ hospital pipeline the 4B fine-tune uses: same prompt contract, same
622
+ verify($\tau{=}0.5$), same union with the grounded heuristic
623
+ (Table~\ref{tab:scaling}). devstral-small-2-24B and gemma4-31B both reach
624
+ \textbf{\scalePrecBig{} precision at \scaleCovBig{} coverage} --- exceeding the
625
+ fine-tune's union point of \unionGatePrec{} at \unionGateCov{} --- while
626
+ nemotron-30B reaches \scalePrecNemo{} at \scaleCovNemo{} with JSON-plan validity
627
+ 0.4 (validity is part of the measurement: a planner that cannot reliably emit the
628
+ plan schema loses coverage before capability is measured). gpt-oss-20B is
629
+ excluded as a serving failure, documented rather than scored as capability: the
630
+ hosted proxy returned empty content on every planning call despite full-length
631
+ generation. The arm is multi-family (Mistral, Google, NVIDIA), which addresses
632
+ the single-family bound of \S\ref{sec:negative} for the inference side; the
633
+ weight-training null itself remains Qwen3-scoped. Disclosure: these models were
634
+ measured via hosted inference for speed; all are $\leq$32B open weights and
635
+ locally deployable in principle. The interpretation we draw is the paper's
636
+ sharpest: SFT at 4B does not buy held-out generalization (\S\ref{sec:negative}),
637
+ but raw capability at 24--31B does lift the same harness --- the verifier/union
638
+ architecture is the portable contribution, converting any sufficiently capable
639
+ planner into a trustworthy cleaner.
640
+
641
+ \begin{table}[t]
642
+ \centering
643
+ \caption{Zero-shot $\leq$32B planners in the identical verify($\tau{=}0.5$)+union
644
+ harness, hospital's \hospErrors{} real errors. Validity = fraction of planning
645
+ calls returning schema-valid JSON. Runtime = wall-clock for the planning calls on
646
+ hosted endpoints (single capture, no seeds; the 4B row is a prior Modal A100
647
+ capture with no comparable local figure). Each scaling row is a single capture;
648
+ the primary evidence is the union coverage delta ($+0.07$) at matched-or-better
649
+ precision, not any single cell. For context, 16-sample pooling lifts the 4B
650
+ fine-tune to $0.911@0.483$ at $16\times$ planning compute
651
+ (\S\ref{sec:negative}); the 24--31B planners reach $0.915@0.485$ in a single
652
+ greedy pass --- single-pass capability versus test-time compute, both converted
653
+ into trustworthy operating points by the same verifier. Bold marks the best union operating point.
654
+ gpt-oss-20B excluded (serving failure: empty
655
+ proxy responses, not measurable capability).
656
+ The identical devstral/gemma rows are a verified counting coincidence, not a
657
+ scoring artifact: their applied cell-edit sets share 266 of 270 cells, each
658
+ commits 4 model-specific repairs (all correct), and the totals coincide
659
+ (\texttt{eval/results/scaling\_coincidence.json}).
660
+ }
661
+ \label{tab:scaling}
662
+ \footnotesize
663
+ \begin{tabular}{lccccc}
664
+ \toprule
665
+ Planner & Params & Gated P@C & Union P@C & Validity & Runtime (s) \\
666
+ \midrule
667
+ ScrubData-v6 (Qwen3-4B fine-tune) & 4B & 0.993 @ 0.287 & 0.905 @ 0.413 & --- & --- \\
668
+ devstral-small-2 (Mistral) & 24B & 0.943 @ 0.426 & \textbf{0.915 @ 0.485} & 1.0 & \runtimeDevstral \\
669
+ nemotron-3-nano (NVIDIA) & 30B & 1.000 @ 0.138 & 0.877 @ 0.336 & 0.4 & \runtimeNemo \\
670
+ gemma4 (Google) & 31B & 0.943 @ 0.426 & \textbf{0.915 @ 0.485} & 1.0 & \runtimeGemma \\
671
+ \bottomrule
672
+ \end{tabular}
673
+ \end{table}
674
+
675
+ \subsection{Ablations}
676
+ All ablations are 3-seed means (CIs $\le\pm0.003$). Removing abstention costs $-0.013$
677
+ NORTH, raises damage to \ablNoAbstainDamage{} (from \ablFullDamage), and collapses trap
678
+ abstention to \ablNoAbstainAbstain. Removing the ambiguity margin costs $-0.006$ with
679
+ $+0.001$ damage. Removing case matching costs $-0.002$ under the churn-neutral metric
680
+ (and \emph{gained} $+0.12$ under the uncorrected metric---the artifact). Replacing
681
+ grounding with frequency clustering gains $+0.020$ NORTH, all of it from the injected
682
+ slice (\S\ref{sec:eval}), while ceding $-0.039$ real-error F1---the trade the system
683
+ refuses by design.
684
+
685
+ \subsection{Learned-repair baselines under disclosed protocols}
686
+ \label{sec:ws4}
687
+ We additionally run two learned-repair baselines on the real-error (Raha) slice,
688
+ under the identical churn-neutral metric but with honestly disclosed protocol
689
+ asymmetries. \textbf{Baran}~\cite{raha} is semi-supervised: we run its reference
690
+ configuration---oracle error positions from the dirty/gold diff plus 20 gold-labeled
691
+ tuples per dataset (its package default), without the optional Wikipedia-pretrained
692
+ value models. It reaches REAL-F1 \realFBaran{}$\,\pm$\realFBaranCI{} (3 label-sampling
693
+ seeds) at \damageBaran{} damage---an upper bound under a strictly more informed
694
+ protocol than ours (zero labels, no oracle detection); with oracle positions it can
695
+ essentially only edit true-error cells, so its near-zero damage is structural.
696
+ \textbf{Jellyfish-13B}~\cite{jellyfish} publishes per-cell error detection and
697
+ imputation but no repair task; we compose the two (detect, then impute flagged cells
698
+ with the attribute masked) --- a pipeline of our construction, not theirs. It scores
699
+ REAL-F1 \realFJelly{} at \damageJelly{} damage (single seed, recommended decoding;
700
+ note hospital is in its instruction-tuning data and flights/rayyan in its published
701
+ evaluation suite, so these numbers may flatter it). Neither baseline is run on the
702
+ 56-spec injected suite (computationally and methodologically out of scope for
703
+ semi-supervised and per-cell-LLM repair); their NORTH/INJ-F1 cells in
704
+ Table~\ref{tab:money} are blank by design. The comparison locates our contribution:
705
+ zero-config systems (ours, OpenRefine) occupy a different protocol class from
706
+ supervised repair, and the verifier (\S\ref{sec:ws1results}) is what makes the
707
+ zero-config class precise enough to trust, not what closes the labeled gap.
708
+
709
+ Table~\ref{tab:perdataset} breaks the real-error slice down per dataset at HEAD.
710
+ The verified-union rows are reported with their honest shape: off hospital the
711
+ union turns ultra-conservative --- on rayyan it commits 12 changes at 0.001
712
+ damage; on beers it holds precision 0.546 at recall 0.018. The gate's precision
713
+ premise transfers as \emph{safety} (union damage stays at 0.001--0.080) but not
714
+ as coverage. The movies\_1 union cell ($^{q}$: local Q8 capture, the disclosed
715
+ quantized protocol) is the instructive worst case: on entity-rich name columns
716
+ the quantized planner proposes plausible-but-wrong merges
717
+ (\texttt{The Longest Day}$\,\to\,$\texttt{The Longest Yard}); the verifier kills
718
+ most, and what leaks through is damage within the disclosed band with zero
719
+ credited fixes --- the planner contributes nothing there, and the system's value
720
+ is that it \emph{contains} a bad planner rather than amplifying it. This directly
721
+ answers the co-adaptation concern: hospital is where the model's learned mappings
722
+ live, and elsewhere the system abstains or contains rather than guesses.
723
+
724
+ \begin{table}[t]
725
+ \centering
726
+ \caption{Per-dataset real-error results (Raha slice), churn-neutral F1 / damage.
727
+ Grounded is the HEAD deterministic system; OR = OpenRefine reimplementations;
728
+ Union is the verified union planner ($\tau{=}0.5$) where a captured model plan
729
+ exists ($^{q}$: the movies\_1 cell is a local Q8 capture); Baran uses oracle error positions + 20 gold
730
+ labels (mean of 3 label-sampling seeds) and is a supervised reference, not a
731
+ peer.}
732
+ \label{tab:perdataset}
733
+ \footnotesize
734
+ \begin{tabular}{lccccc}
735
+ \toprule
736
+ Dataset & Grounded (HEAD) & OR fingerprint & OR kNN & Verified union & Baran (oracle+20) \\
737
+ \midrule
738
+ hospital & 0.258 / .066 & 0.000 / .000 & 0.189 / .083 & 0.567 / .001 & 0.827 / .004 \\
739
+ beers & 0.025 / .005 & 0.194 / .000 & 0.086 / .074 & 0.035 / .001 & 0.918 / .000 \\
740
+ flights & 0.127 / .082 & 0.000 / .000 & 0.014 / .065 & 0.035 / .080 & 1.000 / .000 \\
741
+ rayyan & 0.000 / .118 & 0.000 / .001 & 0.002 / .008 & 0.000 / .001 & 0.402 / .010 \\
742
+ movies\_1 & 0.714 / .025 & 0.002 / .018 & 0.001 / .072 & 0.000 / .025$^{q}$ & 0.909 / .001 \\
743
+ \midrule
744
+ macro F1 & \realFOursHead & 0.039 & 0.058 & --- & \realFBaran \\
745
+ \bottomrule
746
+ \end{tabular}
747
+ \end{table}
748
+
749
+ \subsection{A matched label budget separates the supervision regimes}
750
+ \label{sec:labelcurve}
751
+ The Baran comparison above is two points (zero labels, twenty labels); the
752
+ matched-budget curve in Figure~\ref{fig:labelcurve} measures what each label is
753
+ worth to each system on the same five-dataset real-error macro. At zero labels
754
+ Baran --- even \emph{retaining} its oracle error positions --- repairs nothing
755
+ (F1 \realFBaranZero, 3 seeds): its value models have nothing to learn from.
756
+ ScrubData operates at \realFOursHead{} with zero configuration. With labels Baran
757
+ climbs steeply (\realFBaranFive{} at $k{=}5$, \realFBaran{} at $k{=}20$): the two
758
+ systems occupy complementary supervision regimes, a relationship now measured
759
+ rather than asserted. ScrubData's own $k$-label arm uses the labels \emph{only}
760
+ to validate and expand the verifier accept set --- no retraining, no oracle
761
+ positions: $\realFOursFive \pm 0.023$ at $k{=}5$ and $\realFOursTwenty \pm 0.012$
762
+ at $k{=}20$ (3 label-sampling seeds). The disclosed asymmetry stands at every
763
+ budget: Baran keeps oracle error positions throughout, so the curve is an upper
764
+ bound in its favor.
765
+
766
+ \begin{figure}[t]
767
+ \centering
768
+ \includegraphics[width=0.62\linewidth]{fig_label_curve}
769
+ \caption{Matched-budget label curve, five-dataset real-error macro F1. At
770
+ $k{=}0$ Baran repairs nothing even with oracle error positions retained;
771
+ ScrubData operates at \realFOursHead{} with zero configuration. With labels
772
+ Baran climbs steeply --- complementary supervision regimes, measured. Error
773
+ bars ($\pm$) are standard deviations over 3 label-sampling seeds; the Baran
774
+ $k{=}20$ point reuses the 3-seed baseline run of \S\ref{sec:ws4}.}
775
+ \label{fig:labelcurve}
776
+ \end{figure}
777
+
778
+ \subsection{Degenerate baselines and cost-weighted damage}
779
+ \label{sec:degenerate}
780
+ Four degenerate policies pin the metric's floor and ceiling on the full 42-pair
781
+ bench (Table~\ref{tab:degenerate}). No-op and oracle land exactly at 0 and 1;
782
+ abstain-all is score-identical to no-op because the repair metric is flag-blind
783
+ by design (abstentions are audited separately); seeded random editing of 5\% of
784
+ cells is vandalism the metric must punish. Since F1 alone under-punishes
785
+ vandalism, we add a cost-weighted score in the Effective-Reliability style,
786
+ $\Phi_c = (\mathrm{fixes} - c\cdot\mathrm{damaged})/\mathrm{errors}$ at
787
+ $c \in \{1, 5, 10\}$: random editing scores $-0.49$ to $-4.89$, while the
788
+ shipped system stays positive at $c{=}1$ (\degShippedPhiOne) --- and goes
789
+ negative at higher $c$, which is the honest reading: at 10:1 cost asymmetry,
790
+ only near-zero-damage operating points (the verified union) are defensible.
791
+
792
+ One disclosure: the oracle acceptance check itself surfaced a scorer artifact
793
+ --- 3 cells in 1.79M held the literal string \texttt{Nan} (a first name), which
794
+ parses to float NaN and was unequal to itself --- now fixed in
795
+ \texttt{eval/metrics.py} with a regression test; published numbers shift by
796
+ less than $10^{-4}$.
797
+
798
+ \begin{table}[t]
799
+ \centering
800
+ \caption{Degenerate policies pin the metric (42 pairs, churn-neutral macro;
801
+ random-edit: seeded, 5\% of cells). $\Phi_c$ is micro-summed
802
+ $(\mathrm{fixes} - c\cdot\mathrm{damaged})$ per benchmark error. ``Shipped''
803
+ here is the deterministic grounded path on the 42 pairs (damage
804
+ \degShippedDamage), distinct from the verified-union suite row of
805
+ Table~\ref{tab:money} (damage \modelDamage).}
806
+ \label{tab:degenerate}
807
+ \small
808
+ \begin{tabular}{lccccccc}
809
+ \toprule
810
+ Policy & F1 & P & R & damage & $\Phi_1$ & $\Phi_5$ & $\Phi_{10}$ \\
811
+ \midrule
812
+ no-op & 0.000 & 1.000 & 0.000 & 0.000 & $0.00$ & $0.00$ & $0.00$ \\
813
+ abstain-all & 0.000 & 1.000 & 0.000 & 0.000 & $0.00$ & $0.00$ & $0.00$ \\
814
+ random-edit & 0.000 & 0.001 & 0.001 & 0.049 & $-0.49$ & $-2.45$ & $-4.89$ \\
815
+ oracle & 1.000 & 1.000 & 1.000 & 0.000 & $+1.00$ & $+1.00$ & $+1.00$ \\
816
+ shipped & \degShippedF & \degShippedP & 0.308 & \degShippedDamage & $+0.13$ & $-1.37$ & $-3.26$ \\
817
+ \bottomrule
818
+ \end{tabular}
819
+ \end{table}
820
+
821
+ \subsection{Calibration of abstention}
822
+ \label{sec:calibration}
823
+ On a probe of reference-entity typos plus garbage traps, retrieval confidence is a
824
+ usable selective-prediction signal: AURC \aurc, ECE \ece{} (over-confident;
825
+ temperature scaling is future work), and
826
+ precision rises monotonically with threshold---\precAtDefault{} precision at the default
827
+ $\tau{=}0.84$ (coverage \covAtDefault), and $\geq$95\% precision at
828
+ $\tau{=}\threshNinetyFive$ (coverage \covNinetyFive). Figure~\ref{fig:rc} shows the
829
+ risk--coverage curve.
830
+
831
+ \begin{figure}[t]
832
+ \centering
833
+ \includegraphics[width=0.62\linewidth]{fig_risk_coverage}
834
+ \caption{Risk--coverage for grounded city reconciliation (650 probes). Operating points
835
+ annotated; the confidence supports thresholded abstention.}
836
+ \label{fig:rc}
837
+ \end{figure}
838
+
839
+ \section{Limitations}
840
+ Reference coverage is the recall ceiling: entities absent from the taxonomy abstain by
841
+ design, which is safe but not helpful; coverage work (larger gazetteers, ROR for
842
+ organizations) moves recall directly. Our damage metric is convention-tolerant for case
843
+ and whitespace but still counts alias expansion (\texttt{NYC}$\to$\texttt{New York}) as
844
+ damage when the gold keeps the alias---a value-level convention question we leave open.
845
+ The confidence signal is over-confident (ECE \ece); temperature scaling is future
846
+ work. The injected half of the suite, while seeded and reproducible, inherits the
847
+ injector's error model; we mitigate with the real-error slice and report both. All
848
+ weight-training experiments (SFT and GRPO) use a single model family (Qwen3), so
849
+ the negative result of \S\ref{sec:negative} is family-scoped until replicated on a
850
+ second family. PII
851
+ coverage is English-only, and we make no de-identification guarantee. Finally, the
852
+ fine-tune headline is reported with multi-seed confidence intervals, but the wide-suite
853
+ model row is single-seed for cost reasons and scoped as such.
854
+
855
+ \section{Conclusion}
856
+ A planner/executor decomposition with plan-level selective prediction --- the model
857
+ proposes, a deterministic engine executes, a verifier gates every mapping --- turns
858
+ LLM data cleaning from a trust liability into an auditable system: every change is a
859
+ named, reversible operation; uncertain actions become review flags rather than silent
860
+ corruptions; and the evaluation itself is built to resist gaming. The post-freeze
861
+ program sharpened the architecture into a finding: across
862
+ five further fine-tunes and a three-arm GRPO pilot, the weights never moved
863
+ performance on never-seen tables --- deterministic visibility, grounding, consensus, and
864
+ verification did, at zero silent edits across \nWild{} wild tables and a
865
+ \nTrust{}-table trust audit. The scaling arm completes the picture: the bounded null
866
+ is about fine-tuning small weights, not about capability --- two of three zero-shot
867
+ 24--31B planners dropped into the unchanged verifier harness exceed the
868
+ fine-tune's operating point (\S\ref{sec:scaling}), so the architecture is
869
+ planner-agnostic: capability gains arrive as better operating points without
870
+ retraining. The shipped system runs
871
+ entirely locally on commodity hardware and no data leaves the machine; the
872
+ scaling-arm planners were measured via hosted endpoints, but all are locally
873
+ deployable open weights. We believe the recipe---propose/execute decomposition,
874
+ verification-by-execution, retrieval-grounded outputs, and selective prediction over
875
+ deterministic capabilities---is a template for deploying small specialized models on
876
+ other structured tasks.
877
+
878
+ \section*{Reproducibility}
879
+ \begin{sloppypar}
880
+ The model weights are public:
881
+ \url{https://huggingface.co/ricalanis/scrubdata-qwen3-4b-v6-q8}. Code, evaluation
882
+ suite, and result artifacts are released at the project repository,
883
+ \url{https://github.com/ricalanis/scrubdata-hackathon} (public upon publication,
884
+ available to reviewers from the initial submission). The \textsc{WildClean}
885
+ bundle --- redistributable dirty/gold pairs, the GitTables audit slice, open
886
+ vocabularies, result JSONs, and license-gated loaders for the non-redistributable
887
+ pairs --- is a public Hugging Face dataset
888
+ (\url{https://huggingface.co/datasets/ricalanis/wildclean}). The shipped product
889
+ planner is the identical code path measured here (\texttt{scrubdata/active.py}).
890
+ \end{sloppypar}
891
+
892
+ \paragraph{Release integrity.} Our own reproducibility QA discovered that the
893
+ published Q8\_0 GGUF was corrupted by an export bug (the export declared a wrong
894
+ end-of-generation token id inside the Qwen3 vocabulary, degenerating into
895
+ tool-call loops on all runtimes; a base-model control isolated the fault to the
896
+ export, not the adapter). It has been re-exported from the v6 adapter and
897
+ replaced under the same filename, with both sha256 checksums recorded in the
898
+ model card's Integrity section. Third-party reproduction of the model-path
899
+ numbers additionally requires constrained decoding on long prompts ---
900
+ \texttt{format=json} under Ollama, or
901
+ \texttt{suppress\_tokens=[151657,151658]} under transformers --- which is now
902
+ documented in the model card and \texttt{notebooks/Modelfile}.
903
+
904
+ \paragraph{Setup.} Clone the repository and run \texttt{uv sync} (Python 3.12;
905
+ \texttt{uv} resolves the pinned environment). The non-redistributable benchmark
906
+ pairs materialize from their original sources with the \textsc{WildClean}
907
+ \texttt{loaders.py}. Model-path results additionally need the released Q8\_0 GGUF
908
+ served by a local Ollama (\texttt{SCRUBDATA\_MODEL}); every deterministic-path
909
+ number runs with no model at all. Baran runs in the separate pinned environment
910
+ documented at the top of \texttt{eval/run\_baran.py}; Jellyfish-13B runs remotely
911
+ via Modal.
912
+
913
+ \paragraph{One command per reported number} (all from the repository root, at the
914
+ released revision):
915
+
916
+ \begin{center}
917
+ \footnotesize
918
+ \begin{tabular}{@{}ll@{}}
919
+ \toprule
920
+ Reported result & Command \\
921
+ \midrule
922
+ Wide-suite comparison (Table~\ref{tab:money}) & \texttt{python -m eval.run\_real\_multi --out eval/results} \\
923
+ Precision--coverage curve + gate & \texttt{python -m eval.precision\_curve} \\
924
+ \quad (Figure~\ref{fig:pc}, \S\ref{sec:ws1results}) & \texttt{\ \ --plan eval/results/v6\_hospital\_raw\_plan.json --union} \\
925
+ Ablations & \texttt{python -m eval.ablations} \\
926
+ Calibration (Figure~\ref{fig:rc}) & \texttt{python -m eval.calibration} \\
927
+ PII leak test & \texttt{python -m eval.pii\_leak} \\
928
+ Baran baseline & \texttt{python eval/run\_baran.py}, then \\
929
+ & \texttt{python -m eval.baselines\_learned --score-baran} \\
930
+ Jellyfish baseline & \texttt{modal run scripts/modal\_jellyfish.py} \\
931
+ \midrule
932
+ Paired bench (\S\ref{sec:wild}) & \texttt{python -m eval.paired\_bench} \\
933
+ Wild bench (\S\ref{sec:wild}) & \texttt{python -m eval.wild\_bench} \\
934
+ GitTables trust audit (\S\ref{sec:wild}) & \texttt{python -m eval.gittables\_audit} \\
935
+ Held-out-source generalization & \texttt{python -m eval.generalization} \\
936
+ \midrule
937
+ Scorer validation (\S\ref{sec:eval}) & \texttt{python -m pytest tests/test\_wildclean\_scorer.py} \\
938
+ Degenerate baselines (Table~\ref{tab:degenerate}) & \texttt{python -m eval.degenerate} \\
939
+ TOST equivalence (\S\ref{sec:negative}) & \texttt{python -m eval.equivalence} \\
940
+ Label curve (Figure~\ref{fig:labelcurve}) & \texttt{python -m eval.label\_curve} \\
941
+ Per-dataset table (Table~\ref{tab:perdataset}) & \texttt{python -m eval.raha\_table} \\
942
+ Self-consistency vote/pool (\S\ref{sec:negative}) & \texttt{python -m eval.sc\_rerank --model scrubdata-ft --n 16} \\
943
+ Scaling arm (Table~\ref{tab:scaling}) & \texttt{python -m eval.scaling\_arm} \\
944
+ \bottomrule
945
+ \end{tabular}
946
+ \end{center}
947
+
948
+ \begin{thebibliography}{20}
949
+ \bibitem{raha} M.~Mahdavi, Z.~Abedjan, R.~Castro Fernandez, S.~Madden, M.~Ouzzani,
950
+ M.~Stonebraker, N.~Tang. Raha: A Configuration-Free Error Detection System. SIGMOD
951
+ 2019; M.~Mahdavi, Z.~Abedjan. Baran: Effective Error Correction via a Unified Context
952
+ Representation and Transfer Learning. PVLDB 13(11):1948--1961, 2020.
953
+ \bibitem{holoclean} T.~Rekatsinas, X.~Chu, I.~F.~Ilyas, C.~R\'e. HoloClean: Holistic
954
+ Data Repairs with Probabilistic Inference. PVLDB 10(11), 2017. arXiv:1702.00820.
955
+ \bibitem{garf} J.~Peng, D.~Shen, N.~Tang, T.~Liu, Y.~Kou, T.~Nie, H.~Cui, G.~Yu.
956
+ Self-Supervised and Interpretable Data Cleaning with Sequence Generative Adversarial
957
+ Networks (GARF). PVLDB 16(3):433--446, 2022.
958
+ \bibitem{wrangle} A.~Narayan, I.~Chami, L.~Orr, S.~Arora, C.~R\'e. Can Foundation
959
+ Models Wrangle Your Data? PVLDB 16(4):738--746, 2022. arXiv:2205.09911.
960
+ \bibitem{jellyfish} H.~Zhang, Y.~Dong, C.~Xiao, M.~Oyamada. Jellyfish:
961
+ Instruction-Tuning Local Large Language Models for Data Preprocessing. EMNLP 2024.
962
+ arXiv:2312.01678.
963
+ \bibitem{cocoon} S.~Zhang, Z.~Huang, E.~Wu. Data Cleaning Using Large Language Models
964
+ (Cocoon). arXiv:2410.15547, 2024 (preprint; no published reproduction).
965
+ \bibitem{zeroed} W.~Ni, K.~Zhang, X.~Miao, X.~Zhao, Y.~Wu, Y.~Wang, J.~Yin. ZeroED:
966
+ Hybrid Zero-Shot Error Detection Through Large Language Model Reasoning. ICDE 2025.
967
+ arXiv:2504.05345.
968
+ \bibitem{forested} M.~Wang, J.~Wang, Q.~Liu, X.~Xu, Z.~Xing, L.~Zhu, W.~Zhang.
969
+ Ensembling LLM-Induced Decision Trees for Explainable and Robust Error Detection.
970
+ arXiv:2512.07246, 2025 (preprint).
971
+ \bibitem{autotest} Q.~Chen, Y.~He, R.~C.-W.~Wong, W.~Cui, S.~Ge, H.~Zhang, D.~Zhang,
972
+ S.~Chaudhuri. Auto-Test: Learning Semantic-Domain Constraints for Unsupervised Error
973
+ Detection in Tables. SIGMOD 2025. arXiv:2504.10762.
974
+ \bibitem{gidcl} M.~Yan, Y.~Wang, Y.~Wang, X.~Miao, J.~Li. GIDCL: A Graph-Enhanced
975
+ Interpretable Data Cleaning Framework with Large Language Models. Proc.\ ACM Manag.\
976
+ Data 2(6), Article 236, 2024 (SIGMOD).
977
+ \bibitem{spreadsheetrl} B.~Chi, Y.~Xie, M.~Wu, J.~Yang, J.~Jiang, Z.~Li, et al.
978
+ Spreadsheet-RL: Advancing Large Language Model Agents on Realistic Spreadsheet Tasks
979
+ via Reinforcement Learning. arXiv:2605.22642, 2026.
980
+ \bibitem{distort} A.~Dutta, H.~Nigam, H.~Hasanbeig, A.~Radhakrishna, S.~Gulwani.
981
+ An Empirical Investigation of Robustness in Large Language Models under Tabular
982
+ Distortions. arXiv:2601.05009, 2026.
983
+ \bibitem{debate} C.~Parmar, A.~Mehta, H.~Wu, J.~Ramamurthy, S.~Medhekar. When Helping
984
+ Hurts and How to Fix It: Multi-Agent Debate for Data Cleaning. arXiv:2606.02866, 2026.
985
+ \bibitem{tabler1} Z.~Yang, L.~Chen, A.~Cohan, Y.~Zhao. Table-R1: Inference-Time
986
+ Scaling for Table Reasoning. EMNLP 2025. arXiv:2505.23621.
987
+ \bibitem{spurious} R.~Shao, S.~S.~Li, R.~Xin, S.~Geng, Y.~Wang, et al. Spurious
988
+ Rewards: Rethinking Training Signals in RLVR. arXiv:2506.10947, 2025.
989
+ \bibitem{tablegpt} P.~Li, Y.~He, D.~Yashar, W.~Cui, S.~Ge, H.~Zhang, D.~Rifinski
990
+ Fainman, D.~Zhang, S.~Chaudhuri. Table-GPT: Table Fine-tuned GPT for Diverse Table
991
+ Tasks. Proc.\ ACM Manag.\ Data 2(3), Article 176, 2024 (SIGMOD). arXiv:2310.09263.
992
+ \bibitem{retclean} Z.~A.~Naeem, M.~S.~Ahmad, M.~Eltabakh, M.~Ouzzani, N.~Tang.
993
+ RetClean: Retrieval-Based Data Cleaning Using LLMs and Data Lakes. PVLDB 17(12), 2024
994
+ (demo). arXiv:2303.16909.
995
+ \bibitem{turl} X.~Deng, H.~Sun, A.~Lees, Y.~Wu, C.~Yu. TURL: Table Understanding
996
+ through Representation Learning. PVLDB 14(3):307--319, 2021. arXiv:2006.14806.
997
+ \bibitem{tablellama} T.~Zhang, X.~Yue, Y.~Li, H.~Sun. TableLlama: Towards Open Large
998
+ Generalist Models for Tables. NAACL 2024. arXiv:2311.09206.
999
+ \bibitem{belotti} F.~Belotti, F.~Dadda, M.~Cremaschi, R.~Avogadro, M.~Palmonari.
1000
+ Evaluating LLMs on Entity Disambiguation in Tables. arXiv:2408.06423, 2024 (preprint).
1001
+ \bibitem{racoon} L.~L.~Wei, G.~Xiao, M.~Balazinska. RACOON: An LLM-based Framework for
1002
+ Retrieval-Augmented Column Type Annotation with a Knowledge Graph. arXiv:2409.14556,
1003
+ 2024 (preprint).
1004
+ \bibitem{mtab} P.~Nguyen, N.~Kertkeidkachorn, R.~Ichise, H.~Takeda. MTab: Matching
1005
+ Tabular Data to Knowledge Graph using Probability Models. SemTab/ISWC 2019.
1006
+ arXiv:1910.00246.
1007
+ \bibitem{selective} R.~El-Yaniv, Y.~Wiener. On the Foundations of Noise-free Selective
1008
+ Classification. JMLR 11:1605--1641, 2010; Y.~Geifman, R.~El-Yaniv. Selective
1009
+ Classification for Deep Neural Networks. NeurIPS 2017.
1010
+ \bibitem{openmed} M.~Panahi. OpenMed NER: Open-Source, Domain-Adapted State-of-the-Art
1011
+ Transformers for Biomedical NER Across 12 Public Datasets. arXiv:2508.01630, 2025
1012
+ (preprint).
1013
+ \bibitem{lakens} D.~Lakens. Equivalence Tests: A Practical Primer for t Tests,
1014
+ Correlations, and Meta-Analyses. Social Psychological and Personality Science
1015
+ 8(4):355--362, 2017.
1016
+ \bibitem{grouse} S.~Muller, A.~Loison, B.~Omrani, G.~Viaud. GroUSE: A Benchmark
1017
+ to Evaluate Evaluators in Grounded Question Answering. COLING 2025.
1018
+ arXiv:2409.06595.
1019
+ \end{thebibliography}
1020
+
1021
+ \end{document}
docs/paper/numbers.tex ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ % Result macros — every value regenerates from one command (see Reproducibility section).
2
+ % Headline fine-tune (synthetic frozen gold, Layer 1)
3
+ \newcommand{\canonFOurs}{0.815} % v5 bf16, n=20 (single run; multi-seed below)
4
+ \newcommand{\canonFOursBest}{0.901} % v4 Q8 measurement
5
+ \newcommand{\canonFBig}{0.452} % large generic model (GLM-class, zero-shot)
6
+ \newcommand{\canonFHeur}{0.152} % rule heuristic
7
+ \newcommand{\canonFMultiSeed}{$0.803 \pm 0.009$ (95\% CI, 3 training seeds)}
8
+ \newcommand{\opFOurs}{0.957}
9
+ \newcommand{\jsonValidOurs}{0.950}
10
+
11
+ % Hospital head-to-head, repairs-only churn-neutral (both paths incl. errors-are-rare gates)
12
+ \newcommand{\hospRecallGrounded}{0.257}
13
+ \newcommand{\hospRecallFreq}{0.293}
14
+ \newcommand{\hospPrecGrounded}{0.845}
15
+ \newcommand{\hospPrecFreq}{0.871}
16
+ \newcommand{\hospModelRecall}{0.424} % fine-tuned v5, repair_recall (vs 0.000 synthetic-only)
17
+
18
+ % Wide-suite comparison (3 seeds, churn-neutral metric) — money table.
19
+ % PRIMARY = HEAD regeneration 2026-06-12 (eval/results/money_table_head.json,
20
+ % post-capability system, NaN metric fix in). Freeze (2026-06-10) values kept
21
+ % as *Freeze macros where the narrative discusses the freeze-version system.
22
+ \newcommand{\northGrounded}{0.224}
23
+ \newcommand{\northGroundedCI}{0.004}
24
+ \newcommand{\northORFp}{0.211}
25
+ \newcommand{\northORKnn}{0.122}
26
+ \newcommand{\realFGrounded}{0.225}
27
+ \newcommand{\realFORKnn}{0.058}
28
+ \newcommand{\damageGrounded}{0.092}
29
+ \newcommand{\damageORKnn}{0.096}
30
+ \newcommand{\northGroundedFreeze}{0.203}
31
+ \newcommand{\realFGroundedFreeze}{0.174}
32
+ \newcommand{\damageGroundedFreeze}{0.104}
33
+
34
+ % SHIPPED system (verified union, v6 adapter) on suite — scripts/modal_eval_suite.py
35
+ \newcommand{\modelRealF}{0.142}
36
+ \newcommand{\modelDamage}{0.015}
37
+ \newcommand{\modelAbstain}{1.000}
38
+
39
+ % Ablations (churn-neutral metric, 3 seeds — eval/results/ablations.json)
40
+ \newcommand{\ablFull}{0.203}
41
+ \newcommand{\ablNoGround}{0.223}
42
+ \newcommand{\ablNoAbstain}{0.190}
43
+ \newcommand{\ablNoMargin}{0.197}
44
+ \newcommand{\ablNoCase}{0.201}
45
+ \newcommand{\ablFullRealF}{0.174}
46
+ \newcommand{\ablNoGroundRealF}{0.135}
47
+ \newcommand{\ablNoAbstainDamage}{0.108}
48
+ \newcommand{\ablFullDamage}{0.104}
49
+ \newcommand{\ablFullAbstain}{1.000}
50
+ \newcommand{\ablNoAbstainAbstain}{0.250}
51
+
52
+ % Selective prediction / calibration
53
+ \newcommand{\aurc}{0.120}
54
+ \newcommand{\ece}{0.169}
55
+ \newcommand{\precAtDefault}{0.899} % threshold 0.84
56
+ \newcommand{\covAtDefault}{0.669}
57
+ \newcommand{\threshNinetyFive}{0.91}
58
+ \newcommand{\covNinetyFive}{0.206}
59
+
60
+ % PII transfer validation (OpenMed-PII 44M on bare cells)
61
+ \newcommand{\piiNameBare}{100\%}
62
+ \newcommand{\piiAddrBare}{100\%}
63
+ \newcommand{\piiNegRate}{43\%}
64
+ \newcommand{\piiLeakRate}{zero (0/360)} % eval/pii_leak.py, seeded
65
+ \newcommand{\realFORFp}{0.039}
66
+ \newcommand{\injFGrounded}{0.224}
67
+ \newcommand{\injFORFp}{0.282}
68
+ \newcommand{\damageORFp}{0.001}
69
+ \newcommand{\hospModelRecallVSix}{0.475}
70
+ \newcommand{\hospModelPrecVSix}{0.185}
71
+
72
+ % WS1 — plan-level selective prediction (verified union planner)
73
+ % repro: uv run python -m eval.precision_curve --plan eval/results/v6_hospital_raw_plan.json --union
74
+ \newcommand{\unionGatePrec}{0.905}
75
+ \newcommand{\unionGateCov}{0.413}
76
+ \newcommand{\modelGatePrec}{0.993} % gated model plan alone, tau=0.5 (146/147 correct)
77
+ \newcommand{\modelGateCov}{0.287}
78
+ \newcommand{\unionChanged}{232}
79
+ \newcommand{\unionFixed}{210}
80
+ \newcommand{\hospErrors}{509}
81
+ % 3 training seeds (mixA 21=shipped/25/26), union @ tau=0.5 (eval/results/union_gate_3seed.json)
82
+ \newcommand{\unionGateThreeSeedPrec}{$0.891 \pm 0.012$}
83
+ \newcommand{\unionGateThreeSeedCov}{$0.396 \pm 0.025$}
84
+ % WS2 pair-profiles (measured-and-cut): constrained raw plan / composed at tau=0.5
85
+ \newcommand{\pairsRawPrec}{0.760}
86
+ \newcommand{\pairsRawCov}{0.348}
87
+ \newcommand{\pairsUnionPrec}{0.876}
88
+ \newcommand{\pairsUnionCov}{0.387}
89
+
90
+ % ===== v2 (post-freeze system, 2026-06-11/12) =====
91
+ \newcommand{\nPairs}{42}
92
+ \newcommand{\nWild}{35}
93
+ \newcommand{\nTrust}{239}
94
+ \newcommand{\unseenMacroF}{0.363}
95
+ \newcommand{\unseenMacroDamage}{0.0219}
96
+ \newcommand{\wildRecovery}{0.207}
97
+ \newcommand{\genFTwo}{0.058}
98
+ \newcommand{\genVRTwo}{0.108}
99
+ \newcommand{\genDamageTwo}{0.036}
100
+ \newcommand{\ttFOne}{0.955--0.957}
101
+ \newcommand{\flightsVoteF}{0.164}
102
+ \newcommand{\flightsBaseF}{0.044}
103
+ \newcommand{\hospVoteHeur}{0.186}
104
+ \newcommand{\hospBaseHeur}{0.092}
105
+ \newcommand{\gidclHosp}{0.97}
106
+
107
+ % WS4 — learned-repair baselines, Raha real slice only (eval/baselines_learned.py)
108
+ \newcommand{\realFBaran}{0.811} % oracle detection + 20 gold labels: upper bound
109
+ \newcommand{\realFBaranCI}{0.018} % 3 label-sampling seeds
110
+ \newcommand{\damageBaran}{0.003}
111
+ \newcommand{\precBaran}{0.824}
112
+ \newcommand{\realFJelly}{0.074} % Jellyfish-13B ED+DI (scripts/modal_jellyfish.py)
113
+ \newcommand{\damageJelly}{0.027}
114
+
115
+ % W1.a — matched-budget label curve, 5-dataset Raha macro (eval/results/label_curve.json)
116
+ \newcommand{\realFBaranZero}{0.000} % Baran k=0 (oracle positions retained), 3 seeds
117
+ \newcommand{\realFBaranFive}{0.504} % Baran k=5
118
+ \newcommand{\realFOursFive}{0.282} % ours k=5 (labels validate/expand accept set only)
119
+ \newcommand{\realFOursTwenty}{0.351} % ours k=20
120
+ \newcommand{\realFOursHead}{0.225} % ours k=0 at HEAD (post-freeze capabilities)
121
+
122
+ % W4.3/4.4 — degenerate baselines + cost-weighted scores (eval/results/degenerate.json)
123
+ \newcommand{\degShippedF}{0.343}
124
+ \newcommand{\degShippedP}{0.576}
125
+ \newcommand{\degShippedDamage}{0.023}
126
+ \newcommand{\degShippedPhiOne}{$+0.13$}
127
+
128
+ % W1.c — zero-shot capability scaling arm (eval/results/scaling_arm.json)
129
+ \newcommand{\scalePrecBig}{0.915} % devstral-24B and gemma4-31B union point
130
+ \newcommand{\scaleCovBig}{0.485}
131
+ \newcommand{\scalePrecNemo}{0.877}
132
+ \newcommand{\scaleCovNemo}{0.336}
133
+ % hosted wall-clock for the 5 planning calls, single capture (scaling_arm.json runtime_s)
134
+ \newcommand{\runtimeDevstral}{135}
135
+ \newcommand{\runtimeNemo}{114}
136
+ \newcommand{\runtimeGemma}{104}
137
+
138
+ % Frontier zero-shot reference point: hospital repair recall of a vanilla frontier-scale
139
+ % cloud planner run through the same propose/execute harness (2026-06-04..07 architecture
140
+ % validation captures, pre-verifier; recorded in project training-run logs). Quoted in the
141
+ % fine-tune results subsection as the zero-shot ceiling the v6 recall approaches.
142
+ \newcommand{\frontierZeroShotRecall}{0.51}
143
+
144
+ % R3 — absolute champion GEN-F1 basis of the equivalence retrain series
145
+ % (eval/results/equivalence.json spec.champion_macro_gen_f1 = 0.014606)
146
+ \newcommand{\genChampionBasis}{0.0146}
eval/README.md ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Eval harness + goalpost
2
+
3
+ Measures any planner against a **held-out** synthetic gold set (seed differs from
4
+ training, and gold is filtered to oracle-solvable so the ceiling is a clean 1.0).
5
+
6
+ ```bash
7
+ uv run eval/run_eval.py --n 300 --seed 4242
8
+ ```
9
+
10
+ Adopts the researched tooling: `jsonschema` for plan validity; set-based micro-F1 for
11
+ operations and canonicalization mappings; the **executor itself** for end-to-end
12
+ cell-recovery (the Raha-style dirty→clean comparison). promptfoo + `llm-rubric` will
13
+ wrap the report-quality layer once a model exists.
14
+
15
+ ## Metrics
16
+ - **json_valid** — plan conforms to the schema (`eval/metrics.py:PLAN_SCHEMA`).
17
+ - **op_f1 / op_r** — micro-F1 / recall over `(column, operation)` pairs vs gold.
18
+ - **canon_f1 / canon_r** — micro-F1 / recall over `(column, raw→canonical)` mapping
19
+ pairs. *This is the fuzzy skill rules can't do — the whole reason for the model.*
20
+ - **recovery** — fraction of clean-reference cells recovered by executing the plan.
21
+
22
+ ## Baseline (measured) and the goalpost
23
+
24
+ Two reference systems frame every run:
25
+ - **ORACLE** = the gold plan → the ceiling.
26
+ - **HEURISTIC** (`scrubdata.mock_plan`) = the rule-based baseline the model must beat.
27
+
28
+ Measured on the frozen 300-example gold set (`eval/gold.jsonl`, **value_counts/aggregation
29
+ format**):
30
+
31
+ | system | json_valid | op_f1 | canon_f1 | canon_r | recovery |
32
+ |---|---|---|---|---|---|
33
+ | ORACLE (gold) | 1.000 | 1.000 | 1.000 | 1.000 | **1.000** |
34
+ | HEURISTIC (baseline) | 1.000 | 0.932 | **0.189** | 0.129 | **0.637** |
35
+
36
+ **Reading:** with case-folding + typo-clustering the heuristic does the *easy*
37
+ canonicalization (collapse to most-frequent surface), but it's still ~blind to
38
+ **alias/semantic** canonicalization (`USA`→`United States`, `NYC`→`New York`) — canon_f1
39
+ 0.19 vs the oracle's 1.0. That gap is the fine-tuned model's job. (Earlier, on the old
40
+ sample-rows format, a fine-tune reached canon_f1 0.86 vs a big vanilla model's 0.45 —
41
+ proving small-aligned > big-generic; the v4 retrain re-establishes this on the new format.)
42
+
43
+ ### 🎯 Goalpost for the fine-tuned Qwen3-4B
44
+ | metric | baseline | **target** | ceiling |
45
+ |---|---|---|---|
46
+ | json_valid | 1.000 | **≥ 0.99** | 1.000 |
47
+ | op_f1 | 0.932 | **≥ 0.98** | 1.000 |
48
+ | canon_f1 | 0.189 | **≥ 0.85** | 1.000 |
49
+ | recovery | 0.637 | **≥ 0.95** | 1.000 |
50
+
51
+ A fine-tune that hits these clearly beats the (now stronger) heuristic and approaches the
52
+ oracle — the headline being **canon_f1 0.189 → ≥0.85** (alias-level canonicalization) and
53
+ **recovery 0.637 → ≥0.95**.
54
+
55
+ ## Plugging in the model
56
+ `evaluate(planner, gold)` takes any `planner(dirty_df, gold_plan) -> plan dict`. For
57
+ the model, wrap inference (build prompt via `scrubdata.prompt`, parse JSON) and pass it
58
+ in alongside the two reference systems. Track the table every fine-tune iteration; the
59
+ per-metric delta vs baseline is the cheap regression signal.
60
+
61
+ ## Layer 2 — real out-of-distribution data (`uv run eval/run_real.py`)
62
+
63
+ Raha `hospital` (1000×20, row-aligned dirty/clean). Errors are char-substitution typos
64
+ (`birminghxm`→`birmingham`) — only ~2.5% of cells. Scored with the Raha **repair**
65
+ protocol (the right metric when data is already mostly correct):
66
+
67
+ | system | recovery | repair_recall | repair_prec | broken |
68
+ |---|---|---|---|---|
69
+ | NO-OP (dirty as-is) | 0.975 | 0.000 | 0.000 | 0 |
70
+ | HEURISTIC (baseline) | 0.880 | **0.293** | 0.065 | 2041 |
71
+
72
+ (Typo-clustering now fixes ~29% of the real char-substitution errors — up from 0. The
73
+ model should push repair_recall higher and improve repair_prec.)
74
+
75
+ **Reading (honest + important):** before typo-clustering was added, the rule heuristic fixes **0** typos. Its 2021 changed
76
+ cells are **convention divergence, not errors** — our tool parses `100%`→`1.0` and
77
+ reformats phones, which this benchmark stores as raw text. That's product value, so raw
78
+ `recovery`/`broken` *understates* a standardizing tool on a foreign benchmark. The honest
79
+ metric here is **`repair_recall`** — did we fix the actual char-substitution typos
80
+ (`birminghxm`→`birmingham`)? The pre-clustering heuristic can't (scores 0); cluster-canonicalization is
81
+ the model's job. Two takeaways:
82
+ 1. **The headline real-data metric is `repair_recall`** (error-fixing), not recovery.
83
+ 2. **Product feature surfaced:** offer a "preserve original formats" toggle — some users
84
+ want raw representation kept; standardizing is the default but should be reversible
85
+ (matches PRODUCT.md's trust contract).
86
+
87
+ ### 🎯 Real-data goalpost (fine-tuned model)
88
+ | metric | NO-OP | HEURISTIC | **target** | note |
89
+ |---|---|---|---|---|
90
+ | **repair_recall** | 0.000 | 0.293 | **≥ 0.30** | the real test — fix typos via clustering |
91
+ | repair_prec | 0.000 | 0.065 | **≥ 0.70** | of cells changed, fraction that fixed an error |
92
+ | recovery | 0.975 | 0.880 | report-only | convention-sensitive; not a pass/fail gate |
93
+
94
+ The model plugs into `_score(dirty, clean, model_output)` exactly like the heuristic.
95
+
96
+ > Data auto-fetched to `data/real/hospital/` (gitignored). Add Flights/Beers/CleanML the
97
+ > same way for breadth.
98
+
99
+ ## Scale: aggregation + agentic batching (validated)
100
+
101
+ Cleaning *large* tables doesn't mean bigger prompts — it means reasoning over **patterns**:
102
+ - **Aggregation** — the profiler sends per-column `value_counts` (`[value, frequency]`), so
103
+ the prompt size depends on DISTINCT values, not rows. Rare typos sit at the tail next to
104
+ their dominant canonical (`birminghxm`:1 vs `birmingham`:312) — visible at any scale.
105
+ - **Column batching** — `scrubdata.model_planner.make_batched_planner` plans a wide table
106
+ in small column-batches, so a 20-column table never blows one prompt.
107
+
108
+ **Validated** on the real Raha hospital table (1000×20) with a *vanilla* model (no retrain):
109
+ **repair_recall 0.509** (fixed 259/509 typos), vs **0.000** for the old one-shot+sample-rows
110
+ approach. The v4 fine-tune trains on this `value_counts` format.
111
+
112
+ ---
113
+
114
+ ## The wide suite (current north-star)
115
+
116
+ The single-dataset hospital metric was retired as north-star (biased: one table,
117
+ recall-only, convention-sensitive, abstain-blind). The current harness:
118
+
119
+ - **`run_real_multi.py`** — 65-dataset suite (5 Raha real-error benchmarks + seeded
120
+ error injection over 15 harvested open-data domains), scored with a **churn-neutral**
121
+ metric (pure case/whitespace rewrites that don't restore gold count as nothing) and
122
+ aggregated as a **double macro** (error-type × domain, harmonic mean) so no single
123
+ table or error type dominates. Reports REAL vs INJECTED slices separately — injected
124
+ typos are in-distribution for frequency clustering by construction.
125
+ - **`ablations.py`** — removes one grounding component at a time (reference, abstain,
126
+ ambiguity margin, case-match). Caught two metric artifacts (churn inflation,
127
+ reference-unsafe traps) now fixed and documented in the paper.
128
+ - **`calibration.py`** — risk–coverage + ECE for the abstention confidence
129
+ (AURC 0.120; 90% precision at the default threshold, ≥95% at 0.91).
130
+ - **`pii_leak.py`** — masking leak test: 0/360 residual detectable PII.
131
+ - **`pii_slice.py`** — OOD PII column typing on Gretel test: 5/5 types, 0/7 FP.
132
+ - **`inject.py`** — seeded, self-verifying error injectors (typo/OCR/case/whitespace)
133
+ that turn any clean table into validation data.
134
+
135
+ Baselines include OpenRefine fingerprint + kNN clustering (`scrubdata/baselines.py`,
136
+ with blocking, as the real tool uses). Full results & discussion: `docs/paper/`.
eval/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Evaluation harness for the ScrubData planner.
2
+
3
+ Measures any planner (`callable(dirty_df) -> plan dict`) against a held-out gold set:
4
+ - JSON-schema validity of the plan
5
+ - operation-level micro-F1 vs the gold plan
6
+ - canonicalization mapping micro-F1 (the fuzzy skill rules can't do)
7
+ - end-to-end cell-recovery (executor(dirty, plan) vs known-clean reference)
8
+
9
+ Two reference systems frame every run:
10
+ - HEURISTIC (`scrubdata.mock_plan`) = the baseline a fine-tuned model must beat.
11
+ - ORACLE (the gold plan itself) = the goalpost ceiling (~100% by construction).
12
+ """
eval/ablations.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Ablation suite — isolate each grounding component's contribution to the north-star.
2
+
3
+ Each row turns ONE design decision off (via mock_plan's ground_cfg) and re-runs the wide
4
+ validation suite. Shows what grounding / abstention / ambiguity-checking / case-matching each
5
+ buy in F1 and (critically) in DAMAGE.
6
+
7
+ uv run python -m eval.ablations
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from scrubdata.planner import mock_plan
13
+
14
+ from .run_real_multi import evaluate_suite
15
+
16
+ ABLATIONS = [
17
+ ("full (grounded)", {}),
18
+ ("- grounding (freq-cluster)", {"use_reference": False}),
19
+ ("- abstain (map nearest)", {"threshold": 0.0, "min_margin": 0.0}),
20
+ ("- ambiguity check", {"min_margin": 0.0}),
21
+ ("- case match", {"case_match": False}),
22
+ ]
23
+
24
+
25
def main(seeds=(7, 17, 27), out: str | None = None) -> None:
    """Run every ablation over the wide validation suite and print a comparison.

    For each ``(name, ground_cfg)`` entry of ``ABLATIONS`` a planner is built
    from ``mock_plan`` with that config, evaluated once per seed through
    ``evaluate_suite``, and the per-seed metrics are averaged. A table of the
    averages is printed, followed by the deltas of each variant against the
    first row.

    Args:
        seeds: seeds handed one at a time to ``evaluate_suite``.
        out: optional path; when given, the averaged rows are written as JSON.
    """
    import json

    def mean(xs):
        xs = list(xs)
        return sum(xs) / len(xs) if xs else 0.0

    print(f"\n=== Ablation suite (wide validation suite, {len(seeds)} seeds) — each "
          "removes ONE grounding component ===\n")
    print(f"{'variant':<28}{'NORTH*':>9}{'REAL-F1':>9}{'INJ-F1':>8}{'damage':>9}{'abstain':>9}")
    print("-" * 72)
    rows = []
    for name, cfg in ABLATIONS:
        # Bind cfg as a default so each planner keeps its own config.
        planner = (lambda df, c=cfg: mock_plan(df, ground_cfg=c))
        per_seed = [evaluate_suite(planner, seed=s) for s in seeds]
        r = {k: mean(p[k] for p in per_seed)
             for k in ("north", "real", "injected", "damage", "abstain")}
        mu = r["north"]
        # Population variance across seeds -> normal-approximation half-width.
        var = mean([(p["north"] - mu) ** 2 for p in per_seed])
        # Guard the empty-seeds case, which previously divided by zero.
        r["north_ci"] = (1.96 * (var ** 0.5) / (len(per_seed) ** 0.5)
                         if per_seed else 0.0)
        rows.append((name, r))
        print(f"{name:<28}{r['north']:>9.3f}{r['real']:>9.3f}{r['injected']:>8.3f}"
              f"{r['damage']:>9.3f}{r['abstain']:>9.3f}", flush=True)
    full = rows[0][1]
    print("\nDeltas vs full (what each component buys):")
    for name, r in rows[1:]:
        print(f" {name:<28} ΔNORTH={r['north'] - full['north']:+.3f} "
              f"Δdamage={r['damage'] - full['damage']:+.3f} Δabstain={r['abstain'] - full['abstain']:+.3f}")
    if out:
        # Context manager so the handle is flushed and closed deterministically.
        with open(out, "w") as fh:
            json.dump([{"variant": n, **r, "seeds": list(seeds)} for n, r in rows],
                      fh, indent=1)
        print(f"rows written to {out}")
    print("\nGrounding lifts F1; abstain + ambiguity-check cut DAMAGE; case-match avoids "
          "convention damage. The combination is the contribution.")
58
+
59
+
60
+ if __name__ == "__main__":
61
+ import argparse
62
+ ap = argparse.ArgumentParser()
63
+ ap.add_argument("--out", type=str, default=None)
64
+ main(out=ap.parse_args().out)
eval/baselines_learned.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """WS4 learned-repair baselines: scoring + Jellyfish prompt construction.
2
+
3
+ Both baselines bypass plan dicts (the executor is column-level by design; learned repair
4
+ is per-cell) — they produce repaired DataFrames scored by the SAME churn-neutral
5
+ `eval.run_real_multi.score` as every other row of the money table.
6
+
7
+ * Baran: repaired CSVs come from eval/run_baran.py (pinned env). Score here:
8
+ uv run python -m eval.baselines_learned --score-baran
9
+ * Jellyfish: prompts built here (unit-testable without a GPU), executed by
10
+ scripts/modal_jellyfish.py (vLLM on Modal), scored in-run with the same `score`.
11
+
12
+ Jellyfish has NO repair task — we compose its two published cell-level tasks:
13
+ error detection (yes/no per cell) then data imputation (infer the flagged cell with the
14
+ attribute removed). Prompt templates are verbatim from the NECOUDBFM/Jellyfish-13B model
15
+ card; this composition is OURS, not theirs (disclosed in the paper).
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ from pathlib import Path
23
+
24
+ SYSTEM_MESSAGE = ("You are an AI assistant that follows instruction extremely well. "
25
+ "Help as much as you can.")
26
+
27
+ _ED_TEMPLATE = (
28
+ "Your task is to determine if there is an error in the value of a specific "
29
+ "attribute within the whole record provided.\n"
30
+ "The attributes may include {attrs}.\n"
31
+ "Errors may include, but are not limited to, spelling errors, inconsistencies, "
32
+ "or values that don't make sense given the context of the whole record.\n"
33
+ "Record [{record}]\n"
34
+ "Attribute for Verification: [{col}: {val}]\n"
35
+ "Question: Is there an error in the value of {col}? "
36
+ "Choose your answer from: [Yes, No]."
37
+ )
38
+
39
+ _DI_TEMPLATE = (
40
+ "You are presented with a {keyword} record that is missing a specific attribute: "
41
+ "{col}.\n"
42
+ "Your task is to deduce or infer the value of {col} using the available "
43
+ "information in the record.\n"
44
+ "You may be provided with fields like {attrs} to help you in the inference.\n"
45
+ "Record: [{record}]\n"
46
+ "Based on the provided record, what would you infer is the value for the missing "
47
+ "attribute {col}?\n"
48
+ "Answer only the value of {col}."
49
+ )
50
+
51
+
52
def wrap_prompt(user_message: str) -> str:
    """Embed *user_message* in the Jellyfish-13B chat scaffold.

    The system message comes first, then an ``### Instruction:`` section
    holding the user message, then an empty ``### Response:`` section.
    """
    instruction = f"### Instruction:\n\n{user_message}"
    return f"{SYSTEM_MESSAGE}\n\n{instruction}\n\n### Response:\n\n"
55
+
56
+
57
+ def _serialize(record: dict, skip: str | None = None) -> str:
58
+ return ", ".join(f"{k}: {v}" for k, v in record.items() if k != skip)
59
+
60
+
61
def ed_prompt(record: dict, col: str) -> str:
    """Build the whole-record error-detection prompt for the cell ``record[col]``."""
    attribute_names = ", ".join(record.keys())
    question = _ED_TEMPLATE.format(
        attrs=attribute_names,
        record=_serialize(record),
        col=col,
        val=record[col],
    )
    return wrap_prompt(question)
66
+
67
+
68
def di_prompt(record: dict, col: str, keyword: str) -> str:
    """Build the data-imputation prompt for a flagged cell.

    The attribute *col* is left out of both the attribute list and the
    serialized record, so the model has to infer the value rather than copy it.
    """
    other_attrs = ", ".join(name for name in record.keys() if name != col)
    body = _DI_TEMPLATE.format(
        keyword=keyword,
        col=col,
        attrs=other_attrs,
        record=_serialize(record, skip=col),
    )
    return wrap_prompt(body)
75
+
76
+
77
def parse_ed(text: str) -> bool:
    """Return True when the model's answer says the cell is erroneous.

    The answer is trimmed, lower-cased and stripped of leading ``[`` before
    checking that it starts with ``yes``.
    """
    answer = text.strip().lower()
    answer = answer.lstrip("[")
    return answer.startswith("yes")
80
+
81
+
82
def parse_di(text: str, original: str) -> str:
    """Return the imputed value, or *original* (abstain) when unusable.

    An answer is unusable when, after trimming whitespace and surrounding
    double quotes, it is empty, spans several lines, or exceeds 80 characters.
    """
    candidate = text.strip().strip('"').strip()
    usable = bool(candidate) and "\n" not in candidate and len(candidate) <= 80
    return candidate if usable else original
89
+
90
+
91
+ # ---------------------------------------------------------------- Baran scoring
92
+
93
def score_baran(repaired_dir: str = "eval/results/baran",
                out: str = "eval/results/baran_raha.json") -> dict:
    """Score every ``<name>_seed<k>_repaired.csv`` found in *repaired_dir*.

    Each repaired CSV is scored against the (dirty, clean) pair returned by
    ``_raha_pair(name)`` with ``score``. Per-seed macro means are then averaged
    and a normal-approximation 95% interval is computed over seeds for F1.

    Args:
        repaired_dir: directory scanned for ``*_seed*_repaired.csv`` files.
        out: path the JSON summary is written to.

    Returns:
        The summary dict that was written to *out*.

    Raises:
        SystemExit: if no repaired CSV is found in *repaired_dir*.
    """
    import collections

    import pandas as pd

    from .run_real_multi import _raha_pair, score

    per_seed: dict[int, list] = collections.defaultdict(list)
    per_ds = []
    for p in sorted(Path(repaired_dir).glob("*_seed*_repaired.csv")):
        # "<name>_seed<k>_repaired" -> ("<name>", "<k>")
        name, seed = p.stem.rsplit("_repaired", 1)[0].rsplit("_seed", 1)
        repaired = pd.read_csv(p, dtype=str, keep_default_na=False)
        dirty, clean = _raha_pair(name)
        m = score(dirty, clean, repaired)
        per_seed[int(seed)].append(m)
        per_ds.append({"name": name, "seed": int(seed), **m})
        print(f" {name:<10} seed{seed}: F1={m['f1']:.3f} P={m['precision']:.3f} "
              f"R={m['recall']:.3f} dmg={m['damage']:.3f}")
    if not per_seed:
        raise SystemExit(f"no repaired CSVs found in {repaired_dir}")

    def mean(xs):
        xs = list(xs)
        return sum(xs) / len(xs) if xs else 0.0

    def seed_macro(key):
        # Mean over seeds of the per-seed mean over datasets.
        return mean(mean(m[key] for m in ms) for ms in per_seed.values())

    seed_f1 = [mean(m["f1"] for m in ms) for ms in per_seed.values()]
    mu = mean(seed_f1)
    var = mean([(x - mu) ** 2 for x in seed_f1])
    ci = 1.96 * (var ** 0.5) / (len(seed_f1) ** 0.5)
    result = {
        "system": "Baran (oracle detection, 20 gold labels)",
        "real_f1": mu, "real_f1_ci": ci, "real_f1_per_seed": seed_f1,
        "damage": seed_macro("damage"),
        "precision": seed_macro("precision"),
        "recall": seed_macro("recall"),
        "n_seeds": len(per_seed), "per_dataset": per_ds,
        "protocol_note": "upper bound: oracle error positions + 20 gold-labeled tuples "
                         "(its package default); damage=0 by construction",
    }
    # Context manager so the summary is flushed and the handle closed.
    with open(out, "w") as fh:
        json.dump(result, fh, indent=1)
    print(f"\nBaran macro REAL-F1 {mu:.3f} ± {ci:.3f} (n={len(seed_f1)} seeds) -> {out}")
    return result
138
+
139
+
140
+ if __name__ == "__main__":
141
+ ap = argparse.ArgumentParser()
142
+ ap.add_argument("--score-baran", action="store_true")
143
+ args = ap.parse_args()
144
+ if args.score_baran:
145
+ score_baran()
eval/calibration.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Selective prediction / calibration study for grounded canonicalization.
2
+
3
+ "Knowing when NOT to act" is the research contribution (and the AI-safety monitorability
4
+ angle): instead of always emitting a canonical, the grounded reconciler attaches a
5
+ CONFIDENCE and ABSTAINS below threshold. This module measures whether that confidence is
6
+ trustworthy:
7
+
8
+ * Risk-Coverage curve + AURC — sort decisions by confidence; as we cover more (abstain
9
+ less) does risk rise gracefully? Low AURC = a good selective predictor.
10
+ * ECE (Expected Calibration Error) — does a confidence of 0.9 actually mean ~90% correct?
11
+ * Operating point — at our default threshold, what coverage and precision do we get, and
12
+ what threshold hits a target precision (e.g. 95%)?
13
+
14
+ Probe = real cities sampled from the reference with injected typos (recoverable, gold known)
15
+ + garbage TRAP strings (acting at all is an error). Reproducible (fixed seed).
16
+
17
+ uv run python -m eval.calibration
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import random
23
+ import string
24
+
25
+ from scrubdata.reconcile import _norm, default_index
26
+
27
+
28
+ def _typo(s: str, rng: random.Random) -> str:
29
+ if len(s) < 4:
30
+ return s + rng.choice(string.ascii_lowercase)
31
+ i = rng.randrange(1, len(s) - 1)
32
+ if not s[i].isalpha():
33
+ i = 1
34
+ pool = string.ascii_lowercase if s[i].islower() else string.ascii_uppercase
35
+ return s[:i] + rng.choice([c for c in pool if c != s[i].lower()]) + s[i + 1:]
36
+
37
+
38
def build_probe(n_real: int = 500, n_trap: int = 150, seed: int = 5):
    """Assemble ``(value, gold|None, kind)`` probes and return them with the index.

    "real" probes are city names taken from the default index's "city"
    buckets with one injected typo (gold = the untouched name). "trap" probes
    are random lowercase strings of 5-9 characters with no gold. The combined
    list is shuffled with the same seeded generator.
    """
    idx = default_index()
    cities = []
    for bucket in idx._buckets.get("city", {}).values():
        for city, _ in bucket:
            cities.append(city)

    rng = random.Random(seed)
    sampled = rng.sample(cities, min(n_real, len(cities)))
    probe = [(_typo(city, rng), city, "real") for city in sampled]

    for _ in range(n_trap):
        length = rng.randint(5, 9)
        garbage = "".join(rng.choice(string.ascii_lowercase) for _ in range(length))
        probe.append((garbage, None, "trap"))

    rng.shuffle(probe)
    return probe, idx
51
+
52
+
53
def _scored(probe, idx, ctype="city"):
    """Return one ``(confidence, correct_if_acted)`` pair per probe.

    Confidence is the second element of ``idx.best(value, ctype)``, or 0.0 when
    there is no match. A probe counts as correct only when it is a "real"
    probe and the matched candidate normalizes to the same string as its gold.
    """
    results = []
    for value, gold, kind in probe:
        match = idx.best(value, ctype)
        if match:
            confidence = match[1]
            correct = bool(kind == "real" and _norm(match[0]) == _norm(gold))
        else:
            confidence = 0.0
            correct = False
        results.append((confidence, correct))
    return results
62
+
63
+
64
def risk_coverage(scored):
    """Compute the risk-coverage curve and its area (AURC).

    Args:
        scored: iterable of ``(confidence, correct)`` pairs.

    Returns:
        ``(curve, aurc)`` where ``curve`` lists ``(coverage, risk, confidence)``
        after acting on the k most confident decisions, and ``aurc`` is the
        mean risk over the curve. Empty input yields ``([], 0.0)`` instead of
        raising ZeroDivisionError.
    """
    rows = sorted(scored, key=lambda x: -x[0])
    if not rows:
        return [], 0.0
    n, cum = len(rows), 0
    curve = []
    for k, (conf, ok) in enumerate(rows, 1):
        cum += int(ok)
        curve.append((k / n, 1 - cum / k, conf))  # coverage, risk, confidence
    aurc = sum(r for _, r, _ in curve) / len(curve)
    return curve, aurc
73
+
74
+
75
def ece(scored, bins: int = 10) -> float:
    """Expected calibration error over equal-width confidence bins on [0, 1].

    Each bin contributes ``|mean confidence - accuracy|`` weighted by its share
    of all decisions. A confidence of exactly 1.0 falls in the last bin.
    """
    total = len(scored)
    error = 0.0
    last = bins - 1
    for b in range(bins):
        lo = b / bins
        hi = (b + 1) / bins
        members = []
        for conf, ok in scored:
            inside = lo <= conf < hi
            top_edge = b == last and conf == 1.0
            if inside or top_edge:
                members.append((conf, ok))
        if members:
            size = len(members)
            mean_conf = sum(c for c, _ in members) / size
            accuracy = sum(int(ok) for _, ok in members) / size
            error += size / total * abs(mean_conf - accuracy)
    return error
87
+
88
+
89
def operating_point(scored, threshold: float):
    """Return ``(coverage, precision)`` when acting only at ``conf >= threshold``.

    Coverage is the fraction of decisions acted on; precision is the fraction
    of acted decisions that are correct, defined as 1.0 when nothing is acted
    on. Empty input yields ``(0.0, 1.0)`` instead of raising ZeroDivisionError.
    """
    if not scored:
        return 0.0, 1.0
    acted = [(c, ok) for c, ok in scored if c >= threshold]
    coverage = len(acted) / len(scored)
    precision = (sum(int(ok) for _, ok in acted) / len(acted)) if acted else 1.0
    return coverage, precision
94
+
95
+
96
def main() -> None:
    """Build the probe, score it, and print AURC, ECE and operating points."""
    probe, idx = build_probe()
    scored = _scored(probe, idx)
    curve, aurc = risk_coverage(scored)
    e = ece(scored)

    print(f"\n=== Selective prediction / calibration — grounded city reconciliation "
          f"({len(probe)} probes: real typos + traps) ===\n")
    print(f" AURC (area under risk-coverage, lower=better) = {aurc:.4f}")
    print(f" ECE (expected calibration error, lower=better) = {e:.4f}")
    print("\n Risk-Coverage operating points:")
    print(f" {'threshold':>10}{'coverage':>10}{'precision':>11}")
    for t in (0.70, 0.78, 0.84, 0.90, 0.95, 1.00):
        cov, prec = operating_point(scored, t)
        print(f" {t:>10.2f}{cov:>10.3f}{prec:>11.3f}")

    # Lowest threshold in 0.70..1.00 (0.01 steps) reaching >=95% precision;
    # falls back to 1.0 when none does.
    best_t = 1.0
    for step in range(70, 101):
        candidate = step / 100
        if operating_point(scored, candidate)[1] >= 0.95:
            best_t = candidate
            break
    cov95, _ = operating_point(scored, best_t)
    print(f"\n -> for >=95% precision use threshold {best_t:.2f} (coverage {cov95:.3f}). "
          "The confidence is trustworthy enough to ABSTAIN on the rest — the safety contract.")
116
+
117
+
118
+ if __name__ == "__main__":
119
+ main()
eval/capture_plan_local.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Capture a raw v6 model plan LOCALLY (Ollama Q8_0 GGUF) for a Raha dataset.
2
+
3
+ Mirrors the Modal capture composition (scripts/modal_eval_v5.py --capture):
4
+ make_batched_planner(base, batch_size=4), greedy, no grounded wrapper, no union —
5
+ verification/union happen downstream (eval/raha_table.py, eval/precision_curve.py).
6
+ DISCLOSED deltas vs the Modal captures: (1) Q8_0 GGUF on local Ollama instead of the
7
+ bf16 merged adapter on A100 — quantization may shift individual mappings; (2) Ollama
8
+ format=json instead of generate(suppress_tokens=[151657,151658]) — both exist solely
9
+ to block the degenerate <tool_call> first token (without either, generation loops).
10
+
11
+ Prereq: ollama pull hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8:Q8_0
12
+ ollama create scrubdata-ft -f notebooks/Modelfile
13
+
14
+ uv run python -m eval.capture_plan_local --dataset beers
15
+ Writes eval/results/v6_<dataset>_raw_plan_localq8.json.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import time
23
+ from pathlib import Path
24
+
25
+ from scrubdata.model_planner import _extract_json, make_batched_planner
26
+
27
+ from .run_real_multi import _raha_pair
28
+
29
+
30
def make_json_constrained_planner(model: str, host: str = "http://localhost:11434",
                                  timeout: int = 600):
    """Return a planner that queries a local Ollama server with ``format=json``.

    The returned callable profiles the dirty frame, builds the user prompt,
    POSTs a non-streaming chat request to ``<host>/api/chat`` and parses the
    reply with ``_extract_json``. Failures are reported as a dict holding an
    ``__error__`` key rather than raised.
    """
    import urllib.request

    from scrubdata.profiler import profile_dataframe
    from scrubdata.prompt import SYSTEM_PROMPT, build_user_prompt

    endpoint = host + "/api/chat"

    def planner(dirty_df, *_):
        user = build_user_prompt(profile_dataframe(dirty_df), dirty_df)
        messages = [{"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user}]
        options = {"temperature": 0, "num_predict": 2000, "num_ctx": 16384}
        payload = {"model": model, "stream": False, "format": "json",
                   "messages": messages, "options": options}
        body = json.dumps(payload).encode()
        req = urllib.request.Request(
            endpoint, data=body, headers={"Content-Type": "application/json"})
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                reply = json.loads(resp.read())["message"]["content"]
        except Exception as exc:  # noqa: BLE001
            # Best-effort: a failed batch is reported, not raised.
            print(f" batch failed: {str(exc)[:80]}", flush=True)
            return {"__error__": str(exc)[:120]}

        plan = _extract_json(reply)
        if plan is None:
            print(f" batch returned no JSON: {reply[:80]!r}", flush=True)
            return {"__error__": "no_json"}
        # Make sure the three top-level sections are always present.
        for section in ("table_operations", "columns", "flags"):
            plan.setdefault(section, [])
        return plan

    return planner
64
+
65
+
66
def main() -> None:
    """Capture a raw plan for one Raha dataset and write it as JSON.

    Parses ``--dataset`` (required), ``--model`` and ``--timeout``, runs the
    JSON-constrained planner through ``make_batched_planner`` with batches of
    four, and writes the plan next to this module under
    ``results/v6_<dataset>_raw_plan_localq8.json``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True)
    ap.add_argument("--model", default="scrubdata-ft")
    ap.add_argument("--timeout", type=int, default=600)
    args = ap.parse_args()

    dirty, _clean = _raha_pair(args.dataset)  # same table the scorer sees
    print(f"capturing plan: {args.dataset} ({len(dirty)} rows x {dirty.shape[1]} cols)",
          flush=True)
    t0 = time.time()
    plan = make_batched_planner(make_json_constrained_planner(args.model, timeout=args.timeout),
                                batch_size=4)(dirty)
    dt = time.time() - t0
    n_ops = sum(len(c.get("operations", [])) for c in plan.get("columns", []))
    print(f"done in {dt:.0f}s — {len(plan.get('columns', []))} columns, {n_ops} ops")

    out = (Path(__file__).resolve().parent / "results"
           / f"v6_{args.dataset}_raw_plan_localq8.json")
    # Context manager so the handle is closed and the JSON fully flushed.
    with open(out, "w") as fh:
        json.dump(plan, fh, indent=1)
    print(f"written to {out}")
87
+
88
+
89
+ if __name__ == "__main__":
90
+ main()
eval/contamination_probe.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Memorization probe (W4.6): can a web-trained model complete benchmark rows verbatim?
2
+
3
+ Legacy-public benchmarks (hospital et al., GitHub since 2019) sit inside every base
4
+ model's training window; a HIGH verbatim-completion rate red-flags memorized gold.
5
+ A low rate does not prove absence — the contamination statement stays assumption-based.
6
+ Control: a date-stamped post-cutoff wild harvest (expected ~0).
7
+
8
+ uv run python -m eval.contamination_probe
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import random
14
+ import subprocess
15
+ from pathlib import Path
16
+
17
+ import pandas as pd
18
+
19
+ ROOT = Path(__file__).resolve().parent.parent
20
+ N_ROWS, N_GIVEN, MODEL = 30, 5, "glm-5.1"
21
+
22
+
23
def probe(df: pd.DataFrame, name: str) -> dict:
    """Ask the model to complete sampled rows and count verbatim cell matches.

    Up to ``N_ROWS`` rows are sampled with a fixed seed. For each row the first
    ``N_GIVEN`` columns are shown and the next four are asked for. A cell is a
    hit when its lower-cased, stripped value is non-empty, is not ``"nan"``,
    and occurs as a substring of the lower-cased model output.

    Returns:
        Dict with the table name, row count, cells asked, verbatim hits, hit
        rate, and ``failed_calls``: how many model invocations timed out or
        exited non-zero. A non-zero ``failed_calls`` means the rate is biased
        low and should not be read as evidence of no memorization.
    """
    rng = random.Random(0)
    rows = rng.sample(range(len(df)), min(N_ROWS, len(df)))
    cols = list(df.columns)
    given, asked = cols[:N_GIVEN], cols[N_GIVEN:N_GIVEN + 4]
    hits = total = failed = 0
    for r in rows:
        record = df.iloc[r]  # fetch the row once instead of once per cell
        prompt = (f"This is a row from the well-known public dataset '{name}'. "
                  f"Complete the remaining fields EXACTLY as they appear in the dataset. "
                  f"Known fields: "
                  + "; ".join(f"{c}={record[c]}" for c in given)
                  + ". Respond ONLY with: " + "; ".join(f"{c}=<value>" for c in asked))
        try:
            proc = subprocess.run(["oll", prompt, "--model", MODEL, "--max-tokens", "200"],
                                  capture_output=True, text=True, timeout=120)
        except subprocess.TimeoutExpired:
            # One slow completion should not abort the whole probe; record it.
            failed += 1
            out = ""
        else:
            if proc.returncode != 0:
                failed += 1
            out = proc.stdout.lower()
        for c in asked:
            total += 1
            v = str(record[c]).strip().lower()
            if v and v != "nan" and v in out:
                hits += 1
    return {"table": name, "rows": len(rows), "cells_asked": total,
            "verbatim_hits": hits, "rate": round(hits / max(total, 1), 4),
            "failed_calls": failed}
44
+
45
+
46
def main() -> None:
    """Probe the hospital benchmark and the wild control table, then save results.

    Both CSVs are read relative to ``ROOT`` and cast to strings. The combined
    result is written to ``eval/results/contamination_probe.json`` and the
    per-table probes are printed.
    """
    hosp = pd.read_csv(ROOT / "data" / "real" / "hospital" / "clean.csv").astype(str)
    wild = pd.read_csv(ROOT / "data" / "wild" / "glassdoor_jobs.csv").astype(str)
    res = {"model": MODEL, "protocol": f"{N_ROWS} rows, {N_GIVEN} given cols, 4 asked cols, exact-substring match",
           "probes": [probe(hosp, "hospital (Raha benchmark)"),
                      probe(wild, "glassdoor_jobs (post-cutoff wild harvest)")]}
    # Context manager so the handle is closed and the JSON fully flushed.
    with open(ROOT / "eval" / "results" / "contamination_probe.json", "w") as fh:
        json.dump(res, fh, indent=1)
    print(json.dumps(res["probes"], indent=1))
54
+
55
+
56
+ if __name__ == "__main__":
57
+ main()
eval/cross_scoring.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """B1 (W4.2) dual-metric cross-scoring on the 5 Raha real-error datasets.
2
+
3
+ Scores every system under BOTH metric families, side by side:
4
+ * original — the Raha/Baran cell-level repair protocol (Mahdavi & Abedjan,
5
+ PVLDB 13(12), p1948, Sec 6.1 + raha/dataset.py get_data_cleaning_evaluation):
6
+ values minimally normalized (html-unescape, whitespace collapse — their
7
+ value_normalizer), then RAW string equality; precision = exact-gold repairs /
8
+ cells changed; recall = exact-gold repairs / (dirty->clean diff); no
9
+ churn-neutrality, no case folding, no semantic tolerance, no damage metric.
10
+ * churn_neutral — our eval.run_real_multi.score (the scoring contract):
11
+ convention-normalized, churn ignored, damage reported.
12
+
13
+ Systems: grounded (HEAD mock_plan), verified union (v6, tau=0.5 — identical plan
14
+ files to eval.raha_table), OpenRefine fingerprint/kNN, and Baran at labeling
15
+ budgets 0/5/20 (oracle detection; repaired CSVs from eval/run_baran.py, 3 seeds,
16
+ seed-mean). Baran-from-CSV caveat: corrections equal to the dirty value vanish
17
+ from the repaired-vs-dirty diff, so reconstructed |changed| is a lower bound on
18
+ Baran's own output_size (precision an upper bound; recall exact).
19
+
20
+ Also computes Kendall tau-b between the SYSTEM RANKINGS induced by the two F1s
21
+ (per dataset + macro), and a calibration block: our Baran oracle+20 repro vs the
22
+ published Table 3 "Baran" row (verified from the PVLDB PDF; see PUBLISHED below).
23
+
24
+ Acceptance: the churn-neutral rows must reproduce eval/results/raha_per_dataset.json
25
+ exactly (checked, hard-fails otherwise).
26
+
27
+ uv run python -m eval.cross_scoring
28
+ Writes eval/results/cross_scoring.json and prints LaTeX rows.
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import html
34
+ import json
35
+ import re
36
+ from pathlib import Path
37
+
38
+ import pandas as pd
39
+
40
+ from scrubdata.baselines import openrefine_fingerprint_plan, openrefine_knn_plan
41
+ from scrubdata.executor import apply_plan
42
+ from scrubdata.planner import mock_plan
43
+ from scrubdata.verifier import union_plans, verify_plan
44
+
45
+ from .precision_curve import _repairs_only
46
+ from .raha_table import TAU, UNION_PLANS, _gen_plan
47
+ from .run_real_multi import RAHA, _cell_only, _raha_pair, score
48
+
49
+ RESULTS = Path(__file__).resolve().parent / "results"
50
+ BARAN_DIRS = {0: RESULTS / "baran_n0", 5: RESULTS / "baran_n5", 20: RESULTS / "baran"}
51
+
52
+ # Baran PVLDB'20 Table 3, row "Baran" (no TL): complete set of data errors given as
53
+ # input (= oracle detection), labeling budget 20, mean of 10 runs. Verified by reading
54
+ # vldb.org/pvldb/vol13/p1948-mahdavi.pdf p1957 (2026-06-12). movies_1 is not evaluated
55
+ # in the paper (its real-error sets are hospital/flights/address/beers/rayyan/it/tax).
56
+ PUBLISHED = {"hospital": {"precision": 0.88, "recall": 0.86, "f1": 0.87},
57
+ "flights": {"precision": 1.00, "recall": 1.00, "f1": 1.00},
58
+ "beers": {"precision": 0.91, "recall": 0.89, "f1": 0.90},
59
+ "rayyan": {"precision": 0.76, "recall": 0.40, "f1": 0.52}}
60
+
61
+
62
+ def _norm(v: str) -> str:
63
+ """raha.dataset.Dataset.value_normalizer, verbatim semantics."""
64
+ v = html.unescape(str(v))
65
+ v = re.sub("[\t\n ]+", " ", v, re.UNICODE)
66
+ return v.strip("\t\n ")
67
+
68
+
69
def baran_score(dirty: pd.DataFrame, clean: pd.DataFrame, out: pd.DataFrame) -> dict:
    """The original Raha/Baran repair metric over a repaired DataFrame.

    Every cell is minimally normalized with ``_norm`` and then compared by raw
    string equality. ``clean`` is read positionally, so it must share
    ``dirty``'s column order; ``out`` is matched by column name, and a column
    missing from ``out`` counts as unchanged.

    Returns:
        Dict with ``f1``, ``precision`` (exact-gold repairs / changed cells),
        ``recall`` (exact-gold repairs / erroneous cells) and the raw counts
        ``_errors``, ``_changed``, ``_tp``.
    """
    n = min(len(dirty), len(out), len(clean))
    errors = changed = tp = 0
    for j, col in enumerate(dirty.columns):
        # Look the repaired column up once per column; the previous
        # out.iloc[i][col] materialized a whole row Series for every cell.
        out_col = out[col] if col in out.columns else None
        for i in range(n):
            dv, cv = _norm(dirty.iat[i, j]), _norm(clean.iat[i, j])
            ov = _norm(out_col.iat[i]) if out_col is not None else dv
            err, chg = dv != cv, ov != dv
            errors += err
            changed += chg
            # A true positive is a changed, erroneous cell restored to gold.
            tp += chg and err and ov == cv
    p = tp / changed if changed else 0.0
    r = tp / errors if errors else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    return {"f1": f1, "precision": p, "recall": r,
            "_errors": errors, "_changed": changed, "_tp": tp}
88
+
89
+
90
def _both(dirty, clean, out) -> dict:
    """Score one repaired frame under both metric families.

    Returns a dict with the ``baran_score`` result under ``"original"`` and a
    selected subset of the ``score`` result under ``"churn_neutral"``.
    """
    contract = score(dirty, clean, out)
    original = baran_score(dirty, clean, out)
    kept = ("f1", "precision", "recall", "damage",
            "_errors", "_changed", "_fixed")
    churn_neutral = {}
    for key in kept:
        churn_neutral[key] = contract[key]
    return {"original": original, "churn_neutral": churn_neutral}
96
+
97
+
98
def kendall_tau(xs, ys) -> float:
    """Kendall tau-b (tie-corrected) of two equal-length sequences, stdlib only.

    Returns 0.0 when the tie-corrected denominator is zero (fewer than two
    items, or one sequence entirely tied).
    """
    size = len(xs)
    total_pairs = size * (size - 1) // 2
    ties_x = ties_y = concordant = discordant = 0
    for i in range(size):
        for j in range(i + 1, size):
            dx = xs[i] - xs[j]
            dy = ys[i] - ys[j]
            if dx == 0:
                ties_x += 1
            if dy == 0:
                ties_y += 1
            if dx == 0 or dy == 0:
                continue
            if (dx > 0) == (dy > 0):
                concordant += 1
            else:
                discordant += 1
    scale = ((total_pairs - ties_x) * (total_pairs - ties_y)) ** 0.5
    if not scale:
        return 0.0
    return (concordant - discordant) / scale
112
+
113
+
114
+ def _mean_rows(rows: list[dict]) -> dict:
115
+ return {k: sum(r[k] for r in rows) / len(rows) for k in rows[0]}
116
+
117
+
118
+ def main() -> None:
119
+ out = {"protocol": {
120
+ "original": "Raha/Baran convention: value_normalizer (html-unescape + "
121
+ "whitespace collapse) then raw string equality; P = exact-gold "
122
+ "repairs / changed cells, R = exact-gold repairs / (dirty->clean "
123
+ "diff); no churn-neutrality, no damage",
124
+ "churn_neutral": "eval.run_real_multi.score — the scoring contract",
125
+ "baran_rows": "oracle error positions + n gold labels, 3 seeds, seed-mean; "
126
+ "reconstructed from repaired CSVs (no-op corrections vanish: "
127
+ "|changed| lower-bounds Baran's output_size)",
128
+ "movies_1": "first 2000 rows (_raha_pair), as everywhere in the suite"},
129
+ "systems": {}}
130
+
131
+ deterministic = [("grounded", mock_plan),
132
+ ("openrefine_fingerprint", openrefine_fingerprint_plan),
133
+ ("openrefine_knn", openrefine_knn_plan)]
134
+ for label, planner in deterministic:
135
+ rows = []
136
+ for name, _dom in RAHA:
137
+ dirty, clean = _raha_pair(name)
138
+ cleaned, _ = apply_plan(dirty, _cell_only(planner(dirty)))
139
+ m = _both(dirty, clean, cleaned)
140
+ rows.append({"dataset": name, **m})
141
+ print(f" {label:<24}{name:<10} orig={m['original']['f1']:.3f} "
142
+ f"cn={m['churn_neutral']['f1']:.3f}", flush=True)
143
+ out["systems"][label] = {"per_dataset": rows}
144
+
145
+ rows = []
146
+ for name, _dom in RAHA:
147
+ base = (json.load(open(UNION_PLANS[name])) if name in UNION_PLANS
148
+ else _gen_plan(name))
149
+ dirty, clean = _raha_pair(name)
150
+ plan = _repairs_only(union_plans(verify_plan(dirty, base, tau=TAU),
151
+ mock_plan(dirty)))
152
+ cleaned, _ = apply_plan(dirty, plan)
153
+ m = _both(dirty, clean, cleaned)
154
+ rows.append({"dataset": name, **m})
155
+ print(f" {'verified_union':<24}{name:<10} orig={m['original']['f1']:.3f} "
156
+ f"cn={m['churn_neutral']['f1']:.3f}", flush=True)
157
+ out["systems"]["verified_union_v6_tau0.5"] = {"per_dataset": rows}
158
+
159
+ for n_labels, d in BARAN_DIRS.items():
160
+ rows = []
161
+ for name, _dom in RAHA:
162
+ dirty, clean = _raha_pair(name)
163
+ per_seed = []
164
+ for p in sorted(d.glob(f"{name}_seed*_repaired.csv")):
165
+ repaired = pd.read_csv(p, dtype=str, keep_default_na=False)
166
+ per_seed.append(_both(dirty, clean, repaired))
167
+ m = {"original": _mean_rows([s["original"] for s in per_seed]),
168
+ "churn_neutral": _mean_rows([s["churn_neutral"] for s in per_seed])}
169
+ rows.append({"dataset": name, "n_seeds": len(per_seed), **m})
170
+ print(f" {'baran_oracle%d' % n_labels:<24}{name:<10} "
171
+ f"orig={m['original']['f1']:.3f} "
172
+ f"cn={m['churn_neutral']['f1']:.3f}", flush=True)
173
+ out["systems"][f"baran_oracle{n_labels}"] = {"per_dataset": rows}
174
+
175
+ for sys in out["systems"].values():
176
+ for fam in ("original", "churn_neutral"):
177
+ sys[f"macro_f1_{fam}"] = _mean_rows(
178
+ [r[fam] for r in sys["per_dataset"]])["f1"]
179
+
180
+ # acceptance: churn-neutral rows == raha_per_dataset.json (exact)
181
+ ref = json.load(open(RESULTS / "raha_per_dataset.json"))
182
+ checks = []
183
+ for key, ref_key in [("grounded", "grounded"),
184
+ ("openrefine_fingerprint", "openrefine_fingerprint"),
185
+ ("openrefine_knn", "openrefine_knn"),
186
+ ("verified_union_v6_tau0.5", "verified_union_v6_tau0.5"),
187
+ ("baran_oracle20", "baran_oracle20")]:
188
+ for got, want in zip(out["systems"][key]["per_dataset"],
189
+ ref["systems"][ref_key]["per_dataset"]):
190
+ for k in ("f1", "precision", "recall", "damage"):
191
+ ok = abs(got["churn_neutral"][k] - want[k]) < 1e-9
192
+ checks.append(ok)
193
+ if not ok:
194
+ print(f"MISMATCH {key}/{got['dataset']}/{k}: "
195
+ f"{got['churn_neutral'][k]} vs {want[k]}")
196
+ out["acceptance"] = {"vs": "raha_per_dataset.json", "n_cells": len(checks),
197
+ "pass": all(checks)}
198
+ print(f"\nacceptance: {sum(checks)}/{len(checks)} cells match "
199
+ f"-> {'PASS' if all(checks) else 'FAIL'}")
200
+ if not all(checks):
201
+ raise SystemExit("acceptance FAILED")
202
+
203
+ # Kendall tau-b between system rankings under the two F1s
204
+ primary = ["grounded", "verified_union_v6_tau0.5", "openrefine_fingerprint",
205
+ "openrefine_knn", "baran_oracle20"]
206
+ extended = primary + ["baran_oracle0", "baran_oracle5"]
207
+ taus = {}
208
+ for label, sysset in [("primary", primary), ("extended", extended)]:
209
+ per_ds = {}
210
+ for i, (name, _dom) in enumerate(RAHA):
211
+ xs = [out["systems"][s]["per_dataset"][i]["original"]["f1"] for s in sysset]
212
+ ys = [out["systems"][s]["per_dataset"][i]["churn_neutral"]["f1"] for s in sysset]
213
+ per_ds[name] = kendall_tau(xs, ys)
214
+ xs = [out["systems"][s]["macro_f1_original"] for s in sysset]
215
+ ys = [out["systems"][s]["macro_f1_churn_neutral"] for s in sysset]
216
+ taus[label] = {"systems": sysset, "per_dataset": per_ds,
217
+ "macro": kendall_tau(xs, ys)}
218
+ print(f"tau-b ({label}): macro={taus[label]['macro']:.3f} " +
219
+ " ".join(f"{n}={t:.3f}" for n, t in per_ds.items()))
220
+ out["kendall_tau_b"] = taus
221
+
222
+ # calibration: our Baran oracle+20 repro (ORIGINAL metric) vs published Table 3
223
+ cal = []
224
+ b20 = {r["dataset"]: r for r in out["systems"]["baran_oracle20"]["per_dataset"]}
225
+ for name, pub in PUBLISHED.items():
226
+ ours = b20[name]["original"]
227
+ cal.append({"dataset": name, "published_f1": pub["f1"],
228
+ "published_precision": pub["precision"],
229
+ "published_recall": pub["recall"],
230
+ "repro_f1": ours["f1"], "repro_precision": ours["precision"],
231
+ "repro_recall": ours["recall"],
232
+ "delta_f1": ours["f1"] - pub["f1"]})
233
+ print(f"calibration {name:<10} published F1={pub['f1']:.2f} "
234
+ f"repro F1={ours['f1']:.3f} (d={ours['f1'] - pub['f1']:+.3f})")
235
+ out["calibration"] = {
236
+ "source": "Mahdavi & Abedjan, PVLDB 13(12) p1948, Table 3 row 'Baran' "
237
+ "(no TL): complete error set given (oracle detection), budget 20, "
238
+ "mean of 10 runs; PDF read 2026-06-12",
239
+ "notes": "their runs: full datasets, 10 label seeds, Wikipedia value models "
240
+ "available in package but Table-3 row is without TL; ours: 3 label "
241
+ "seeds, no pretraining, movies_1 not in their paper; our "
242
+ "churn-neutral macro for this row is the paper's 0.811",
243
+ "rows": cal}
244
+
245
+ dest = RESULTS / "cross_scoring.json"
246
+ json.dump(out, open(dest, "w"), indent=1)
247
+ print(f"written to {dest}")
248
+ print(latex(out))
249
+
250
+
251
# (system key in out["systems"], display label used in the LaTeX table), in row order.
LABELS = [("grounded", "Grounded (ours, deterministic)"),
          ("verified_union_v6_tau0.5", r"Verified union (v6, $\tau{=}0.5$)"),
          ("openrefine_fingerprint", "OpenRefine fingerprint"),
          ("openrefine_knn", "OpenRefine kNN"),
          ("baran_oracle20", r"Baran (oracle det.\ + 20 labels)")]


def latex(out: dict) -> str:
    """Render the cross-scoring results as a booktabs ``tabular``.

    One row per (system, dataset) with original P/R/F1 next to churn-neutral
    P/R/F1 + damage, a macro row per system, then one footer line each for
    the Kendall tau-b summary and the calibration rows.

    Args:
        out: results dict with ``systems`` (per-dataset rows and the two
            ``macro_f1_*`` values per system), ``kendall_tau_b["primary"]``
            and ``calibration["rows"]``.

    Returns:
        The table source as a single newline-joined string.
    """
    def tex(name: str) -> str:
        # Escape underscores for LaTeX. Kept outside the f-strings because a
        # backslash inside a replacement field is a SyntaxError before 3.12.
        return name.replace("_", r"\_")

    L = [r"\begin{tabular}{llrrrrrrr}", r"\toprule",
         r" & & \multicolumn{3}{c}{Original (Baran) metric} & "
         r"\multicolumn{4}{c}{Churn-neutral (ours)} \\",
         r"\cmidrule(lr){3-5}\cmidrule(lr){6-9}",
         r"System & Dataset & Prec. & Rec. & F1 & Prec. & Rec. & F1 & Damage \\",
         r"\midrule"]
    for key, label in LABELS:
        system = out["systems"][key]
        for i, r in enumerate(system["per_dataset"]):
            o, c = r["original"], r["churn_neutral"]
            # the system label is printed only on its first dataset row
            L.append(f"{label if i == 0 else ''} & "
                     f"{tex(r['dataset'])} & "
                     f"{o['precision']:.3f} & {o['recall']:.3f} & {o['f1']:.3f} & "
                     f"{c['precision']:.3f} & {c['recall']:.3f} & {c['f1']:.3f} & "
                     f"{c['damage']:.3f} \\\\")
        L.append(f" & \\emph{{macro}} & & & "
                 f"\\emph{{{system['macro_f1_original']:.3f}}} & & & "
                 f"\\emph{{{system['macro_f1_churn_neutral']:.3f}}} & \\\\")
        L.append(r"\midrule")
    t = out["kendall_tau_b"]["primary"]
    per_dataset = ", ".join(f"{tex(n)} {v:.2f}" for n, v in t["per_dataset"].items())
    L.append(r"\multicolumn{9}{l}{Kendall $\tau_b$ between system rankings: "
             f"macro {t['macro']:.2f}; per dataset " + per_dataset + r"} \\")
    cal = ", ".join(f"{tex(r['dataset'])} {r['repro_f1']:.3f} vs "
                    f"{r['published_f1']:.2f}" for r in out["calibration"]["rows"])
    L.append(r"\multicolumn{9}{l}{Calibration, original metric (our Baran oracle+20 "
             r"repro vs PVLDB'20 Table~3): " + cal + r"} \\")
    L.append(r"\bottomrule")
    L.append(r"\end{tabular}")
    return "\n".join(L)
291
+
292
+
293
+ if __name__ == "__main__":
294
+ main()
eval/degenerate.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """W4.3 + W4.4 — degenerate baselines + cost-weighted damage over the paired sets.
2
+
3
+ Four scorer-pinning policies over the same dirty/clean pairs eval/paired_bench.py
4
+ walks: no-op (output = dirty), abstain-all (no-op + flags; score-identical at the
5
+ cell level — the repair metric is flag-blind by design, flags surface in audit
6
+ metrics), random-edit (seeded vandalism: 5% of cells replaced with another value
7
+ from the same column) and oracle (output = clean, headers realigned to dirty's —
8
+ 23/42 pairs differ in header naming only; cell alignment is positional). They pin
9
+ the metric's floor (no-op F1 = 0, damage = 0), ceiling (oracle F1 = 1, damage = 0)
10
+ and show it punishes vandalism. Also reruns the SHIPPED pipeline (mock_plan) to
11
+ capture raw fix/damage cell counts and reports Effective-Reliability-style
12
+ cost-weighted scores score_c = fixes - c*damage_cells for c in {1, 5, 10}.
13
+
14
+ uv run python -m eval.degenerate
15
+ Writes eval/results/degenerate.json + docs/DEGENERATE_BASELINES.md. Per-pair rows
16
+ are cached incrementally (eval/results/degenerate_pairs.json) so a killed run
17
+ resumes where it stopped.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import json
24
+ import random
25
+ import time
26
+ from pathlib import Path
27
+
28
+ from scrubdata.executor import apply_plan
29
+ from scrubdata.planner import mock_plan
30
+
31
+ from .paired_bench import _load, pairs
32
+ from .run_real_multi import _cell_only, score
33
+
34
+ ROOT = Path(__file__).resolve().parent.parent
35
+ EDIT_FRAC = 0.05
36
+ SEED = 7
37
+ COSTS = (1, 5, 10)
38
+
39
+
40
+ def _noop(dirty, clean):
41
+ return dirty
42
+
43
+
44
+ def _abstain_all(dirty, clean):
45
+ return dirty.copy() # + flags conceptually; the cell metric is flag-blind
46
+
47
+
48
def _random_edit(dirty, clean, seed=SEED):
    """Random-edit policy: seeded vandalism of a fraction of the dirty cells.

    Samples ``max(1, int(cells * EDIT_FRAC))`` flat cell positions without
    replacement and overwrites each with a different value drawn from the
    same column, when the column has one. ``clean`` is unused.
    """
    rng = random.Random(seed)
    vandalised = dirty.copy()
    n_rows, n_cols = vandalised.shape
    # distinct values per column, in first-seen order (dict keeps insertion order)
    pools = [list(dict.fromkeys(vandalised.iloc[:, col])) for col in range(n_cols)]
    total_cells = n_rows * n_cols
    budget = max(1, int(total_cells * EDIT_FRAC))
    for flat in rng.sample(range(total_cells), budget):
        row, col = divmod(flat, n_cols)
        current = vandalised.iat[row, col]
        candidates = [value for value in pools[col] if value != current]
        if not candidates:
            continue  # single-valued column: nothing different to substitute
        vandalised.iat[row, col] = rng.choice(candidates)
    return vandalised
59
+
60
+
61
+ def _oracle(dirty, clean):
62
+ out = clean.copy()
63
+ out.columns = dirty.columns # header-naming variants only; alignment is positional
64
+ return out
65
+
66
+
67
def _shipped(dirty, clean):
    """Shipped policy: plan with ``mock_plan``, restrict via ``_cell_only``, apply.

    Returns the first element of ``apply_plan``'s result; ``clean`` is unused.
    """
    plan = _cell_only(mock_plan(dirty))
    result = apply_plan(dirty, plan)
    return result[0]
69
+
70
+
71
+ POLICIES = [("no-op", _noop), ("abstain-all", _abstain_all),
72
+ ("random-edit", _random_edit), ("oracle", _oracle),
73
+ ("shipped", _shipped)]
74
+
75
+
76
+ def _mean(xs):
77
+ xs = list(xs)
78
+ return sum(xs) / len(xs) if xs else 0.0
79
+
80
+
81
def main() -> None:
    """Score every policy on every pair, then write the JSON and markdown reports.

    Per-pair rows are cached incrementally in ``--cache`` so a killed run
    resumes where it stopped. Aggregates are computed over everything in the
    cache, written to ``--out`` and summarised in
    ``docs/DEGENERATE_BASELINES.md``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--only", default=None)
    ap.add_argument("--out", default="eval/results/degenerate.json")
    ap.add_argument("--cache", default="eval/results/degenerate_pairs.json")
    args = ap.parse_args()

    cache_path = Path(args.cache)
    if cache_path.exists():
        with cache_path.open() as fh:  # close the handle deterministically
            cache = json.load(fh)
    else:
        cache = {}
    for p in pairs():
        if args.only and p.name != args.only:
            continue
        if p.name in cache:
            continue  # already scored in an earlier (possibly killed) run
        try:
            dirty, clean = _load(p)
        except Exception as e:  # noqa: BLE001
            print(f" {p.name}: LOAD FAILED {type(e).__name__}")
            continue
        entry = {}
        for name, policy in POLICIES:
            t0 = time.perf_counter()
            m = score(dirty, clean, policy(dirty, clean))
            n = min(len(dirty), len(clean))
            clean_cells = n * dirty.shape[1] - m["_errors"]
            entry[name] = {
                "name": p.name, "errors": m["_errors"],
                "f1": m["f1"], "precision": m["precision"], "recall": m["recall"],
                "damage": m["damage"], "fixed": m["_fixed"], "changed": m["_changed"],
                # damage is a rate over clean cells; convert back to a cell count
                "damage_cells": round(m["damage"] * clean_cells),
                "sec": round(time.perf_counter() - t0, 1)}
        cache[p.name] = entry
        # the cache directory may not exist on a fresh checkout
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with cache_path.open("w") as fh:
            json.dump(cache, fh, indent=1)
        print(f" {p.name:<46} " + " ".join(
            f"{name}={entry[name]['f1']:.3f}" for name, _ in POLICIES), flush=True)
    res = {name: [cache[k][name] for k in sorted(cache)] for name, _ in POLICIES}

    out = {"n_pairs": len(res["no-op"]), "edit_frac": EDIT_FRAC, "seed": SEED,
           "policies": {}, "acceptance": {}}
    for name, _ in POLICIES:
        rows = res[name]
        E, F, D = (sum(r[k] for r in rows) for k in ("errors", "fixed", "damage_cells"))
        out["policies"][name] = {
            "macro": {k: round(_mean(r[k] for r in rows), 4)
                      for k in ("f1", "precision", "recall", "damage")},
            "micro": {"errors": E, "fixed": F, "changed": sum(r["changed"] for r in rows),
                      "damage_cells": D},
            # per_error falls back to 0.0 when there are no benchmark errors
            # (e.g. an empty cache), instead of dividing by zero
            "score_c": {f"c={c}": {"raw": F - c * D,
                                   "per_error": round((F - c * D) / E, 4) if E else 0.0}
                        for c in COSTS},
            "sec": round(sum(r["sec"] for r in rows), 1),
            "per_pair": rows}
    bad_oracle = [r["name"] for r in res["oracle"] if r["f1"] != 1.0]
    bad_noop = [r["name"] for r in res["no-op"] if r["damage"] != 0.0]
    out["acceptance"] = {"oracle_f1_all_exactly_1": not bad_oracle,
                         "noop_damage_all_exactly_0": not bad_noop,
                         "violations": {"oracle": bad_oracle, "no-op": bad_noop}}
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w") as fh:
        json.dump(out, fh, indent=1)

    P = out["policies"]
    L = ["# Degenerate baselines + cost-weighted damage (W4.3 + W4.4)", "",
         f"Same {out['n_pairs']} dirty/clean pairs as `eval/paired_bench.py`, scored with "
         "`run_real_multi.score()` (churn-neutral F1 + damage). The degenerate policies pin",
         "the metric: no-op = floor (F1 0, damage 0), oracle = ceiling (F1 1, damage 0),",
         "random-edit (seeded, 5% of cells) = vandalism the metric must punish. Abstain-all",
         "is score-identical to no-op — the repair metric is flag-blind by design.", "",
         "| policy | macro F1 | macro P | macro R | macro damage | fixed | damage cells |",
         "|---|---|---|---|---|---|---|"]
    for name, _ in POLICIES:
        ma, mi = P[name]["macro"], P[name]["micro"]
        L.append(f"| {name} | {ma['f1']:.3f} | {ma['precision']:.3f} | {ma['recall']:.3f} "
                 f"| {ma['damage']:.4f} | {mi['fixed']} | {mi['damage_cells']} |")
    L += ["", "## Cost-weighted scores (Effective-Reliability style, W4.4)", "",
          "score_c = fixes − c·damage_cells, micro-summed over all pairs; per-error =",
          f"score_c / {P['shipped']['micro']['errors']} total benchmark errors.", "",
          "| policy | " + " | ".join(f"c={c} (per-error)" for c in COSTS) + " |",
          "|---|" + "---|" * len(COSTS)]
    for name, _ in POLICIES:
        sc = P[name]["score_c"]
        cells = []
        for c in COSTS:
            cell = sc[f"c={c}"]
            cells.append(f"{cell['raw']} ({cell['per_error']:+.3f})")
        L.append(f"| {name} | " + " | ".join(cells) + " |")
    a = out["acceptance"]
    L += ["", f"Acceptance: oracle F1 = 1.0 on all pairs: **{a['oracle_f1_all_exactly_1']}** · "
              f"no-op damage = 0.0 on all pairs: **{a['noop_damage_all_exactly_0']}**",
          f"Repro: `uv run python -m eval.degenerate` (seed {SEED}, edit fraction {EDIT_FRAC})."]
    (ROOT / "docs" / "DEGENERATE_BASELINES.md").write_text("\n".join(L) + "\n")
    print(f"{out['n_pairs']} pairs x {len(POLICIES)} policies -> {args.out} "
          "+ docs/DEGENERATE_BASELINES.md")
    print("acceptance:", out["acceptance"])
169
+
170
+
171
+ if __name__ == "__main__":
172
+ main()
eval/diagnose_model.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Diagnose vanilla-model failures: truncation vs genuine schema violation.
2
+
3
+ Runs N examples through an Ollama Cloud model, categorizing each output:
4
+ empty / no_json / truncated / json_but_schema_invalid / valid
5
+ and reading `oll`'s stderr token counts to detect output hitting the cap.
6
+
7
+ uv run eval/diagnose_model.py --n 12 --model glm-5.1 --max-tokens 8000
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import json
14
+ import random
15
+ import re
16
+ import subprocess
17
+ from collections import Counter
18
+
19
+ from jsonschema import Draft202012Validator
20
+
21
+ from scrubdata.prompt import SYSTEM_PROMPT, build_user_prompt
22
+ from scrubdata.profiler import profile_dataframe
23
+ from training.generate import make_example
24
+
25
+ from .metrics import PLAN_SCHEMA
26
+
27
+ _V = Draft202012Validator(PLAN_SCHEMA)
28
+ _TOK = re.compile(r"out\s+(\d+)\s*tok", re.I)
29
+
30
+
31
def _call(user: str, model: str, max_tokens: int):
    """Run one prompt through the ``oll`` CLI.

    The user prompt goes in on stdin. Returns ``(stdout, out_tok)``, where
    ``out_tok`` is the integer matched by ``_TOK`` in stderr, or None when
    stderr has no match.
    """
    command = [
        "oll", "--model", model, "--system", SYSTEM_PROMPT,
        "--max-tokens", str(max_tokens), "--temperature", "0",
    ]
    proc = subprocess.run(command, input=user, capture_output=True,
                          text=True, timeout=300)
    hit = _TOK.search(proc.stderr or "")
    out_tok = int(hit.group(1)) if hit else None
    return proc.stdout, out_tok
41
+
42
+
43
+ def _categorize(out: str, out_tok: int | None, max_tokens: int):
44
+ s = out.strip()
45
+ if not s:
46
+ return "empty", None
47
+ i, j = s.find("{"), s.rfind("}")
48
+ if i == -1:
49
+ return "no_json", None
50
+ near_cap = out_tok is not None and out_tok >= max_tokens - 50
51
+ if j < i:
52
+ return ("truncated" if near_cap else "no_close_brace"), None
53
+ try:
54
+ plan = json.loads(s[i:j + 1])
55
+ except json.JSONDecodeError:
56
+ return ("truncated" if near_cap else "malformed_json"), None
57
+ errs = sorted(_V.iter_errors(plan), key=lambda e: e.path)
58
+ if not errs:
59
+ return "valid", None
60
+ return "schema_invalid", errs[0].message[:90]
61
+
62
+
63
def main() -> None:
    """Generate ``--n`` seeded examples, run each through the model, print a breakdown.

    Each output is classified by ``_categorize``; the per-example category and
    reported output-token count are printed, followed by the category totals.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=12)
    ap.add_argument("--model", type=str, default="glm-5.1")
    ap.add_argument("--max-tokens", type=int, default=8000)
    ap.add_argument("--seed", type=int, default=4242)
    args = ap.parse_args()

    rng = random.Random(args.seed)
    cats = Counter()
    print(f"Diagnosing {args.model} @ max_tokens={args.max_tokens} on {args.n} examples\n")
    for k in range(args.n):
        ex = make_example(rng)
        user = build_user_prompt(profile_dataframe(ex["dirty_df"]), ex["dirty_df"])
        out, out_tok = _call(user, args.model, args.max_tokens)
        cat, detail = _categorize(out, out_tok, args.max_tokens)
        cats[cat] += 1
        print(f" ex{k:2d}: {cat:<16} out_tok={out_tok}"
              + (f" [{detail}]" if detail else ""))

    print("\nBreakdown:", dict(cats))
    valid = cats.get("valid", 0)
    trunc = cats.get("truncated", 0)
    # --n 0 runs no examples; report 0% instead of dividing by zero
    valid_share = valid / args.n if args.n else 0.0
    print(f"valid={valid}/{args.n} ({valid_share:.0%}) | truncated={trunc} "
          f"| schema_invalid={cats.get('schema_invalid', 0)}")
88
+
89
+
90
+ if __name__ == "__main__":
91
+ main()
eval/equivalence.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """W2.d — TOST equivalence statistics for the SFT null (the bounded negative claim).
2
+
3
+ Operationalizes "weight interventions did not move held-out repair": paired
4
+ per-dataset GEN-F1 deltas (retrain minus champion v6) over the 3 held-out EVAL
5
+ sources x the 5-retrain SFT series (challenger seed31, v7 seed32, v8 seed33,
6
+ v9 seed34, v10 seed35), pooled (n=15). DISCLOSED granularity: the retrain series
7
+ was scored per held-out SOURCE only (eval/results/generalization_*.json) — the
8
+ 42-pair paired bench exists for the shipped pipeline, not per retrain — so the
9
+ unit here is per-dataset, not per-pair, and within-retrain deltas are clustered
10
+ (flights/rayyan deltas are near-identical across retrains). A retrain-level
11
+ robustness check (n=5 macro deltas, one per retrain) is reported alongside.
12
+
13
+ PRE-REGISTERED (docs/ROADMAP_PUBLICATION.md W2.d, before this analysis ran):
14
+ SESOI delta = +/-0.05 GEN-F1, justified as smaller than the gain deterministic
15
+ grounding provides. TOST per Lakens'17: two one-sided t-tests against the SESOI
16
+ bounds; equivalence p = max of the two. Bootstrap: 10k resamples, seed 42, 90% CI.
17
+
18
+ uv run python -m eval.equivalence
19
+ Writes eval/results/equivalence.json.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import json
25
+ from pathlib import Path
26
+
27
+ import numpy as np
28
+ from scipy import stats
29
+
30
+ RESULTS = Path(__file__).resolve().parent / "results"
31
+ SESOI = 0.05 # pre-registered (roadmap W2.d) — do not change post hoc
32
+ N_BOOT = 10_000
33
+ SEED = 42
34
+
35
+ CHAMPION = "generalization_champion.json" # champion v6/seed21 (union)
36
+ RETRAINS = [ # the five SFT retrains (paper sec:negative)
37
+ ("generalization_challenger.json", "challenger seed31"),
38
+ ("generalization_v7.json", "v7 seed32 (unicode-punct archetype)"),
39
+ ("generalization_v8.json", "v8 seed33 (+109k harvested alias vocabs)"),
40
+ ("generalization_v9.json", "v9 seed34 (+MusicBrainz hints, gidcl pairs)"),
41
+ ("generalization_v10.json", "v10 seed35 (suspects-contract)"),
42
+ ]
43
+
44
+
45
def _per_source_f1(fname: str) -> tuple[dict[str, float], float]:
    """Load one generalization result file from ``RESULTS``.

    Reads the first record of the JSON list in ``fname``.

    Returns:
        ``(per_source, gen_f1)``: a mapping from each ``per_source`` entry's
        ``source`` to its ``f1``, and the record's ``gen_f1`` value.
    """
    rec = json.loads((RESULTS / fname).read_text())[0]
    per_source = {s["source"]: s["f1"] for s in rec["per_source"]}
    return per_source, rec["gen_f1"]
48
+
49
+
50
def _tost(deltas: np.ndarray) -> dict:
    """TOST against [-SESOI, +SESOI]: two one-sided t-tests, equivalence p = max.

    Returns both one-sided p-values, their maximum (``p_tost``), and the
    sample size, mean and sample standard deviation (ddof=1) of ``deltas``.
    """
    lower_test = stats.ttest_1samp(deltas, -SESOI, alternative="greater")
    upper_test = stats.ttest_1samp(deltas, +SESOI, alternative="less")
    p_lo, p_hi = lower_test.pvalue, upper_test.pvalue
    summary = {
        "p_lower": float(p_lo),
        "p_upper": float(p_hi),
        "p_tost": float(max(p_lo, p_hi)),
        "n": int(len(deltas)),
        "mean": float(deltas.mean()),
        "sd": float(deltas.std(ddof=1)),
    }
    return summary
57
+
58
+
59
def main() -> dict:
    """Compute the TOST equivalence statistics and write ``equivalence.json``.

    Pools per-dataset F1 deltas (retrain minus champion) over every retrain in
    ``RETRAINS``, bootstraps a 90% CI of the pooled mean, runs TOST on the
    pooled deltas and on the per-retrain macro deltas, and returns the result
    dict that is also written to ``RESULTS / "equivalence.json"``.

    Raises:
        ValueError: if a retrain file's set of sources differs from the
            champion's.
    """
    champ, champ_macro = _per_source_f1(CHAMPION)
    pooled, per_retrain = [], []
    for fname, label in RETRAINS:
        ps, macro = _per_source_f1(fname)
        # explicit raise rather than assert, so the check survives `python -O`
        if set(ps) != set(champ):
            raise ValueError(f"{fname}: source mismatch vs champion")
        per_retrain.append({
            "retrain": label, "file": fname,
            "macro_gen_f1": round(macro, 6),
            "macro_delta": round(macro - champ_macro, 6),
            "per_dataset_delta": {s: round(ps[s] - champ[s], 6) for s in champ},
        })
        pooled += [ps[s] - champ[s] for s in sorted(champ)]
    deltas = np.array(pooled)

    # bootstrap the pooled mean; the 5th/95th percentiles bound the 90% CI
    rng = np.random.default_rng(SEED)
    boot = np.array([rng.choice(deltas, size=len(deltas), replace=True).mean()
                     for _ in range(N_BOOT)])
    ci = (float(np.percentile(boot, 5)), float(np.percentile(boot, 95)))

    macro_deltas = np.array([r["macro_delta"] for r in per_retrain])
    out = {
        "spec": {"sesoi": SESOI, "sesoi_preregistered": "docs/ROADMAP_PUBLICATION.md W2.d",
                 "n_boot": N_BOOT, "seed": SEED, "ci_level": 0.90,
                 "champion": CHAMPION, "champion_macro_gen_f1": round(champ_macro, 6)},
        "granularity": ("per-dataset (3 held-out sources x 5 retrains = 15 paired "
                        "deltas). Per-pair rows do not exist for the retrain series "
                        "(only the shipped pipeline was scored on the 42-pair bench); "
                        "within-retrain deltas are clustered, hence the retrain-level "
                        "robustness check below."),
        "per_retrain": per_retrain,
        "pooled_per_dataset": {
            **_tost(deltas),
            "ci90_bootstrap": [round(ci[0], 6), round(ci[1], 6)],
            "ci90_width": round(ci[1] - ci[0], 6),
            # equivalent when the whole CI lies strictly inside the SESOI band
            "equivalent_at_sesoi": bool(-SESOI < ci[0] and ci[1] < SESOI),
        },
        "retrain_level_robustness": _tost(macro_deltas),
        "caveat": ("GEN-F1 sits near floor (champion 0.015 absolute), so the bound "
                   "certifies absence of movement on a low-dynamic-range metric; "
                   "the CI width (~0.004) shows the data could have detected effects "
                   "an order of magnitude smaller than the 0.05 SESOI."),
    }
    p = out["pooled_per_dataset"]
    out["paper_sentence"] = (
        f"Across the five-retrain series the mean held-out GEN-F1 delta (retrain "
        f"minus champion, per-dataset, n={p['n']}) is {p['mean']:+.4f} (90\\% "
        f"bootstrap CI [{ci[0]:+.4f}, {ci[1]:+.4f}]); TOST rejects effects larger "
        f"than the pre-registered $\\pm$0.05 SESOI (p = {p['p_tost']:.1e}), and the "
        f"retrain-level check (n=5 macro deltas) agrees "
        f"(p = {out['retrain_level_robustness']['p_tost']:.1e}).")

    (RESULTS / "equivalence.json").write_text(json.dumps(out, indent=2) + "\n")
    print(json.dumps({k: out[k] for k in ("pooled_per_dataset",
                                          "retrain_level_robustness",
                                          "paper_sentence")}, indent=2))
    return out
116
+
117
+
118
+ if __name__ == "__main__":
119
+ main()
eval/generalization.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """D1 — the GENERALIZATION metric: held-out-source real-error evaluation.
2
+
3
+ The wide-suite REAL slice mixes sources whose pairs are IN the champion's training mix
4
+ (hospital/beers/movies_1 -> mixA), so it part-measures memorization. This metric fixes
5
+ that and one more honesty problem:
6
+
7
+ * HELD-OUT SOURCES ONLY: a model is scored only on real-error benchmarks whose pairs
8
+ were never used to train it. The split is explicit and committed (TRAIN_SOURCES);
9
+ new harvested sources must be assigned to exactly one side.
10
+ * ERROR-CLASS BREAKDOWN: benchmark errors split by the SAME variant gate the training
11
+ derivation uses (training.real_data._is_variant — one source of truth). A
12
+ canonicalization system claims competence on the VARIANT class (typos / casing /
13
+ aliases); imputation-class errors (missing or non-variant rewrites) are reported,
14
+ never hidden, but a system that abstains on them is behaving correctly.
15
+
16
+ Headline numbers per system:
17
+ GEN-F1 churn-neutral F1 over ALL errors, macro over held-out sources
18
+ VARIANT-RECALL share of variant-class errors repaired (claimed competence)
19
+ VARIANT-PREC of committed changes on variant cells, share correct
20
+ damage clean cells corrupted (churn-neutral)
21
+
22
+ DISCLOSED class imperfection: the string-variant gate over-counts on flights —
23
+ single-digit time differences ('7:59 p.m.' vs '7:58 p.m.') pass the similarity
24
+ threshold but are cross-source VALUE disagreements (need per-entity cross-row
25
+ voting, a different capability), not surface canonicalization. ~950 of flights'
26
+ 1049 "variant" errors are of this kind; treat flights' variant-recall as a
27
+ lower-bound stress number, not addressable headroom.
28
+
29
+ uv run python -m eval.generalization # grounded heuristic baseline
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import json
36
+
37
+ from scrubdata.executor import apply_plan
38
+ from scrubdata.planner import mock_plan
39
+ from training.real_data import _is_variant
40
+
41
+ from .metrics import _cell_equal
42
+ from .run_real_multi import _cell_only, _fetch, _sem_equal, score
43
+
44
+ # pairs used to train the current champion (v6 = mixA) — anything here is OFF-LIMITS
45
+ # for generalization scoring of that model. Update per training run.
46
+ TRAIN_SOURCES = {"v6": {"hospital", "beers", "movies_1"}}
47
+
48
+ # held-out real-error sources. Harvested D1 sources get appended here OR to the
49
+ # training side — never both. ed2_restaurants (stage-2 harvest): real NYC-restaurant
50
+ # typos, in-regime, EVAL-ONLY — its sibling domain source (fodors_zagats) trains, so
51
+ # this measures cross-source same-domain transfer. dblp_scholar was REJECTED as an
52
+ # eval source: its gold systematically prefers the opposite case convention from the
53
+ # dirty side (Scholar lowercase vs DBLP Title Case), which measures convention
54
+ # preference, not cleaning — the artifact this metric is designed against.
55
+ EVAL_SOURCES = ["flights", "rayyan", "ed2_restaurants"]
56
+
57
+
58
def variant_breakdown(dirty, clean, out) -> dict:
    """Split benchmark errors by class and count repairs per class (churn-neutral).

    Walks the first ``min(len(dirty), len(out), len(clean))`` rows of every
    dirty column. A cell is a benchmark error when dirty and clean differ
    under ``_cell_equal``; each error is filed as variant-class or other via
    ``_is_variant`` and counted as fixed / changed / good from the output.
    """
    n_rows = min(len(dirty), len(out), len(clean))
    counts = dict.fromkeys(
        ("variant_errors", "variant_fixed", "variant_changed",
         "variant_good", "other_errors", "other_fixed"), 0)
    for j, col in enumerate(dirty.columns):
        present = col in out.columns
        for i in range(n_rows):
            dv, cv = dirty.iat[i, j], clean.iat[i, j]
            if _cell_equal(dv, cv):
                continue  # not a benchmark error
            # a column missing from the output counts as left untouched
            ov = out.iloc[i][col] if present else dv
            changed = present and not _cell_equal(ov, dv)
            if changed and _sem_equal(ov, dv) and not _cell_equal(ov, cv):
                changed = False  # churn: ignore
            fixed = _cell_equal(ov, cv) or (_sem_equal(ov, cv) and changed)
            is_variant = (str(dv).strip() and str(cv).strip()
                          and _is_variant(str(dv), str(cv)))
            if not is_variant:
                counts["other_errors"] += 1
                counts["other_fixed"] += int(fixed)
                continue
            counts["variant_errors"] += 1
            counts["variant_fixed"] += int(fixed)
            if changed:
                counts["variant_changed"] += 1
                counts["variant_good"] += int(_sem_equal(ov, cv))
    return counts
86
+
87
+
88
def evaluate_generalization(planner, sources=None, label: str = "system") -> dict:
    """Score ``planner`` on each held-out source and aggregate the rows.

    ``sources`` falls back to ``EVAL_SOURCES`` when empty or None. For each
    source the planner's plan is cut down with ``_cell_only``, applied, then
    scored with ``score`` and ``variant_breakdown``.
    """
    sources = sources or EVAL_SOURCES
    metric_keys = ("f1", "precision", "recall", "damage")
    rows = []
    for name in sources:
        # FULL tables, no truncation — ed2_restaurants' real errors are concentrated
        # outside the first 2k rows (_raha_pair's head(2000) hid 473 of 477).
        dirty, clean = _fetch(name)
        plan = _cell_only(planner(dirty))
        cleaned, _ = apply_plan(dirty, plan)
        m = score(dirty, clean, cleaned)
        b = variant_breakdown(dirty, clean, cleaned)
        row = {"source": name}
        row.update({k: m[k] for k in metric_keys})
        row.update(b)
        rows.append(row)
        print(f" {name:<10} F1={m['f1']:.3f} dmg={m['damage']:.3f} | variant: "
              f"{b['variant_fixed']}/{b['variant_errors']} fixed, "
              f"{b['variant_good']}/{b['variant_changed']} changes good | "
              f"other: {b['other_fixed']}/{b['other_errors']}", flush=True)

    return _aggregate(rows, sources, label)
106
+
107
+
108
def evaluate_captured_union(plans: dict, sources, label: str, tau: float = 0.5) -> dict:
    """Score the SHIPPED pipeline from captured raw model plans (Modal --capture):
    per source, verify(tau) the captured plan, union with the grounded heuristic —
    byte-identical composition to scrubdata/active.py."""
    from scrubdata.verifier import union_plans, verify_plan

    metric_keys = ("f1", "precision", "recall", "damage")
    rows = []
    for name in sources:
        dirty, clean = _fetch(name)
        # verified captured plan first, then the grounded heuristic
        merged = union_plans(verify_plan(dirty, plans[name], tau=tau),
                             mock_plan(dirty))
        cleaned, _ = apply_plan(dirty, _cell_only(merged))
        m = score(dirty, clean, cleaned)
        b = variant_breakdown(dirty, clean, cleaned)
        row = {"source": name}
        row.update({k: m[k] for k in metric_keys})
        row.update(b)
        rows.append(row)
        print(f" {name:<16} F1={m['f1']:.3f} dmg={m['damage']:.3f} | variant: "
              f"{b['variant_fixed']}/{b['variant_errors']} fixed", flush=True)
    return _aggregate(rows, sources, label)
130
+
131
+
132
+ def _aggregate(rows, sources, label) -> dict:
133
+ def mean(xs):
134
+ xs = list(xs)
135
+ return sum(xs) / len(xs) if xs else 0.0
136
+
137
+ def rate(num, den):
138
+ return num / den if den else 0.0
139
+
140
+ out = {
141
+ "system": label, "sources": list(sources),
142
+ "gen_f1": mean(r["f1"] for r in rows),
143
+ "variant_recall": mean(rate(r["variant_fixed"], r["variant_errors"]) for r in rows),
144
+ "variant_precision": mean(rate(r["variant_good"], r["variant_changed"])
145
+ if r["variant_changed"] else 1.0 for r in rows),
146
+ "other_recall": mean(rate(r["other_fixed"], r["other_errors"]) for r in rows),
147
+ "damage": mean(r["damage"] for r in rows),
148
+ "per_source": rows,
149
+ }
150
+ print(f"{label}: GEN-F1={out['gen_f1']:.3f} VARIANT-RECALL={out['variant_recall']:.3f} "
151
+ f"VARIANT-PREC={out['variant_precision']:.3f} dmg={out['damage']:.3f}")
152
+ return out
153
+
154
+
155
def main() -> None:
    """CLI entry point: score the held-out sources and dump the results as JSON.

    With ``--plans`` the captured-plan union pipeline is scored; otherwise the
    grounded heuristic (``mock_plan``) and an empty-plan no-op are scored.
    The list of result dicts is written to ``--out``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--sources", default=",".join(EVAL_SOURCES))
    ap.add_argument("--plans", default=None,
                    help="JSON file {source: captured raw model plan} -> score the "
                         "shipped union pipeline instead of the local baselines")
    ap.add_argument("--label", default="captured union")
    ap.add_argument("--out", default="eval/results/generalization_baseline.json")
    args = ap.parse_args()
    sources = args.sources.split(",")
    if args.plans:
        with open(args.plans) as fh:  # close the handle deterministically
            plans = json.load(fh)
        results = [evaluate_captured_union(plans, sources, args.label)]
    else:
        results = [
            evaluate_generalization(mock_plan, sources, "grounded heuristic"),
            evaluate_generalization(
                lambda df: {"table_operations": [], "columns": [], "flags": []},
                sources, "no-op"),
        ]
    # context manager guarantees the JSON is flushed and the file closed
    with open(args.out, "w") as fh:
        json.dump(results, fh, indent=1)
    print(f"written to {args.out}")
177
+
178
+
179
+ if __name__ == "__main__":
180
+ main()
eval/gittables_audit.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """N=250 GitTables audit — the at-scale trust + repair board.
2
+
3
+ 250 real GitHub tables (LUH-DBS Matelda GitTables-subsets, Apache-2.0; injected
4
+ typos on real heterogeneous tables) scored end-to-end with the shipped pipeline:
5
+ schema validity, SILENT-EDIT attribution (the trust contract at scale), and the
6
+ churn-neutral repair metric. No inject-recovery here (these pairs carry their own
7
+ errors). Summary feeds docs/GITTABLES_AUDIT.md.
8
+
9
+ uv run python -m eval.gittables_audit
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import time
16
+ from pathlib import Path
17
+
18
+ import pandas as pd
19
+
20
+ from scrubdata.executor import apply_plan
21
+ from scrubdata.planner import mock_plan
22
+
23
+ from .metrics import is_valid
24
+ from .run_real_multi import _cell_only, score
25
+ from .wild_bench import behavioral
26
+
27
+ ROOT = Path(__file__).resolve().parent.parent
28
+ DIR = ROOT / "data" / "gittables250"
29
+ N_CAP = 3000
30
+
31
+
32
+ def _load(p: Path):
33
+ kw = dict(dtype=str, keep_default_na=False, nrows=N_CAP, on_bad_lines="skip")
34
+ try:
35
+ return pd.read_csv(p, encoding_errors="replace", **kw)
36
+ except Exception: # noqa: BLE001
37
+ return pd.read_csv(p, engine="python", **kw)
38
+
39
+
40
def main() -> None:
    """Audit every ``t*_dirty.csv`` / ``*_clean.csv`` pair in ``DIR`` and write the reports.

    Each table is run through ``behavioral``, ``mock_plan`` -> ``_cell_only``
    -> ``apply_plan`` and ``score``. Tables with fewer than 3 aligned rows or
    fewer than 2 columns are skipped; a table that raises is recorded in
    ``failures``. Writes ``eval/results/gittables_audit.json`` and
    ``docs/GITTABLES_AUDIT.md``.
    """
    slugs = sorted({p.name.split("_")[0] for p in DIR.glob("t*_dirty.csv")})
    rows, failures = [], []
    t0 = time.perf_counter()
    for slug in slugs:
        try:
            dirty = _load(DIR / f"{slug}_dirty.csv")
            clean = _load(DIR / f"{slug}_clean.csv")
            n = min(len(dirty), len(clean))
            if n < 3 or dirty.shape[1] < 2:
                continue
            # truncate both sides to the shared row count
            dirty, clean = dirty.head(n), clean.head(n)
            b = behavioral(dirty)
            plan = _cell_only(mock_plan(dirty))
            cleaned, _ = apply_plan(dirty, plan)
            m = score(dirty, clean, cleaned)
            rows.append({"table": slug, "rows": n, "cols": dirty.shape[1],
                         "plan_valid": b["plan_valid"],
                         "silent_edit_columns": len(b["silent_edit_columns"]),
                         "errors": m["_errors"], "f1": round(m["f1"], 3),
                         "damage": round(m["damage"], 4)})
        except Exception as e:  # noqa: BLE001
            failures.append(f"{slug}: {type(e).__name__}")
    dt = time.perf_counter() - t0

    n = len(rows)
    valid = sum(r["plan_valid"] for r in rows)
    silent = sum(1 for r in rows if r["silent_edit_columns"])
    scored = [r for r in rows if r["errors"] > 0]
    f1s = [r["f1"] for r in scored]
    dmgs = [r["damage"] for r in rows]
    summary = {
        "tables_audited": n, "pipeline_failures": len(failures),
        "plan_valid": valid, "tables_with_silent_edits": silent,
        "tables_with_errors": len(scored),
        "macro_f1_on_errored": round(sum(f1s) / len(f1s), 3) if f1s else None,
        # guarded like the F1 mean: no audited table means no mean, not a
        # ZeroDivisionError (e.g. when the data directory is empty)
        "macro_damage": round(sum(dmgs) / len(dmgs), 4) if dmgs else None,
        "zero_damage_tables": sum(1 for d in dmgs if d == 0),
        "seconds": round(dt, 1),
    }
    results_path = ROOT / "eval" / "results" / "gittables_audit.json"
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with results_path.open("w") as fh:  # close the handle deterministically
        json.dump({"summary": summary, "rows": rows, "failures": failures},
                  fh, indent=1)
    L = ["# GitTables N=250 audit — trust contract at scale", "",
         f"Shipped pipeline over {n} real GitHub tables (Matelda GitTables-subsets,",
         "Apache-2.0; injected typos on real heterogeneous tables).", "",
         "| metric | value |", "|---|---|"]
    for k, v in summary.items():
        L.append(f"| {k} | {v} |")
    (ROOT / "docs" / "GITTABLES_AUDIT.md").write_text("\n".join(L) + "\n")
    print(json.dumps(summary, indent=1))
    if failures:
        print("failures:", failures[:8])
92
+
93
+
94
+ if __name__ == "__main__":
95
+ main()
eval/gold.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval/gold.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Frozen held-out gold eval set (committed to eval/gold.jsonl).
2
+
3
+ A FIXED test set so every fine-tune iteration (and generator change) is scored on the
4
+ same examples — v1 vs v2 stay comparable. Regenerate intentionally with `build_gold`.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import random
11
+ from pathlib import Path
12
+
13
+ import pandas as pd
14
+
15
+ from training.generate import make_example
16
+
17
+ from . import metrics
18
+
19
# Default location of the gold file: `gold.jsonl` in the same directory as this module.
GOLD_PATH = Path(__file__).resolve().parent / "gold.jsonl"
20
+
21
+
22
def build_gold(n: int = 300, seed: int = 4242, path: Path = GOLD_PATH) -> list[dict]:
    """Generate `n` gold examples and write them to `path` as JSON lines.

    Candidates come from `make_example`, driven by a `random.Random(seed)`; a
    candidate is kept only when `metrics.recovery` of its own plan is at least
    0.999. Returns the kept examples in generation order.
    """
    rng = random.Random(seed)
    kept = []
    while len(kept) < n:
        candidate = make_example(rng)
        recovered = metrics.recovery(
            candidate["clean_df"], candidate["dirty_df"], candidate["plan"])
        if recovered >= 0.999:
            kept.append(candidate)

    with Path(path).open("w", encoding="utf-8") as sink:
        for item in kept:
            dirty_df = item["dirty_df"]
            clean_df = item["clean_df"]
            # Only the clean frame has its missing values swapped for None.
            clean_out = clean_df.where(pd.notna(clean_df), None)
            record = {
                "dirty": dirty_df.to_dict("records"),
                "clean": clean_out.to_dict("records"),
                "dirty_cols": list(dirty_df.columns),
                "clean_cols": list(clean_df.columns),
                "plan": item["plan"],
            }
            sink.write(json.dumps(record, ensure_ascii=False, default=str) + "\n")
    return kept
40
+
41
+
42
def load_gold(path: Path = GOLD_PATH) -> list[dict]:
    """Load the gold examples stored at `path`, building the file first if absent.

    Each JSON line becomes a dict with `dirty_df`, `clean_df` (DataFrames whose
    columns follow the stored column lists) and `plan`.
    """
    source = Path(path)
    if not source.exists():
        return build_gold(path=source)

    def _frame(records, columns):
        # An empty record list still yields a frame carrying the stored columns.
        if records:
            return pd.DataFrame(records)[columns]
        return pd.DataFrame(columns=columns)

    examples = []
    for raw in source.read_text(encoding="utf-8").splitlines():
        entry = json.loads(raw)
        examples.append({
            "dirty_df": _frame(entry["dirty"], entry["dirty_cols"]),
            "clean_df": _frame(entry["clean"], entry["clean_cols"]),
            "plan": entry["plan"],
        })
    return examples
55
+
56
+
57
if __name__ == "__main__":
    import argparse

    # CLI entry point: rebuild the gold file with the requested size and seed.
    parser = argparse.ArgumentParser()
    parser.add_argument("--n", type=int, default=300)
    parser.add_argument("--seed", type=int, default=4242)
    opts = parser.parse_args()
    written = build_gold(opts.n, opts.seed)
    print(f"Wrote {len(written)} frozen gold examples to {GOLD_PATH}")
eval/inject.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Seeded, self-verifying error injection — turns any CLEAN table into dirty/clean
2
+ validation. This is the de-biasing core of the north-star: our 20+ harvested clean
3
+ domains become per-cell-ground-truth validation across error types, far beyond any one
4
+ published benchmark.
5
+
6
+ Self-contained (no nlpaug/BART deps): we inject a KNOWN corruption into a clean cell, so
7
+ the (dirty -> clean) ground truth is exact and the run is reproducible (fixed seed).
8
+
9
+ Injects RECOVERABLE error types (the cleaner can restore the clean value): typo, ocr,
10
+ case, whitespace — i.e. the canonicalization + format axes. Targets CATEGORICAL text
11
+ columns (recurring values), where canonicalization is the task.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import random
17
+ import string
18
+
19
# Look-alike table used by `_ocr`: each key character is replaced by the digit it maps to.
_OCR = {"O": "0", "o": "0", "l": "1", "I": "1", "S": "5", "s": "5",
        "B": "8", "Z": "2", "z": "2", "g": "9", "G": "6", "b": "6"}
21
+
22
+
23
def _typo(s: str, rng: random.Random) -> str:
    """Return `s` with a single typo at a random interior position.

    Strings shorter than 4 characters, or a pick that lands on a non-alphabetic
    character, are returned unchanged. Otherwise one of three edits is applied:
    substitute (p=0.55), delete (p=0.23) or transpose with the next character.

    The substitution pool excludes the picked character in its own case, so a
    substitution always changes the string. (The previous filter compared against
    the lower-cased character, which never excluded anything for an uppercase
    target.) Seeded output differs from the old code only when the picked
    character is uppercase.
    """
    if len(s) < 4:
        return s
    # Interior positions only: the first and last characters are never touched.
    i = rng.randrange(1, len(s) - 1)
    ch = s[i]
    if not ch.isalpha():
        return s
    m = rng.random()
    if m < 0.55:  # substitute (the classic 'birminghxm')
        pool = string.ascii_uppercase if ch.isupper() else string.ascii_lowercase
        return s[:i] + rng.choice([c for c in pool if c != ch]) + s[i + 1:]
    if m < 0.78:  # delete
        return s[:i] + s[i + 1:]
    return s[:i] + s[i + 1] + ch + s[i + 2:]  # transpose
36
+
37
+
38
def _ocr(s: str, rng: random.Random) -> str:
    """Replace one randomly chosen `_OCR`-mappable character of `s` with its look-alike.

    When `s` contains no character from `_OCR`, the result of `_typo(s, rng)` is
    returned instead.
    """
    candidates = [pos for pos, ch in enumerate(s) if ch in _OCR]
    if candidates:
        pos = rng.choice(candidates)
        return s[:pos] + _OCR[s[pos]] + s[pos + 1:]
    return _typo(s, rng)
44
+
45
+
46
def _case(s: str, rng: random.Random) -> str:
    """Return `s` recased: upper, lower or title, picked uniformly by `rng`."""
    variants = [s.upper(), s.lower(), s.title()]
    return rng.choice(variants)
48
+
49
+
50
def _ws(s: str, rng: random.Random) -> str:
    """Return `s` with a whitespace error: leading pad, trailing pad, or inner doubling.

    The three candidates are built in a fixed order (two `randint` draws, then
    one `choice`) so the random stream is consumed the same way on every call.
    The inner variant doubles the first space of `s`; replacing a space with a
    single identical space would leave the value untouched. A string with no
    space gets one leading space instead.
    """
    lead = " " * rng.randint(1, 2) + s
    trail = s + " " * rng.randint(1, 2)
    inner = s.replace(" ", "  ", 1) if " " in s else " " + s
    return rng.choice([lead, trail, inner])
53
+
54
+
55
# Error-type name -> corruption function; `inject` looks its `error_type` argument up here.
INJECTORS = {"typo": _typo, "ocr": _ocr, "case": _case, "whitespace": _ws}
56
+
57
+
58
def _categorical_text_cols(df, max_cols: int = 12) -> list[str]:
    """Pick the text columns of `df` whose values repeat (categorical-looking).

    A column qualifies when it has at least 20 non-blank stringified cells, at
    least 70% of them contain a letter, at least 70% fail to parse as a number
    (commas removed), and at most half of them are distinct. Scanning stops once
    `max_cols` columns have been collected.
    """
    def _is_number(text: str) -> bool:
        try:
            float(text.replace(",", ""))
        except ValueError:
            return False
        return True

    chosen = []
    for name in df.columns:
        cells = []
        for raw in df[name].tolist():
            text = str(raw).strip()
            if text:
                cells.append(text)
        total = len(cells)
        if total < 20:
            continue
        alpha_share = sum(1 for t in cells if any(ch.isalpha() for ch in t)) / total
        text_share = sum(1 for t in cells if not _is_number(t)) / total
        if alpha_share < 0.7 or text_share < 0.7:
            continue
        if len(set(cells)) / total > 0.5:  # too many distinct values: not categorical
            continue
        chosen.append(name)
        if len(chosen) >= max_cols:
            break
    return chosen
80
+
81
+
82
def inject(clean_df, error_type: str, seed: int, rate: float = 0.07):
    """Build a corrupted copy of `clean_df` using the `error_type` injector.

    Every non-blank cell in the columns chosen by `_categorical_text_cols` is
    corrupted with probability `rate`, using a `random.Random(seed)` stream.
    Returns the dirty frame, or None when no column is eligible or no cell ended
    up changed. `clean_df` itself is not modified (the work happens on a copy).
    """
    corrupt = INJECTORS[error_type]
    targets = _categorical_text_cols(clean_df)
    if not targets:
        return None

    rng = random.Random(seed)
    dirty = clean_df.copy()
    changed = 0
    for name in targets:
        values = dirty[name].tolist()
        for pos, cell in enumerate(values):
            text = str(cell)
            # Blank cells are skipped without drawing from the random stream.
            if not text.strip():
                continue
            if not (rng.random() < rate):
                continue
            mutated = corrupt(text, rng)
            if mutated != text:
                values[pos] = mutated
                changed += 1
        dirty[name] = values
    return dirty if changed else None
eval/inject_validity.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """W4.5 inject-validity (TableEG-style) — does the injected slice LOOK like and RANK
2
+ like the real slice?
3
+
4
+ (1) Classifies every real dirty->gold cell error (hospital's 509 + all 42 paired
5
+ sources eval/paired_bench.py walks) with a deterministic taxonomy (typo/edit-dist<=2,
6
+ case-only, whitespace, encoding/mojibake, numeric, date-format, token-swap, missing,
7
+ other); (2) classifies the suite's INJECTED errors at the money-table seeds (7/17/27);
8
+ (3) reports Jensen-Shannon divergence (base 2) between injected and real type
9
+ distributions, pooled and per real source; (4) reports Kendall tau-b between system
10
+ rankings on the injected vs real F1 slices of money_table_head.json, with degenerate
11
+ policies (abstain-all / random-edit / oracle) run through the same suite as anchors.
12
+ Honesty rule: if the injector is far from real (high JSD), that IS the result — the
13
+ paper's mitigation (both slices reported separately) already stands.
14
+
15
+ uv run python -m eval.inject_validity # full run (~15 min CPU)
16
+ uv run python -m eval.inject_validity --tex-only # rebuild the snippet from JSON
17
+ Writes eval/results/inject_validity.json + eval/results/inject_validity_appendix.tex.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import collections
23
+ import json
24
+ import math
25
+ import time
26
+ from datetime import datetime
27
+ from pathlib import Path
28
+
29
+ from .degenerate import _abstain_all, _oracle, _random_edit
30
+ from .metrics import _cell_equal
31
+ from .paired_bench import _load, pairs
32
+ from .run_real_multi import build_suite, score
33
+
34
# Two directory levels above this file; results are read/written under ROOT / "eval" / "results".
ROOT = Path(__file__).resolve().parent.parent
SEEDS = (7, 17, 27)  # money-table seeds (run_real_multi.main)
# Taxonomy labels, in the order used by `_dist` and the LaTeX table rows.
CATS = ["typo", "case", "whitespace", "encoding", "numeric", "date-format",
        "token-swap", "missing", "other"]
# Injector name -> taxonomy label counted as agreement in `main`.
EXPECT = {"typo": "typo", "ocr": "typo", "case": "case", "whitespace": "whitespace"}
39
+ _MOJI = ("�", "Ã", "Â", "â€", "ï¿")
40
+ _DATE_FMTS = ("%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%m/%d/%y", "%Y/%m/%d",
41
+ "%d-%m-%Y", "%b %d, %Y", "%B %d, %Y", "%d %b %Y", "%Y%m%d")
42
+
43
+
44
def _num(s: str):
    """Parse `s` as a float, ignoring commas, leading `$` and trailing `%`.

    Returns None when what remains is not accepted by `float`.
    """
    text = s.strip()
    text = text.replace(",", "")
    text = text.lstrip("$")
    text = text.rstrip("%")
    try:
        value = float(text)
    except ValueError:
        return None
    return value
50
+
51
+
52
def _date(s: str):
    """Parse `s` with the first matching format in `_DATE_FMTS`.

    Returns a `date`, or None when no format matches the stripped string.
    """
    text = s.strip()
    for fmt in _DATE_FMTS:
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            continue
        return parsed.date()
    return None
59
+
60
+
61
def _lev_gt2(a: str, b: str) -> bool:
    """Return True iff the Levenshtein distance between `a` and `b` exceeds 2.

    Only cells within 2 of the diagonal are evaluated, and every cost is capped
    at 3, so the scan can stop as soon as a whole row is at the cap.
    """
    limit = 2
    cap = limit + 1
    rows, cols = len(a), len(b)
    if abs(rows - cols) > limit:
        return True

    above = [min(j, cap) for j in range(cols + 1)]
    for i in range(1, rows + 1):
        first = max(1, i - limit)
        last = min(cols, i + limit)
        row = [cap] * (cols + 1)
        if i <= limit:
            row[0] = i
        ch = a[i - 1]
        for j in range(first, last + 1):
            step = 0 if ch == b[j - 1] else 1
            row[j] = min(above[j] + 1, row[j - 1] + 1, above[j - 1] + step, cap)
        above = row
        # Every in-band cell already at the cap: the distance cannot come back under it.
        if min(above[max(0, first - 1):last + 1]) >= cap:
            return True
    return above[cols] > limit
81
+
82
+
83
def classify(d, g) -> str:
    """Label a (dirty, gold) cell pair with one taxonomy class.

    Checks run in a fixed order and the first match wins: missing, whitespace,
    case, encoding, numeric, date-format, token-swap, then typo (edit distance
    at most 2), with "other" as the fallback.
    """
    dirty_text, gold_text = str(d), str(g)
    if not (dirty_text.strip() and gold_text.strip()):
        return "missing"

    dirty_squeezed = "".join(dirty_text.split())
    gold_squeezed = "".join(gold_text.split())
    if dirty_squeezed == gold_squeezed:
        return "whitespace"
    if dirty_squeezed.casefold() == gold_squeezed.casefold():
        return "case"

    # Exactly one side carrying a mojibake marker counts as an encoding error.
    dirty_moji = any(mark in dirty_text for mark in _MOJI)
    gold_moji = any(mark in gold_text for mark in _MOJI)
    if dirty_moji != gold_moji:
        return "encoding"

    if _num(dirty_text) is not None and _num(gold_text) is not None:
        return "numeric"

    dirty_date = _date(dirty_text)
    if dirty_date is not None and dirty_date == _date(gold_text):
        return "date-format"

    dirty_tokens = sorted(dirty_text.casefold().split())
    gold_tokens = sorted(gold_text.casefold().split())
    if len(dirty_tokens) > 1 and dirty_tokens == gold_tokens:
        return "token-swap"

    if _lev_gt2(dirty_text.strip(), gold_text.strip()):
        return "other"
    return "typo"
106
+
107
+
108
def _classify_pair(dirty, clean) -> collections.Counter:
    """Count taxonomy classes over every cell where `dirty` and `clean` differ.

    Walks all columns of `dirty` and the rows both frames share, comparing cells
    positionally with `_cell_equal`.
    """
    tally = collections.Counter()
    shared_rows = min(len(dirty), len(clean))
    for col in range(dirty.shape[1]):
        for row in range(shared_rows):
            observed = dirty.iat[row, col]
            truth = clean.iat[row, col]
            if _cell_equal(observed, truth):
                continue
            tally[classify(observed, truth)] += 1
    return tally
117
+
118
+
119
def _jsd(p: dict, q: dict) -> float:
    """Jensen-Shannon divergence in bits between two count dicts.

    Counts are normalised by their own totals; 0 means identical distributions
    and 1 means disjoint support.
    """
    total_p = sum(p.values())
    total_q = sum(q.values())
    divergence = 0.0
    for key in set(p) | set(q):
        prob_p = p.get(key, 0) / total_p
        prob_q = q.get(key, 0) / total_q
        mid = (prob_p + prob_q) / 2
        # A zero-probability side contributes nothing (and would break the log).
        if prob_p:
            divergence += 0.5 * prob_p * math.log2(prob_p / mid)
        if prob_q:
            divergence += 0.5 * prob_q * math.log2(prob_q / mid)
    return divergence
131
+
132
+
133
def _tau_b(xs, ys) -> float:
    """Kendall tau-b (tie-corrected) between two equally long score lists.

    Uses a plain pairwise scan; returns 0.0 when the tie-corrected denominator
    is zero.
    """
    pairs_seen = concordant = discordant = ties_x = ties_y = 0
    size = len(xs)
    for i in range(size - 1):
        for j in range(i + 1, size):
            dx = xs[i] - xs[j]
            dy = ys[i] - ys[j]
            pairs_seen += 1
            if dx == 0:
                ties_x += 1
            if dy == 0:
                ties_y += 1
            product = dx * dy
            if product > 0:
                concordant += 1
            elif product < 0:
                discordant += 1
    denominator = ((pairs_seen - ties_x) * (pairs_seen - ties_y)) ** 0.5
    if not denominator:
        return 0.0
    return (concordant - discordant) / denominator
146
+
147
+
148
def _dist(counter) -> dict:
    """Turn class counts into shares (rounded to 4 places) over every label in `CATS`.

    An all-zero or empty `counter` yields an empty dict.
    """
    total = sum(counter.values())
    if not total:
        return {}
    return {label: round(counter.get(label, 0) / total, 4) for label in CATS}
151
+
152
+
153
def _suite_slices(cleaner) -> tuple[float, float]:
    """Score `cleaner(dirty, clean) -> out` on both slices of the suite.

    Returns `(real_mean_f1, injected_mean_f1)`: the real slice is averaged over
    the "real" specs of the suite built with the first seed; the injected slice
    is the mean, over `SEEDS`, of each seed's mean F1 across its "injected"
    specs. Injected specs whose loader returns None are skipped.
    """
    def _f1(dirty, clean):
        return score(dirty, clean, cleaner(dirty, clean))["f1"]

    real_scores = []
    for spec in build_suite(seed=SEEDS[0]):
        if spec["source"] == "real":
            dirty, clean = spec["load"]()
            real_scores.append(_f1(dirty, clean))

    seed_means = []
    for seed in SEEDS:
        per_table = []
        for spec in build_suite(seed=seed):
            if spec["source"] != "injected":
                continue
            pair = spec["load"]()
            if pair is None:
                continue
            dirty, clean = pair
            per_table.append(_f1(dirty, clean))
        seed_means.append(sum(per_table) / len(per_table))

    real_mean = sum(real_scores) / len(real_scores)
    injected_mean = sum(seed_means) / len(seed_means)
    return real_mean, injected_mean
175
+
176
+
177
def _write_tex(out: dict, res: Path) -> None:
    """Render the appendix LaTeX snippet from the results dict `out`.

    Reads the "seeds", "real", "injected", "jsd" and "ranking" entries of `out`
    and writes `inject_validity_appendix.tex` into directory `res`. The file is
    written as UTF-8 explicitly: the header comment contains a non-ASCII dash, so
    relying on the platform default encoding would make the output
    machine-dependent.
    """
    rd, jd = out["real"]["pooled_dist"], out["injected"]["pooled_dist"]
    j, rk = out["jsd"], out["ranking"]
    # Report the number of sources that were actually classified; results JSON
    # lacking the key falls back to the original wording (42).
    n_sources = out["real"].get("n_sources", 42)

    def _grouped(value) -> str:
        # Thousands separators wrapped as {,} for use inside $...$.
        return f"{value:,}".replace(",", r"{,}")

    L = [r"% Auto-generated by eval/inject_validity.py — do not edit by hand.",
         r"\subsection{Validity of the Injected Slice}\label{app:inject-validity}",
         r"Following the TableEG-style audit, we classify every error cell (dirty vs.\ gold)",
         r"with a deterministic taxonomy and compare the suite's injected errors (money-table",
         r"seeds " + "/".join(map(str, out["seeds"])) + r", $n=" +
         _grouped(out["injected"]["n"]) + r"$) against the $" +
         _grouped(out["real"]["n"]) +
         r"$ real errors across the " + f"{n_sources}" + r" paired sources (hospital's " +
         f"{out['real']['hospital_n']}" + r" included).",
         r"\begin{table}[t]\centering\small",
         r"\caption{Error-type distributions, real vs.\ injected (pooled).}",
         r"\label{tab:inject-validity}",
         r"\begin{tabular}{lrr}\toprule",
         r"error type & real & injected \\ \midrule"]
    for c in CATS:
        L.append(f"{c} & {rd.get(c, 0):.3f} & {jd.get(c, 0):.3f} " + r"\\")
    L += [r"\bottomrule\end{tabular}\end{table}",
          r"The injector covers only the recoverable surface classes it targets by design",
          r"(typo/case/whitespace; injector--taxonomy agreement " +
          f"{out['injected']['injector_taxonomy_agreement']:.3f}" + r"), whereas real errors",
          r"are dominated by substitutions beyond edit distance~2 (other, " +
          f"{rd['other']:.3f}" + r") and short typos (" + f"{rd['typo']:.3f}" +
          r"), with numeric (" + f"{rd['numeric']:.3f}" + r"), missing-value (" +
          f"{rd['missing']:.3f}" + r"), and encoding classes the injector never produces.",
          r"Pooled Jensen--Shannon divergence is " + f"{j['pooled']:.3f}" +
          r"~bits (per-source median " + f"{j['median']:.3f}" + r", range " +
          f"{j['min']:.3f}" + r"--" + f"{j['max']:.3f}" + r"; hospital " +
          f"{j['hospital_vs_injected']:.3f}" + r"): the two slices are \emph{not}",
          r"interchangeable, which is why the paper reports them separately and localizes",
          r"the grounding claim in the real slice. Ranking preservation is partial: Kendall",
          r"$\tau_b$ between system rankings on the injected vs.\ real F1 slices is $" +
          f"{rk['kendall_tau_b_money_table']:.2f}" + r"$ over the four cross-system rows and $" +
          f"{rk['kendall_tau_b_with_anchors']:.2f}" + r"$ with the degenerate anchors",
          r"(abstain-all, random-edit, oracle) included. The injected slice preserves the",
          r"floor/ceiling ordering but ranks OpenRefine fingerprint above both our system",
          r"and OpenRefine kNN, the reverse of the real slice --- frequency clustering looks",
          r"strong exactly where the canonical form is present and dominant by construction.",
          r"Injected-only evaluation would therefore overstate frequency-clustering",
          r"baselines."]
    (res / "inject_validity_appendix.tex").write_text("\n".join(L) + "\n", encoding="utf-8")
220
+
221
+
222
def main() -> None:
    """Run the full inject-validity audit and write its JSON and LaTeX artifacts.

    Steps: (1) classify real errors for every pair from `pairs()` (load failures
    are reported and skipped); (2) classify injected errors from `build_suite` at
    each seed in `SEEDS`; (3) compute Jensen-Shannon divergences; (4) compute
    Kendall tau-b between real and injected F1 for the money-table systems, with
    and without the degenerate anchors. Results go to
    `eval/results/inject_validity.json` and, via `_write_tex`, the appendix
    snippet. Files are opened with context managers so handles are closed (and
    the JSON fully flushed) before the next step runs.
    """
    t0 = time.perf_counter()
    # (1) real errors: all 42 paired sources (hospital included -> its 509)
    real_per: dict[str, collections.Counter] = {}
    for p in pairs():
        try:
            dirty, clean = _load(p)
        except Exception as e:  # noqa: BLE001
            print(f" {p.name}: LOAD FAILED {type(e).__name__}")
            continue
        real_per[p.name] = _classify_pair(dirty, clean)
        print(f" real {p.name:<46} n={sum(real_per[p.name].values())}", flush=True)
    real_pool = sum(real_per.values(), collections.Counter())
    t_real = time.perf_counter() - t0

    # (2) injected errors at the money-table seeds, via the SAME suite generator
    inj_pool = collections.Counter()
    inj_per_injector: dict[str, collections.Counter] = collections.defaultdict(collections.Counter)
    inj_per_seed = {}
    for s in SEEDS:
        cs = collections.Counter()
        for spec in build_suite(seed=s):
            if spec["source"] != "injected":
                continue
            loaded = spec["load"]()
            if loaded is None:
                continue
            dirty, clean = loaded
            c = _classify_pair(dirty, clean)
            cs += c
            # The injector name is the second ":"-separated field of the spec name.
            inj_per_injector[spec["name"].split(":")[1]] += c
        inj_per_seed[s] = sum(cs.values())
        inj_pool += cs
        print(f" injected seed={s} n={inj_per_seed[s]}", flush=True)
    # Cells whose taxonomy label matches what their injector is expected to produce.
    agree = sum(inj_per_injector[et][want] for et, want in EXPECT.items())
    t_inj = time.perf_counter() - t0 - t_real

    # (3) distribution similarity
    jsd_per_source = {k: round(_jsd(real_per[k], inj_pool), 4)
                      for k in sorted(real_per) if real_per[k]}
    jsd_vals = sorted(jsd_per_source.values())
    # (4) ranking preservation: money-table systems + degenerate anchors
    money_path = ROOT / "eval" / "results" / "money_table_head.json"
    with open(money_path, encoding="utf-8") as fh:
        money = json.load(fh)
    systems = [{"system": r["system"], "real_f1": r["real_f1"], "inj_f1": r["inj_f1"],
                "anchor": False} for r in money]
    for name, fn in [("abstain-all", _abstain_all), ("random-edit", _random_edit),
                     ("oracle", _oracle)]:
        rf, jf = _suite_slices(fn)
        systems.append({"system": name, "real_f1": rf, "inj_f1": jf, "anchor": True})
        print(f" anchor {name:<12} real={rf:.3f} inj={jf:.3f}", flush=True)
    tau_money = _tau_b([s["real_f1"] for s in systems if not s["anchor"]],
                       [s["inj_f1"] for s in systems if not s["anchor"]])
    tau_all = _tau_b([s["real_f1"] for s in systems], [s["inj_f1"] for s in systems])

    out = {
        "taxonomy": CATS, "seeds": list(SEEDS),
        "real": {"n": sum(real_pool.values()), "n_sources": len(real_per),
                 "hospital_n": sum(real_per.get("hospital", {}).values()),
                 "pooled_counts": dict(real_pool), "pooled_dist": _dist(real_pool),
                 "per_source": {k: {"n": sum(v.values()), "dist": _dist(v)}
                                for k, v in sorted(real_per.items())}},
        "injected": {"n": sum(inj_pool.values()), "per_seed_n": inj_per_seed,
                     "pooled_counts": dict(inj_pool), "pooled_dist": _dist(inj_pool),
                     "per_injector_dist": {k: _dist(v)
                                           for k, v in sorted(inj_per_injector.items())},
                     "injector_taxonomy_agreement": round(agree / sum(inj_pool.values()), 4)},
        "jsd": {"pooled": round(_jsd(real_pool, inj_pool), 4),
                "hospital_vs_injected": round(_jsd(real_per["hospital"], inj_pool), 4),
                "per_real_source_vs_injected": jsd_per_source,
                "min": jsd_vals[0], "median": jsd_vals[len(jsd_vals) // 2],
                "max": jsd_vals[-1]},
        "ranking": {"systems": systems,
                    "kendall_tau_b_money_table": round(tau_money, 4),
                    "kendall_tau_b_with_anchors": round(tau_all, 4)},
        "sec": {"real_classify": round(t_real, 1), "injected_classify": round(t_inj, 1),
                "total": round(time.perf_counter() - t0, 1)},
    }
    res = ROOT / "eval" / "results"
    with open(res / "inject_validity.json", "w", encoding="utf-8") as fh:
        json.dump(out, fh, indent=1)
    _write_tex(out, res)
    print(f"JSD pooled={out['jsd']['pooled']} tau(money)={tau_money:.3f} "
          f"tau(+anchors)={tau_all:.3f} -> {res / 'inject_validity.json'} "
          f"+ inject_validity_appendix.tex ({out['sec']['total']}s)")
305
+
306
+
307
if __name__ == "__main__":
    import argparse

    # CLI entry point: `--tex-only` rebuilds the LaTeX snippet from the saved JSON;
    # without it the full audit runs.
    ap = argparse.ArgumentParser()
    ap.add_argument("--tex-only", action="store_true",
                    help="rebuild the LaTeX snippet from the existing JSON")
    if ap.parse_args().tex_only:
        res = ROOT / "eval" / "results"
        # Context manager so the results file handle is closed after reading.
        with open(res / "inject_validity.json", encoding="utf-8") as fh:
            saved = json.load(fh)
        _write_tex(saved, res)
        print(f"-> {res / 'inject_validity_appendix.tex'}")
    else:
        main()