Spaces:
Running
Running
OpenAI Codex committed on
Commit ·
16dc556
0
Parent(s):
deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build
Browse files
Tags the submission for OpenAI's Best Use of Codex prize — backed by real
Codex-attributed commits (@codex in the connected GitHub repo + this Space's history).
Same human-verified Codex-hardened build (84 tests green).
Co-authored-by: OpenAI Codex <codex@openai.com>
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +47 -0
- .gitignore +37 -0
- .python-version +1 -0
- PRODUCT.md +162 -0
- README.md +231 -0
- TRANSFER.md +69 -0
- app.py +90 -0
- design/mockups/calm/index.html +430 -0
- design/mockups/cozy/index.html +526 -0
- design/mockups/helper/index.html +517 -0
- design/mockups/office/index.html +219 -0
- docs/DATASETS.md +57 -0
- docs/DEGENERATE_BASELINES.md +30 -0
- docs/FIELD_NOTES.md +128 -0
- docs/GITTABLES_AUDIT.md +24 -0
- docs/PAIRED_BENCH.md +49 -0
- docs/PAPER.md +66 -0
- docs/SCALING_ARM.md +46 -0
- docs/TOOL_REFERENCE.md +251 -0
- docs/WILD_BENCH.md +41 -0
- docs/assets/space_landing.png +3 -0
- docs/assets/space_results.png +3 -0
- docs/paper/fig_label_curve.pdf +3 -0
- docs/paper/fig_label_curve.png +3 -0
- docs/paper/fig_precision_coverage.pdf +3 -0
- docs/paper/fig_precision_coverage.png +3 -0
- docs/paper/fig_risk_coverage.pdf +3 -0
- docs/paper/fig_risk_coverage.png +3 -0
- docs/paper/main.aux +59 -0
- docs/paper/main.log +269 -0
- docs/paper/main.pdf +3 -0
- docs/paper/main.tex +1021 -0
- docs/paper/numbers.tex +146 -0
- eval/README.md +136 -0
- eval/__init__.py +12 -0
- eval/ablations.py +64 -0
- eval/baselines_learned.py +145 -0
- eval/calibration.py +119 -0
- eval/capture_plan_local.py +90 -0
- eval/contamination_probe.py +57 -0
- eval/cross_scoring.py +294 -0
- eval/degenerate.py +172 -0
- eval/diagnose_model.py +91 -0
- eval/equivalence.py +119 -0
- eval/generalization.py +180 -0
- eval/gittables_audit.py +95 -0
- eval/gold.jsonl +0 -0
- eval/gold.py +64 -0
- eval/inject.py +103 -0
- eval/inject_validity.py +317 -0
.gitattributes
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
docs/assets/space_results.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
docs/paper/main.pdf filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
docs/paper-eab/main.pdf filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
docs/paper-pvldb/main.pdf filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
.venv/
|
| 5 |
+
*.egg-info/
|
| 6 |
+
|
| 7 |
+
# uv
|
| 8 |
+
.uv/
|
| 9 |
+
|
| 10 |
+
# Gradio
|
| 11 |
+
.gradio/
|
| 12 |
+
flagged/
|
| 13 |
+
|
| 14 |
+
# Models / data (keep large artifacts out of git; push to the Hub instead)
|
| 15 |
+
*.gguf
|
| 16 |
+
*.bin
|
| 17 |
+
*.safetensors
|
| 18 |
+
models/
|
| 19 |
+
data/
|
| 20 |
+
|
| 21 |
+
# Env / secrets
|
| 22 |
+
.env
|
| 23 |
+
.env.*
|
| 24 |
+
|
| 25 |
+
# OS / editor
|
| 26 |
+
.DS_Store
|
| 27 |
+
.idea/
|
| 28 |
+
.vscode/
|
| 29 |
+
.gstack/
|
| 30 |
+
|
| 31 |
+
# internal: working memory + agent/skill defs — never publish
|
| 32 |
+
project-memory/
|
| 33 |
+
.claude/
|
| 34 |
+
_private/
|
| 35 |
+
|
| 36 |
+
# demo video assets (local only)
|
| 37 |
+
_video/
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.12
|
PRODUCT.md
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ScrubData — Product Research & Spec
|
| 2 |
+
|
| 3 |
+
> What does an office worker actually mean by "just clean my data"? This doc
|
| 4 |
+
> pins down the expectations so the cleaning-plan schema and UX aren't guesses.
|
| 5 |
+
> (Living doc — refine when the deep-research workflows land.)
|
| 6 |
+
|
| 7 |
+
## 1. The user & the moment
|
| 8 |
+
|
| 9 |
+
**Who:** an operations / sales-ops / finance / admin person. Lives in
|
| 10 |
+
spreadsheets exported from a CRM, an ERP, a Google Form, a POS, a bank portal.
|
| 11 |
+
Not a pandas user. Competent with Excel but doesn't want to write `=PROPER()`
|
| 12 |
+
across 40 columns or learn Power Query.
|
| 13 |
+
|
| 14 |
+
**The moment of pain:** they exported a file to do their actual job —
|
| 15 |
+
build a report, upload to another system, send a mail-merge, reconcile numbers —
|
| 16 |
+
and the file is dirty enough that the next step breaks or lies. The import fails,
|
| 17 |
+
the pivot double-counts, the vlookup misses, the "total revenue" is wrong because
|
| 18 |
+
amounts are text.
|
| 19 |
+
|
| 20 |
+
**What they want:** drop the file in, get a *trustworthy* clean file back, and
|
| 21 |
+
a plain sentence telling them what was wrong so they can vouch for it to their
|
| 22 |
+
boss. They do **not** want 30 config toggles. Hands-off is the whole pitch.
|
| 23 |
+
|
| 24 |
+
**What they fear (must design against):** that the tool silently changed
|
| 25 |
+
something it shouldn't have. Trust is the product. Every change must be
|
| 26 |
+
**visible, explained, and reversible**.
|
| 27 |
+
|
| 28 |
+
## 2. Taxonomy of "dirty" — what we must detect & fix
|
| 29 |
+
|
| 30 |
+
Grouped by how an office worker would describe it. This list *is* the operation
|
| 31 |
+
set the planner emits and the executor implements.
|
| 32 |
+
|
| 33 |
+
### A. Structural / table-level
|
| 34 |
+
- **Exact duplicate rows** — "this person is in here 3 times."
|
| 35 |
+
- **Near-duplicate rows** — same entity, trivial differences (later/stretch).
|
| 36 |
+
- **Empty rows & empty columns** — junk from the export.
|
| 37 |
+
- **Header problems** — header not in row 1, merged cells, `Unnamed: 0`,
|
| 38 |
+
duplicated column names, units baked into headers (`Amount (USD)`).
|
| 39 |
+
- **Inconsistent column naming** — `First Name` vs `first_name` (normalize to
|
| 40 |
+
snake_case as an option, off by default — it's a rename, higher-trust-risk).
|
| 41 |
+
|
| 42 |
+
### B. Whitespace & casing (the silent killers behind failed joins)
|
| 43 |
+
- Leading/trailing whitespace; doubled internal spaces; non-breaking spaces.
|
| 44 |
+
- Inconsistent casing (`ACME`, `Acme`, `acme corp`).
|
| 45 |
+
- Invisible characters (zero-width, BOM), smart quotes.
|
| 46 |
+
|
| 47 |
+
### C. Missing values, disguised
|
| 48 |
+
- Real blanks **plus** disguised nulls: `N/A`, `na`, `-`, `--`, `null`, `None`,
|
| 49 |
+
`#N/A`, `TBD`, `?`, `0` (context-dependent — risky, don't auto-assume).
|
| 50 |
+
- Decision: normalize disguised nulls → true missing; **imputation is opt-in**,
|
| 51 |
+
never silent (filling values is a claim about reality).
|
| 52 |
+
|
| 53 |
+
### D. Type & format inconsistency (where the model earns its keep)
|
| 54 |
+
- **Numbers stored as text:** `"$1,200.50"`, `"1.200,50"` (EU), `"(500)"`
|
| 55 |
+
(accounting negative), `"12%"`, `"1,2k"`.
|
| 56 |
+
- **Dates in mixed formats:** `2023-01-05`, `01/05/2023`, `5 Jan 2023`,
|
| 57 |
+
`Jan-23`, Excel serial `44931`. Ambiguous DMY vs MDY must be detected, not
|
| 58 |
+
guessed blindly — infer from the column's evidence, flag if undecidable.
|
| 59 |
+
- **Booleans:** `Yes/No`, `Y/N`, `TRUE/FALSE`, `1/0`, `T/F`, `✓`.
|
| 60 |
+
- **Phone numbers:** wildly inconsistent; standardize to E.164-ish where region
|
| 61 |
+
is inferable, else just strip to digits + canonical format.
|
| 62 |
+
- **Emails:** casing, whitespace, obvious typos (`@gmial.com`), trailing junk.
|
| 63 |
+
|
| 64 |
+
### E. Categorical canonicalization (the headline AI feature)
|
| 65 |
+
- Inconsistent labels for the same thing: `USA / U.S.A. / United States / us`,
|
| 66 |
+
`M/F vs Male/Female`, `NY / New York / new york`, status fields, product
|
| 67 |
+
names. Rules can't enumerate these — **the small model proposes the mapping**,
|
| 68 |
+
the executor applies it, the report shows the mapping for approval.
|
| 69 |
+
|
| 70 |
+
### F. Validity / anomaly flags (flag, don't auto-delete)
|
| 71 |
+
- Out-of-range numbers (age 999, negative price), impossible dates (1899-12-31
|
| 72 |
+
Excel epoch), malformed emails/phones, values that don't match the column's
|
| 73 |
+
inferred type. Default action = **flag in the report**, not silent edit.
|
| 74 |
+
|
| 75 |
+
## 3. The trust contract (design principles)
|
| 76 |
+
|
| 77 |
+
1. **Visible** — every operation appears in a before/after diff and the report.
|
| 78 |
+
2. **Explained** — plain-English rationale per operation ("standardized 4 date
|
| 79 |
+
formats into ISO `YYYY-MM-DD`").
|
| 80 |
+
3. **Conservative by default** — destructive/assumptive ops (imputation, row
|
| 81 |
+
deletion beyond exact dups, renames) are surfaced as suggestions, applied
|
| 82 |
+
only if the user keeps them on. Safe ops (trim whitespace, normalize disguised
|
| 83 |
+
nulls, parse types) are on by default.
|
| 84 |
+
4. **Reversible** — original file untouched; output is a new file + a machine-
|
| 85 |
+
readable plan the user could replay or undo.
|
| 86 |
+
5. **No config to start** — sensible defaults run immediately on upload; the
|
| 87 |
+
plan is editable *after* the user sees it, not a wall of options before.
|
| 88 |
+
|
| 89 |
+
## 4. Competitive landscape (what to learn / what to beat)
|
| 90 |
+
|
| 91 |
+
| Tool | What it does well | Why an office worker bounces |
|
| 92 |
+
|------|-------------------|------------------------------|
|
| 93 |
+
| **Excel / Power Query** | Ubiquitous, trusted | Manual; canonicalization is hand-built; steep |
|
| 94 |
+
| **OpenRefine** | Powerful clustering/canonicalization (key-collision, kNN) | Intimidating UI, GREL expressions, local Java app |
|
| 95 |
+
| **ydata-profiling / pandas-profiling** | Great *profiling* report | Diagnoses, doesn't *fix* |
|
| 96 |
+
| **Trifacta / Tableau Prep / Alteryx** | Visual prep pipelines | Enterprise, paid, config-heavy |
|
| 97 |
+
| **OpenRefine reconciliation** | Entity canonicalization | Manual, needs setup |
|
| 98 |
+
|
| 99 |
+
**Our wedge:** OpenRefine's clustering *automated and explained by a small
|
| 100 |
+
model*, with zero config and a one-screen trust-preserving UX. We borrow
|
| 101 |
+
OpenRefine's clustering idea but the model proposes the clusters/mappings and
|
| 102 |
+
narrates them, so the user never learns a tool — they just approve sentences.
|
| 103 |
+
|
| 104 |
+
## 5. Cleaning-plan schema (v0 — drives the mock & later the model)
|
| 105 |
+
|
| 106 |
+
The model outputs this JSON; the executor consumes it. Designed so the model
|
| 107 |
+
only does *semantic/fuzzy* judgment, and all execution is deterministic.
|
| 108 |
+
|
| 109 |
+
```json
|
| 110 |
+
{
|
| 111 |
+
"dataset_summary": "Contacts export, 38 rows × 9 cols; sales-lead data.",
|
| 112 |
+
"table_operations": [
|
| 113 |
+
{"op": "drop_exact_duplicates", "rationale": "5 identical rows."},
|
| 114 |
+
{"op": "drop_empty_rows"},
|
| 115 |
+
{"op": "drop_empty_columns", "columns": ["notes2"]}
|
| 116 |
+
],
|
| 117 |
+
"columns": [
|
| 118 |
+
{
|
| 119 |
+
"name": "country",
|
| 120 |
+
"detected_semantic_type": "country",
|
| 121 |
+
"issues": ["inconsistent_categories", "whitespace", "casing"],
|
| 122 |
+
"operations": [
|
| 123 |
+
{"op": "strip_whitespace"},
|
| 124 |
+
{"op": "canonicalize_categories",
|
| 125 |
+
"mapping": {"usa": "United States", "u.s.a.": "United States",
|
| 126 |
+
"us": "United States", "uk": "United Kingdom"},
|
| 127 |
+
"rationale": "Unified 4 spellings into 2 canonical country names."}
|
| 128 |
+
],
|
| 129 |
+
"confidence": 0.93
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"name": "amount",
|
| 133 |
+
"detected_semantic_type": "currency",
|
| 134 |
+
"issues": ["numeric_stored_as_text", "currency_symbols"],
|
| 135 |
+
"operations": [
|
| 136 |
+
{"op": "parse_currency", "rationale": "Stripped $ and thousands separators; → float."}
|
| 137 |
+
],
|
| 138 |
+
"confidence": 0.97
|
| 139 |
+
}
|
| 140 |
+
],
|
| 141 |
+
"flags": [
|
| 142 |
+
{"column": "age", "row_hint": "value 999", "issue": "out_of_range",
|
| 143 |
+
"action": "flag_only", "rationale": "Likely placeholder; left for human review."}
|
| 144 |
+
]
|
| 145 |
+
}
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
### Operation vocabulary (executor must implement)
|
| 149 |
+
Safe-by-default: `strip_whitespace`, `collapse_internal_whitespace`,
|
| 150 |
+
`normalize_disguised_nulls`, `standardize_case`, `parse_currency`,
|
| 151 |
+
`parse_number`, `parse_percent`, `parse_date`, `standardize_boolean`,
|
| 152 |
+
`standardize_phone`, `normalize_email`, `drop_exact_duplicates`,
|
| 153 |
+
`drop_empty_rows`, `drop_empty_columns`, `canonicalize_categories`.
|
| 154 |
+
Opt-in (assumptive): `impute_missing`, `drop_near_duplicates`,
|
| 155 |
+
`rename_columns_snake_case`, `coerce_outliers`.
|
| 156 |
+
Flag-only: `flag_out_of_range`, `flag_invalid_format`, `flag_type_mismatch`.
|
| 157 |
+
|
| 158 |
+
## 6. Success metric for the demo (Backyard AI judging)
|
| 159 |
+
|
| 160 |
+
A real office person uploads a real ugly export, clicks one button, and says
|
| 161 |
+
"oh thank god" — then trusts the result enough to use it, because the report
|
| 162 |
+
told them exactly what changed. That sentence is the bar.
|
README.md
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: ScrubData
|
| 3 |
+
emoji: 🏔️
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 6.16.0
|
| 8 |
+
app_file: server.py
|
| 9 |
+
pinned: true
|
| 10 |
+
license: mit
|
| 11 |
+
tags:
|
| 12 |
+
- track:backyard
|
| 13 |
+
- sponsor:openai
|
| 14 |
+
- sponsor:modal
|
| 15 |
+
- achievement:offgrid
|
| 16 |
+
- achievement:welltuned
|
| 17 |
+
- achievement:offbrand
|
| 18 |
+
- achievement:llama
|
| 19 |
+
- achievement:sharing
|
| 20 |
+
- achievement:fieldnotes
|
| 21 |
+
---
|
| 22 |
+
# ScrubData — hands-off data cleaning, with the receipts
|
| 23 |
+
|
| 24 |
+
Entry for the **Build Small Hackathon** (Gradio · Hugging Face), 🏡 Backyard AI track.
|
| 25 |
+
Runs a ≤4B model — a local-runnable GGUF, no third-party AI APIs → also in the running for
|
| 26 |
+
**Tiny Titan**, **Off-Brand**, **Best Demo**, **Best Agent**, and **Bonus Quest Champion**
|
| 27 |
+
(all six quests claimed above).
|
| 28 |
+
|
| 29 |
+
<!-- SUBMISSION LINKS (all set for June 15):
|
| 30 |
+
Demo video: https://www.loom.com/share/2fa868147527496e8097d82dd546d663 [DONE]
|
| 31 |
+
Social post: https://x.com/ric_alanis/status/2066598533738692983 [DONE]
|
| 32 |
+
These links + this write-up are required by the build-small-hackathon /submit tool. -->
|
| 33 |
+
|
| 34 |
+
> **Hosted demo vs. local — read this.** This Space is a **no-install demo** that cleans with
|
| 35 |
+
> the real **Qwen3-4B fine-tune** by default (served on an A100 GPU, ~1 min/clean warm; first
|
| 36 |
+
> run after idle ~2 min on cold start) — the whole point
|
| 37 |
+
> is the small model doing the work. Your file is processed on Hugging Face / the GPU endpoint
|
| 38 |
+
> (sent to no third-party API, not stored); untick the box for an instant deterministic pass.
|
| 39 |
+
> The **privacy story is a property of running it yourself**: `SCRUBDATA_MODEL=scrubdata-ft uv
|
| 40 |
+
> run server.py` reads and cleans your file on-device with the same fine-tune — nothing leaves
|
| 41 |
+
> your machine. The app labels its own mode honestly (the ribbon says which one you're using).
|
| 42 |
+
> Same auditable plan→verify→execute pipeline either way.
|
| 43 |
+
|
| 44 |
+
> **Modal** (`sponsor:modal`): the hosted Space cleans with the Qwen3-4B fine-tune served from a
|
| 45 |
+
> **scale-to-zero Modal GPU endpoint** (`scripts/modal_serve.py`, Ollama on an A100; $0 when idle,
|
| 46 |
+
> pre-warmed on page load to hide the cold start). Modal also drove the headless training +
|
| 47 |
+
> evaluation loop behind the published model. The deterministic planner is the silent fallback
|
| 48 |
+
> if the GPU is cold or down, so the demo never hard-fails.
|
| 49 |
+
|
| 50 |
+
> **Drop a messy export. Get clean data back — every change named, reversible, and
|
| 51 |
+
> explained. Anything sensitive is protected locally. The judgment calls stay yours.**
|
| 52 |
+
>
|
| 53 |
+
> For the office/ops person trying to do their job while their data is a mess.
|
| 54 |
+
|
| 55 |
+
**Built by:** [@ricalanis](https://huggingface.co/ricalanis) (solo) · 🤗 Hugging Face: `ricalanis`
|
| 56 |
+
**Live Space:** https://huggingface.co/spaces/build-small-hackathon/scrubdata
|
| 57 |
+
**Code (open source):** https://github.com/ricalanis/scrubdata-hackathon
|
| 58 |
+
**Demo video:** https://www.loom.com/share/2fa868147527496e8097d82dd546d663
|
| 59 |
+
**Write-up / post:** https://x.com/ric_alanis/status/2066598533738692983
|
| 60 |
+
|
| 61 |
+
## How it works
|
| 62 |
+
|
| 63 |
+
A small local model is the **planner**, never a row-by-row editor:
|
| 64 |
+
|
| 65 |
+
1. **Profile** — pandas aggregates each column into a value–frequency distribution
|
| 66 |
+
(scale-invariant: a million rows profile like a hundred).
|
| 67 |
+
2. **Plan** — the model reads the profile and emits a structured JSON cleaning plan:
|
| 68 |
+
canonicalization mappings, format fixes, dedup, anomaly flags.
|
| 69 |
+
3. **Ground** — canonical forms are never invented: values reconcile against reference
|
| 70 |
+
taxonomies (GeoNames 196k cities, ISO countries/states, and a pluggable **entity
|
| 71 |
+
reference** built from harvested vocabularies — ToughTables/MusicBrainz/Wikidata/ROR,
|
| 72 |
+
~100k entities) with fuzzy retrieval; ambiguous matches **abstain** and surface for
|
| 73 |
+
human review (calibrated: 90% precision at the default threshold, ≥95% at 0.91).
|
| 74 |
+
Profiles carry **suspect_values** — rare anomalous surfaces with evidence-backed
|
| 75 |
+
candidates — so high-cardinality columns are no longer invisible to the planner
|
| 76 |
+
(measured: five all-unique-surface benchmark tables went 0.0 → 0.96 F1 at zero damage).
|
| 77 |
+
4. **Verify** — every model-proposed mapping is scored by deterministic evidence
|
| 78 |
+
(errors-are-rare frequency gates, variant similarity, reference agreement); entries
|
| 79 |
+
below the confidence threshold (`SCRUBDATA_TAU`, default 0.5) become review flags
|
| 80 |
+
instead of edits. The shipped **verified union planner** (gated model plan ∪ grounded
|
| 81 |
+
heuristic) measures **0.905 precision @ 0.413 coverage** on hospital's 509 real errors
|
| 82 |
+
— the gated model plan alone is 0.993 @ 0.287.
|
| 83 |
+
5. **Protect** — PII is detected locally (Luhn/IBAN checksums + a 44M OpenMed-PII
|
| 84 |
+
classifier): cards/SSNs masked format-preservingly, contacts flagged, **0/360 residual
|
| 85 |
+
PII** after masking in our leak test.
|
| 86 |
+
6. **Execute** — deterministic pandas applies the plan. No silent edits, by construction;
|
| 87 |
+
every run exports an audit trail (OpenTelemetry-GenAI spans + open traces).
|
| 88 |
+
|
| 89 |
+
**Model:** `Qwen3-4B-Instruct-2507` (Tiny Titan), QLoRA fine-tuned on **execution-verified**
|
| 90 |
+
synthetic + real-derived data (every training plan provably recovers the clean table),
|
| 91 |
+
runnable via llama.cpp GGUF.
|
| 92 |
+
|
| 93 |
+
## The app (what judges see)
|
| 94 |
+
A custom `gr.Server` frontend (no default Gradio chrome — the **Off-Brand** quest), built
|
| 95 |
+
around the trust story:
|
| 96 |
+
- **YOUR CALL cards** — when the model is genuinely torn (e.g. *Slovia → Slovakia 86% vs
|
| 97 |
+
Slovenia 86%*) it abstains and hands you the tie with both candidates; pick the right one
|
| 98 |
+
and **stage several decisions**, then "✓ Clean now" replays them as one plan.
|
| 99 |
+
- **Named, reversible receipts** — every edit shows as a row in the audit grid with its op +
|
| 100 |
+
rationale and a before/after diff; nothing is silent.
|
| 101 |
+
- **PII review cards** — embedded cards/SSNs (Luhn/strict-regex) flagged and masked
|
| 102 |
+
format-preservingly, on-device.
|
| 103 |
+
- **Save / replay recipe** — export the cleaning plan as JSON and re-apply it to next week's
|
| 104 |
+
export in one click (the "Monday ritual").
|
| 105 |
+
- **Honest, self-aware copy** — the app injects its own runtime state and the ribbon says
|
| 106 |
+
exactly which planner ran and where your data was processed.
|
| 107 |
+
- **A fun, size-aware ETA timer** + cold-start readiness gate + page-load GPU pre-warm, so
|
| 108 |
+
the model path feels responsive and never lies about progress.
|
| 109 |
+
- Drag-and-drop, two bundled sample exports, mobile-responsive layout.
|
| 110 |
+
|
| 111 |
+
## What real users told us (and what we changed)
|
| 112 |
+
|
| 113 |
+
Before submission we put the live Space in front of people who **aren't** data folks — the
|
| 114 |
+
exact audience the tool is for — and sent the link with one line: *"if you have a messy
|
| 115 |
+
spreadsheet, try it."* The most useful finding wasn't a bug. It was that the word
|
| 116 |
+
**"cleaning" didn't land**:
|
| 117 |
+
|
| 118 |
+
- One tester read "clean my Excel" as *deleting* data:
|
| 119 |
+
*"¿Te refieres a que elimine algo de algún archivo?"* — "You mean it removes something
|
| 120 |
+
from the file?"
|
| 121 |
+
- Another didn't know where to begin:
|
| 122 |
+
*"¿eso del Excel te lo subimos ahí o cómo?"* — "the Excel thing, do we upload it there,
|
| 123 |
+
or how?"
|
| 124 |
+
- The clearest explanation in the whole thread was one we had to type by hand in chat:
|
| 125 |
+
*"it fixes text errors — names, phones, emails, cities."* That sentence wasn't anywhere
|
| 126 |
+
in the product.
|
| 127 |
+
|
| 128 |
+
So we changed the product to **show** what cleaning means instead of naming it:
|
| 129 |
+
|
| 130 |
+
- the hero now leads with a literal before→after strip
|
| 131 |
+
(`nigeia → Nigeria`, `Calfornia → California`, `Ana@GMAIL.com → ana@gmail.com`,
|
| 132 |
+
`415.555.0192 → (415) 555-0192`) so the value is obvious *before* any upload;
|
| 133 |
+
- the headline is the sentence that worked in chat — **"Fix the messy text in your
|
| 134 |
+
spreadsheet"** — and the copy says plainly **"I never delete your data"** (killing the
|
| 135 |
+
"does it erase things?" misread);
|
| 136 |
+
- a one-click **"watch it run on a sample file"** path removes the "where do I start?" wall;
|
| 137 |
+
- jargon labels are gone ("HR payroll (with PII)" → "an HR file with sensitive data").
|
| 138 |
+
|
| 139 |
+
n is small and informal (friends-and-network, ~3 people), so this isn't a usability *study* —
|
| 140 |
+
but the feedback was real, it pointed at a failure of the *framing* rather than the engine,
|
| 141 |
+
and it changed the build. The persona "Maria" below is the controlled walk-through; the
|
| 142 |
+
quotes above are verbatim from people we know.
|
| 143 |
+
|
| 144 |
+
## Measured (not vibes)
|
| 145 |
+
|
| 146 |
+
- **Canonicalization micro-F1 0.90 (best single run; 0.80 ± 0.01 over 3 training seeds)** for the 4B
|
| 147 |
+
fine-tune vs **0.45** for a much larger generic model vs **0.15** for rules.
|
| 148 |
+
- Real errors (5-benchmark macro): grounded cleaning reaches REAL-F1 **0.225**, 3.9×
|
| 149 |
+
OpenRefine kNN (0.058) and 5.7× fingerprint (0.039); the verified-union gate repairs
|
| 150 |
+
41% of hospital's 509 real errors at **0.905 precision**, every declined merge
|
| 151 |
+
surfaced for review.
|
| 152 |
+
- Evaluated on a **65-dataset suite** (Raha benchmarks + seeded error injection over 15
|
| 153 |
+
open-data domains) with a churn-neutral metric that can't be gamed by mass rewriting.
|
| 154 |
+
- Full write-up: `docs/paper/` (preprint draft) · details in `eval/README.md`.
|
| 155 |
+
|
| 156 |
+
## Run it
|
| 157 |
+
|
| 158 |
+
```bash
|
| 159 |
+
uv sync
|
| 160 |
+
uv run server.py # gr.Server + custom UI (grounded heuristic)
|
| 161 |
+
|
| 162 |
+
# fine-tuned model as planner (needs Ollama + the GGUF, see notebooks/Modelfile):
|
| 163 |
+
ollama pull hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8:Q8_0
|
| 164 |
+
ollama create scrubdata-ft -f notebooks/Modelfile
|
| 165 |
+
SCRUBDATA_MODEL=scrubdata-ft uv run server.py # model planner, heuristic fallback (on-device)
|
| 166 |
+
|
| 167 |
+
SCRUBDATA_PII_NER=1 uv run server.py # +44M NER for name/address columns
|
| 168 |
+
uv run python -m scrubdata.cli messy.csv -o clean.csv --plan plan.json
|
| 169 |
+
uv run pytest tests/ # engine + scorer tests (84)
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
The hosted Space serves the same fine-tune from a scale-to-zero **Modal A100**
|
| 173 |
+
(`scripts/modal_serve.py`) and the planner adds `format=json` on that path
|
| 174 |
+
(`SCRUBDATA_OLLAMA_FORMAT_JSON=1`) to grammar-constrain the GGUF on the A100's kernels.
|
| 175 |
+
`scripts/modal_warm.py on|off` pins/un-pins a warm container (no cold start) without a
|
| 176 |
+
redeploy — leave it `off` (scale-to-zero, $0 idle), flip `on` for a live judging window.
|
| 177 |
+
|
| 178 |
+
## Repo map
|
| 179 |
+
- `scrubdata/` — `profiler` · `planner` · `reconcile` (reference grounding + abstain) ·
|
| 180 |
+
`grounded` (RACOON wrapper) · `verifier` (selective prediction + union planner) ·
|
| 181 |
+
`pair_profile` (candidate-constrained canonicalization, opt-in) · `pii` (checksum +
|
| 182 |
+
NER tiers, mask/hash/pseudonymize) · `executor` · `observability` · `trace` ·
|
| 183 |
+
`baselines` (OpenRefine) · `cli`.
|
| 184 |
+
- `training/` — execution-verified synthetic generator + real-data derivation
|
| 185 |
+
(`real_data.py`: paired benchmarks + frequency-derived unpaired open data).
|
| 186 |
+
- `eval/` — frozen gold · wide suite + double-macro north-star (`run_real_multi.py`) ·
|
| 187 |
+
ablations · calibration (risk–coverage) · PII leak test.
|
| 188 |
+
- `docs/paper/` — preprint: *Verified Cleaning Plans: Plan-Level Selective Prediction
|
| 189 |
+
Turns Local LLM Planners into Trustworthy Table Cleaners*.
|
| 190 |
+
- `scripts/` — Modal train/eval (headless GPU loop), trace publishing.
|
| 191 |
+
|
| 192 |
+
## Research & resources
|
| 193 |
+
Everything behind the demo is public:
|
| 194 |
+
- 🚀 **Live Space** — https://huggingface.co/spaces/build-small-hackathon/scrubdata
|
| 195 |
+
- 💻 **Code (open source)** — https://github.com/ricalanis/scrubdata-hackathon
|
| 196 |
+
- 🧠 **Fine-tuned model** — https://huggingface.co/ricalanis/scrubdata-qwen3-4b
|
| 197 |
+
(Q8_0 GGUF: https://huggingface.co/ricalanis/scrubdata-qwen3-4b-v6-q8)
|
| 198 |
+
- 📊 **WildClean dataset** (real-world dirty tables + injected-error benches) —
|
| 199 |
+
https://huggingface.co/datasets/ricalanis/wildclean
|
| 200 |
+
- 🔍 **Agent traces** (OpenTelemetry-GenAI spans from real runs) —
|
| 201 |
+
https://huggingface.co/datasets/build-small-hackathon/scrubdata-traces
|
| 202 |
+
- 📄 **Preprint** — *Verified Cleaning Plans: Plan-Level Selective Prediction Turns Local
|
| 203 |
+
LLM Planners into Trustworthy Table Cleaners* (`docs/paper/main.pdf`)
|
| 204 |
+
- 📓 **Field notes** (the build story, failures included) — `docs/FIELD_NOTES.md`
|
| 205 |
+
- 🛠️ **Tool reference** (the whole system, end to end) — `docs/TOOL_REFERENCE.md`
|
| 206 |
+
|
| 207 |
+
## Built with Codex
|
| 208 |
+
The final review-and-refine pass used **OpenAI Codex** (gpt-5.5) as a reviewer / last
|
| 209 |
+
refiner — not to write the product, but to harden it. It added the executor's
|
| 210 |
+
never-corrupt-clean-data regression tests, made column sanitization collision-proof,
|
| 211 |
+
did the accessibility pass (ARIA + keyboard + reduced-motion + focus-visible), and wrote
|
| 212 |
+
characterization tests for the reference matcher. Every change was human-reviewed and
|
| 213 |
+
verified green (84 tests, golden behavior unchanged) before commit; the commits are
|
| 214 |
+
attributed to `@codex` in the git history.
|
| 215 |
+
|
| 216 |
+
## Submission checklist (verified against the build-small-hackathon `/submit` tool)
|
| 217 |
+
- [x] Public Gradio Space in the `build-small-hackathon` org
|
| 218 |
+
- [x] Every model ≤ 32B (here ≤ 4B → **Tiny Titan**-eligible): `Qwen3-4B-Instruct-2507`
|
| 219 |
+
- [x] README `tags:` set — `track:backyard`, `sponsor:openai`, `sponsor:modal` + all six `achievement:*` quests (above)
|
| 220 |
+
- [x] **Off the Grid** (`offgrid`) — no third-party AI APIs; the planner is a local-runnable GGUF (Qwen3-4B). Self-hosted = fully on-device (zero external egress); the hosted demo serves the *same* model from a self-managed Modal GPU, not a SaaS API
|
| 221 |
+
- [x] **Well-Tuned** (`welltuned`) — fine-tune published: `ricalanis/scrubdata-qwen3-4b` (+ `-v6-q8` GGUF)
|
| 222 |
+
- [x] **Off-Brand** (`offbrand`) — custom `gr.Server` HTML/CSS frontend, not default Gradio
|
| 223 |
+
- [x] **Llama Champion** (`llama`) — runs through llama.cpp (Q8_0 GGUF)
|
| 224 |
+
- [x] **Sharing is Caring** (`sharing`) — agent traces on the Hub: `build-small-hackathon/scrubdata-traces`
|
| 225 |
+
- [x] **Field Notes** (`fieldnotes`) — build report: `docs/FIELD_NOTES.md`
|
| 226 |
+
- [x] Write-up in this README (idea + tech)
|
| 227 |
+
- [x] **Demo video** link in README: https://www.loom.com/share/2fa868147527496e8097d82dd546d663
|
| 228 |
+
- [x] **Social post** link in README: https://x.com/ric_alanis/status/2066598533738692983
|
| 229 |
+
- [x] Confirm deadline time/timezone — **June 15 2026, 23:59 UTC** (confirmed on the hackathon page)
|
| 230 |
+
|
| 231 |
+
Judged (no tag needed, just qualify): Tiny Titan · Off-Brand prize · Best Demo · Best Agent · Bonus Quest Champion.
|
TRANSFER.md
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Machine transfer guide
|
| 2 |
+
|
| 3 |
+
Everything needed to continue this project on a new machine.
|
| 4 |
+
|
| 5 |
+
## 1. Clone + deps
|
| 6 |
+
```bash
|
| 7 |
+
git clone https://github.com/ricalanis/scrubdata-hackathon.git ~/Dev/hackaton-small
|
| 8 |
+
cd ~/Dev/hackaton-small && uv sync
|
| 9 |
+
uv run pytest tests/ # the full suite should pass (84 tests as of this commit)
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
## 2. Restore Claude Code memory (IMPORTANT)
|
| 13 |
+
The agent's persistent memory is bundled in `project-memory/`. On the new machine, after
|
| 14 |
+
opening the project in Claude Code once (so the project dir exists):
|
| 15 |
+
```bash
|
| 16 |
+
cp project-memory/*.md ~/.claude/projects/-Users-<USER>-Dev-hackaton-small/memory/
|
| 17 |
+
```
|
| 18 |
+
(Adjust the path-keyed directory name to the new machine's project path. `MEMORY.md` is the
|
| 19 |
+
index; the rest are the knowledge base — data-loop-playbook.md and arxiv-paper.md are the
|
| 20 |
+
operational core.)
|
| 21 |
+
|
| 22 |
+
## 3. Cloud auth (state lives in the cloud, just re-authenticate)
|
| 23 |
+
```bash
|
| 24 |
+
uv run modal token new # Modal: adapters in volume scrubdata-v5-adapter
|
| 25 |
+
# (/v5 = v5, /v5_seed21 = v6/mixA winner, seeds 1-3,25,26)
|
| 26 |
+
# results Dicts: scrubdata-train-results (seedN keys),
|
| 27 |
+
# scrubdata-eval-v5-results, scrubdata-suite-results
|
| 28 |
+
hf auth login # HF: Space build-small-hackathon/scrubdata, model repos
|
| 29 |
+
# ricalanis/scrubdata-qwen3-4b{,-v6-q8}, traces dataset
|
| 30 |
+
gh auth login # GitHub
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
## 4. Local model (optional, 4.3GB)
|
| 34 |
+
```bash
|
| 35 |
+
ollama pull hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8:Q8_0
|
| 36 |
+
ollama create scrubdata-ft-v6 -f notebooks/Modelfile
|
| 37 |
+
SCRUBDATA_MODEL=scrubdata-ft-v6 uv run server.py
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
## 5. Regenerable data (data/ is gitignored)
|
| 41 |
+
Harvested alias vocabularies + paired examples are PRESERVED in `training/harvests/` —
|
| 42 |
+
copy them back so the generator finds them:
|
| 43 |
+
```bash
|
| 44 |
+
mkdir -p data && cp training/harvests/*.jsonl data/
|
| 45 |
+
```
|
| 46 |
+
Big training mixes are regenerable:
|
| 47 |
+
```bash
|
| 48 |
+
uv run python -m training.build_dataset --n 1600 --out data/v5_synth.jsonl --seed 5
|
| 49 |
+
uv run python -m training.real_data --datasets hospital beers movies_1 --per-dataset 80 --out data/v6_paired_big.jsonl
|
| 50 |
+
# mix recipe (mixA = winner): synth + paired*4, shuffled -> data/v5_train.jsonl
|
| 51 |
+
```
|
| 52 |
+
The eval suite re-fetches Raha benchmarks automatically; harvested gov/GitHub CSVs
|
| 53 |
+
(data/real/cache) re-download via training/unpaired_sources.json.
|
| 54 |
+
|
| 55 |
+
## 6. In-flight at transfer time
|
| 56 |
+
- mixH (additive-composition test, seed 30): Modal call `fc-01KTRXTHJKW3G81BT4Q0FZET8G`,
|
| 57 |
+
result lands in Dict `scrubdata-train-results` key `seed30`. Retrieve from any machine:
|
| 58 |
+
```bash
|
| 59 |
+
uv run python -c "import modal; print(modal.Dict.from_name('scrubdata-train-results').get('seed30'))"
|
| 60 |
+
```
|
| 61 |
+
- Open question it answers: whether the vocab-mix regressions (mixE/F/G ~0.57-0.59 vs mixA
|
| 62 |
+
0.748) were eval-coverage shift. See project-memory/data-loop-playbook.md.
|
| 63 |
+
|
| 64 |
+
## 7. Where everything lives
|
| 65 |
+
- Paper: `docs/paper/main.tex` (+ `numbers.tex` and the `fig_*.pdf` / `fig_*.png` figures) — compiles with pdflatex; COMPLETE.
|
| 66 |
+
- Submission kit: `docs/SUBMISSION.md` (demo script + social post), `docs/FIELD_NOTES.md`.
|
| 67 |
+
- Live Space: https://huggingface.co/spaces/build-small-hackathon/scrubdata
|
| 68 |
+
- arXiv next steps: cs.DB endorser etc. — project-memory/arxiv-paper.md.
|
| 69 |
+
- Hackathon deadline: 2026-06-15, 23:59 UTC (demo video + social post are done — links are in the README submission checklist).
|
app.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ScrubData — hands-off data cleaning (Gradio app).
|
| 2 |
+
|
| 3 |
+
Runnable MOCK demo on gr.Blocks: upload → profile → plan → clean → diff +
|
| 4 |
+
report → download. The planner is a heuristic stand-in for the fine-tuned ≤4B
|
| 5 |
+
model; the rest of the pipeline is real. Final version will port this flow to
|
| 6 |
+
gr.Server + a custom HTML frontend for the Off-Brand bonus quest.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import tempfile
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
import gradio as gr
|
| 15 |
+
import pandas as pd
|
| 16 |
+
|
| 17 |
+
from scrubdata import apply_plan, mock_plan, profile_dataframe, render_report
|
| 18 |
+
from scrubdata.active import get_planner
|
| 19 |
+
from scrubdata.trace import log_run
|
| 20 |
+
|
| 21 |
+
PLANNER = get_planner() # fine-tuned model if SCRUBDATA_MODEL is set, else heuristic
|
| 22 |
+
|
| 23 |
+
SAMPLE = Path(__file__).parent / "samples" / "dirty_contacts.csv"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _read_any(path: str) -> pd.DataFrame:
    """Read a CSV or Excel file with every cell kept as a raw string.

    Args:
        path: Filesystem path of the file to load. A ``.xlsx`` / ``.xls``
            suffix (case-insensitive) selects the Excel reader; any other
            suffix is parsed as CSV.

    Returns:
        A DataFrame loaded with ``dtype=str``; deciding the real column
        types is left to the cleaning step.
    """
    p = Path(path)
    # Both readers get keep_default_na=False so placeholder tokens such as
    # "N/A" reach the planner as literal text rather than NaN, regardless of
    # which file format the user uploaded.
    if p.suffix.lower() in {".xlsx", ".xls"}:
        return pd.read_excel(p, dtype=str, keep_default_na=False)
    return pd.read_csv(p, dtype=str, keep_default_na=False)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def clean(file_path: str):
    """Run the full pipeline on one uploaded file.

    Steps: read -> profile -> plan -> apply plan -> profile again ->
    render report -> write the cleaned CSV -> best-effort trace capture.

    Args:
        file_path: Path of the uploaded CSV/Excel file; falsy when nothing
            has been uploaded yet.

    Returns:
        A 4-tuple in the order the click handler's outputs expect:
        ``(original frame, cleaned frame, rendered report, cleaned CSV path)``.
        With no file, the two tables are left untouched (``gr.update()``),
        a prompt is shown in the report slot, and the download is ``None``.
    """
    if not file_path:
        return (gr.update(), gr.update(), "Upload a CSV or Excel file to begin.", None)

    raw = _read_any(file_path)
    before = profile_dataframe(raw)
    plan = PLANNER(raw)
    cleaned, log = apply_plan(raw, plan)
    after = profile_dataframe(cleaned)
    report = render_report(plan, log, before, after)

    # One fresh directory per run: a fixed "<tmp>/scrubbed.csv" would let
    # concurrent requests overwrite (and download) each other's results.
    # The directory still lives under the system temp dir and the file keeps
    # its "scrubbed.csv" name.
    out = Path(tempfile.mkdtemp(prefix="scrubdata-")) / "scrubbed.csv"
    cleaned.to_csv(out, index=False)

    try:  # best-effort agent-trace capture (Open trace bonus quest)
        log_run(before, raw, plan, log, model=plan.get("_generated_by", "mock_planner"))
    except Exception:
        # Tracing must never fail a cleaning run, but a silent `pass` hides
        # broken trace capture entirely; log it and carry on.
        import logging

        logging.getLogger(__name__).warning("agent-trace capture failed", exc_info=True)

    return raw, cleaned, report, str(out)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def load_sample():
    """Return the path of the bundled sample spreadsheet as a plain string."""
    sample_path = str(SAMPLE)
    return sample_path
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# Page definition: header, upload row, before/after tables, report, download.
with gr.Blocks(title="ScrubData") as demo:
    # Header copy shown at the top of the page.
    gr.Markdown(
        value=(
            "# 🧽 ScrubData\n"
            "**Upload your dirty spreadsheet. Get clean data back. No config.**\n\n"
            "_Mock demo — heuristic planner standing in for the fine-tuned model._"
        )
    )

    # Input row: the uploader plus a column holding the two action buttons.
    with gr.Row():
        file_in = gr.File(
            label="Upload CSV / Excel",
            file_types=[".csv", ".xlsx", ".xls"],
            type="filepath",
        )
        with gr.Column():
            run_btn = gr.Button("🧽 Clean it", variant="primary")
            sample_btn = gr.Button("Use the messy sample")

    # Result row: read-only tables for the original and the cleaned data.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Before")
            before_df = gr.Dataframe(label="Original", interactive=False, wrap=True)
        with gr.Column():
            gr.Markdown("### After")
            after_df = gr.Dataframe(label="Cleaned", interactive=False, wrap=True)

    report_md = gr.Markdown()
    download = gr.File(label="Download cleaned file")

    # Event wiring: clean() fills all four outputs; the sample button only
    # places the sample path into the uploader.
    run_btn.click(
        fn=clean,
        inputs=file_in,
        outputs=[before_df, after_df, report_md, download],
    )
    sample_btn.click(fn=load_sample, outputs=file_in)


if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft())
|
design/mockups/calm/index.html
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="es">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 6 |
+
<title>ScrubData — Tu lista, ordenada con calma</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root{
|
| 9 |
+
--paper:#fbf7f0;
|
| 10 |
+
--paper-2:#fffdf9;
|
| 11 |
+
--ink:#3a3530;
|
| 12 |
+
--ink-soft:#6f675d;
|
| 13 |
+
--line:#ece4d6;
|
| 14 |
+
--accent:#7ba087; /* single calm sage accent */
|
| 15 |
+
--accent-soft:#e8f0ea;
|
| 16 |
+
--accent-deep:#5e8470;
|
| 17 |
+
--warm:#d8a25e; /* gentle merit-badge gold, used sparingly */
|
| 18 |
+
--shadow:0 14px 40px -22px rgba(80,70,55,.45);
|
| 19 |
+
--radius:26px;
|
| 20 |
+
}
|
| 21 |
+
*{box-sizing:border-box;}
|
| 22 |
+
html,body{margin:0;padding:0;}
|
| 23 |
+
body{
|
| 24 |
+
font-family:"Iowan Old Style","Palatino Linotype",Palatino,Georgia,"Times New Roman",serif;
|
| 25 |
+
background:
|
| 26 |
+
radial-gradient(120% 80% at 50% -10%, #fffdf9 0%, var(--paper) 55%, #f5efe4 100%);
|
| 27 |
+
color:var(--ink);
|
| 28 |
+
line-height:1.6;
|
| 29 |
+
-webkit-font-smoothing:antialiased;
|
| 30 |
+
min-height:100vh;
|
| 31 |
+
display:flex;
|
| 32 |
+
flex-direction:column;
|
| 33 |
+
align-items:center;
|
| 34 |
+
padding:34px 20px 70px;
|
| 35 |
+
}
|
| 36 |
+
::selection{background:var(--accent-soft);}
|
| 37 |
+
|
| 38 |
+
/* ---------- top bar ---------- */
|
| 39 |
+
.topbar{
|
| 40 |
+
width:100%;
|
| 41 |
+
max-width:760px;
|
| 42 |
+
display:flex;
|
| 43 |
+
align-items:center;
|
| 44 |
+
justify-content:space-between;
|
| 45 |
+
margin-bottom:30px;
|
| 46 |
+
}
|
| 47 |
+
.brand{display:flex;align-items:center;gap:11px;}
|
| 48 |
+
.leaf{width:34px;height:34px;flex:none;}
|
| 49 |
+
.brand-name{font-size:1.18rem;font-weight:600;letter-spacing:.2px;}
|
| 50 |
+
.brand-name small{display:block;font-size:.72rem;color:var(--ink-soft);letter-spacing:.4px;font-weight:400;}
|
| 51 |
+
.lang{
|
| 52 |
+
display:flex;background:var(--paper-2);border:1px solid var(--line);
|
| 53 |
+
border-radius:999px;padding:3px;font-family:system-ui,sans-serif;font-size:.8rem;
|
| 54 |
+
}
|
| 55 |
+
.lang button{
|
| 56 |
+
border:none;background:transparent;color:var(--ink-soft);
|
| 57 |
+
padding:6px 14px;border-radius:999px;cursor:pointer;font-weight:600;letter-spacing:.3px;
|
| 58 |
+
}
|
| 59 |
+
.lang button.on{background:var(--accent);color:#fff;}
|
| 60 |
+
|
| 61 |
+
/* ---------- shared card ---------- */
|
| 62 |
+
.stage{width:100%;max-width:760px;}
|
| 63 |
+
.card{
|
| 64 |
+
background:var(--paper-2);
|
| 65 |
+
border:1px solid var(--line);
|
| 66 |
+
border-radius:var(--radius);
|
| 67 |
+
box-shadow:var(--shadow);
|
| 68 |
+
padding:46px 44px;
|
| 69 |
+
}
|
| 70 |
+
.screen{display:none;}
|
| 71 |
+
.screen.active{display:block;animation:rise .6s ease both;}
|
| 72 |
+
@keyframes rise{from{opacity:0;transform:translateY(14px);}to{opacity:1;transform:none;}}
|
| 73 |
+
|
| 74 |
+
h1{font-size:2.05rem;line-height:1.25;margin:0 0 12px;font-weight:600;letter-spacing:.2px;}
|
| 75 |
+
.lede{font-size:1.18rem;color:var(--ink-soft);margin:0 0 30px;max-width:46ch;}
|
| 76 |
+
|
| 77 |
+
/* persistent safety strip */
|
| 78 |
+
.safety{
|
| 79 |
+
display:flex;align-items:center;gap:12px;
|
| 80 |
+
background:var(--accent-soft);
|
| 81 |
+
border-radius:18px;
|
| 82 |
+
padding:14px 18px;
|
| 83 |
+
margin-top:26px;
|
| 84 |
+
font-family:system-ui,sans-serif;
|
| 85 |
+
font-size:.95rem;
|
| 86 |
+
color:var(--accent-deep);
|
| 87 |
+
}
|
| 88 |
+
.safety svg{flex:none;}
|
| 89 |
+
.safety b{font-weight:600;}
|
| 90 |
+
|
| 91 |
+
/* ---------- screen 1: drop ---------- */
|
| 92 |
+
.drop{
|
| 93 |
+
border:2px dashed #cdbfa6;
|
| 94 |
+
background:linear-gradient(180deg,#fffefb,#fbf6ec);
|
| 95 |
+
border-radius:24px;
|
| 96 |
+
padding:54px 30px;
|
| 97 |
+
text-align:center;
|
| 98 |
+
cursor:pointer;
|
| 99 |
+
transition:border-color .25s, background .25s, transform .25s;
|
| 100 |
+
}
|
| 101 |
+
.drop:hover{border-color:var(--accent);background:#fbfaf4;transform:translateY(-2px);}
|
| 102 |
+
.drop .basket{font-size:2.6rem;display:block;margin-bottom:10px;}
|
| 103 |
+
.drop .big{font-size:1.32rem;font-weight:600;margin-bottom:4px;}
|
| 104 |
+
.drop .sub{color:var(--ink-soft);font-family:system-ui,sans-serif;font-size:.95rem;}
|
| 105 |
+
.filechip{
|
| 106 |
+
display:inline-flex;align-items:center;gap:9px;margin-top:22px;
|
| 107 |
+
background:#fff;border:1px solid var(--line);border-radius:14px;
|
| 108 |
+
padding:9px 15px;font-family:system-ui,sans-serif;font-size:.9rem;color:var(--ink);
|
| 109 |
+
}
|
| 110 |
+
.filechip .dot{width:9px;height:9px;border-radius:50%;background:var(--accent);}
|
| 111 |
+
|
| 112 |
+
.btn{
|
| 113 |
+
font-family:system-ui,sans-serif;font-size:1.06rem;font-weight:600;
|
| 114 |
+
border:none;border-radius:16px;cursor:pointer;padding:16px 30px;
|
| 115 |
+
transition:transform .15s, box-shadow .25s, background .2s;
|
| 116 |
+
}
|
| 117 |
+
.btn-primary{
|
| 118 |
+
background:var(--accent);color:#fff;
|
| 119 |
+
box-shadow:0 10px 24px -12px rgba(94,132,112,.9);
|
| 120 |
+
width:100%;margin-top:26px;
|
| 121 |
+
}
|
| 122 |
+
.btn-primary:hover{background:var(--accent-deep);transform:translateY(-2px);}
|
| 123 |
+
.btn-ghost{
|
| 124 |
+
background:transparent;color:var(--accent-deep);border:1px solid #cfe0d4;
|
| 125 |
+
}
|
| 126 |
+
.btn-ghost:hover{background:var(--accent-soft);}
|
| 127 |
+
|
| 128 |
+
/* ---------- screen 2: working ---------- */
|
| 129 |
+
.working{text-align:center;padding:30px 10px 14px;}
|
| 130 |
+
.breath{
|
| 131 |
+
width:120px;height:120px;margin:6px auto 26px;border-radius:50%;
|
| 132 |
+
background:radial-gradient(circle at 50% 50%, var(--accent-soft), #fff);
|
| 133 |
+
border:1px solid var(--line);
|
| 134 |
+
display:flex;align-items:center;justify-content:center;
|
| 135 |
+
animation:breathe 3.4s ease-in-out infinite;
|
| 136 |
+
}
|
| 137 |
+
.breath span{font-size:2.4rem;}
|
| 138 |
+
@keyframes breathe{0%,100%{transform:scale(1);box-shadow:0 0 0 0 rgba(123,160,135,.25);}50%{transform:scale(1.07);box-shadow:0 0 0 18px rgba(123,160,135,0);}}
|
| 139 |
+
.working h1{font-size:1.7rem;}
|
| 140 |
+
.steps{list-style:none;padding:0;margin:24px auto 0;max-width:380px;text-align:left;font-family:system-ui,sans-serif;}
|
| 141 |
+
.steps li{
|
| 142 |
+
display:flex;align-items:center;gap:12px;padding:9px 0;color:var(--ink-soft);font-size:1rem;
|
| 143 |
+
opacity:.35;transition:opacity .4s;
|
| 144 |
+
}
|
| 145 |
+
.steps li.done{opacity:1;color:var(--ink);}
|
| 146 |
+
.steps li .tick{
|
| 147 |
+
width:22px;height:22px;border-radius:50%;border:2px solid #d8cdb8;flex:none;
|
| 148 |
+
display:flex;align-items:center;justify-content:center;font-size:.8rem;color:#fff;background:transparent;
|
| 149 |
+
}
|
| 150 |
+
.steps li.done .tick{background:var(--accent);border-color:var(--accent);}
|
| 151 |
+
|
| 152 |
+
/* ---------- screen 3: result ---------- */
|
| 153 |
+
.result-head{display:flex;align-items:flex-start;gap:16px;margin-bottom:8px;}
|
| 154 |
+
.badge{
|
| 155 |
+
width:62px;height:62px;flex:none;
|
| 156 |
+
}
|
| 157 |
+
.h-eyebrow{font-family:system-ui,sans-serif;font-size:.82rem;letter-spacing:1.4px;text-transform:uppercase;color:var(--accent-deep);font-weight:700;}
|
| 158 |
+
|
| 159 |
+
.summary{
|
| 160 |
+
background:var(--paper);
|
| 161 |
+
border:1px solid var(--line);
|
| 162 |
+
border-radius:20px;
|
| 163 |
+
padding:24px 26px;
|
| 164 |
+
margin:22px 0 8px;
|
| 165 |
+
font-size:1.12rem;
|
| 166 |
+
}
|
| 167 |
+
.summary p{margin:0 0 14px;}
|
| 168 |
+
.summary p:last-child{margin-bottom:0;}
|
| 169 |
+
.summary .num{color:var(--accent-deep);font-weight:600;}
|
| 170 |
+
|
| 171 |
+
.section-title{
|
| 172 |
+
font-family:system-ui,sans-serif;font-size:.95rem;font-weight:700;
|
| 173 |
+
color:var(--ink-soft);letter-spacing:.4px;margin:34px 0 14px;
|
| 174 |
+
display:flex;align-items:center;gap:9px;
|
| 175 |
+
}
|
| 176 |
+
.section-title .pill{font-size:.7rem;background:var(--accent-soft);color:var(--accent-deep);padding:3px 9px;border-radius:999px;font-weight:700;}
|
| 177 |
+
|
| 178 |
+
/* change cards */
|
| 179 |
+
.change{
|
| 180 |
+
border:1px solid var(--line);border-radius:18px;background:#fff;
|
| 181 |
+
padding:18px 20px;margin-bottom:14px;
|
| 182 |
+
}
|
| 183 |
+
.change .lead{font-size:1.08rem;margin:0 0 12px;}
|
| 184 |
+
.change .lead b{color:var(--ink);}
|
| 185 |
+
.ba{display:flex;gap:10px;flex-wrap:wrap;font-family:system-ui,sans-serif;font-size:.9rem;}
|
| 186 |
+
.chip{
|
| 187 |
+
padding:7px 13px;border-radius:12px;border:1px solid var(--line);
|
| 188 |
+
background:var(--paper);color:var(--ink-soft);
|
| 189 |
+
}
|
| 190 |
+
.chip.after{background:var(--accent-soft);border-color:#cfe0d4;color:var(--accent-deep);font-weight:600;}
|
| 191 |
+
.arrow{align-self:center;color:#bdb3a1;font-family:system-ui,sans-serif;}
|
| 192 |
+
|
| 193 |
+
/* gentle question card */
|
| 194 |
+
.ask{
|
| 195 |
+
border:1px solid #e7dcc4;background:linear-gradient(180deg,#fffdf6,#fbf3e3);
|
| 196 |
+
border-radius:18px;padding:20px 22px;margin-bottom:14px;
|
| 197 |
+
}
|
| 198 |
+
.ask .q{font-size:1.1rem;margin:0 0 6px;}
|
| 199 |
+
.ask .why{font-family:system-ui,sans-serif;font-size:.92rem;color:var(--ink-soft);margin:0 0 16px;}
|
| 200 |
+
.ask .row{display:flex;gap:10px;}
|
| 201 |
+
.ask .btn{padding:11px 20px;font-size:.95rem;}
|
| 202 |
+
|
| 203 |
+
/* honest flags */
|
| 204 |
+
.flag{
|
| 205 |
+
display:flex;gap:12px;align-items:flex-start;
|
| 206 |
+
background:#fff;border:1px dashed #d8cdb8;border-radius:16px;padding:16px 18px;margin-bottom:12px;
|
| 207 |
+
}
|
| 208 |
+
.flag .mark{font-size:1.2rem;flex:none;}
|
| 209 |
+
.flag p{margin:0;font-size:1rem;}
|
| 210 |
+
.flag .small{font-family:system-ui,sans-serif;font-size:.88rem;color:var(--ink-soft);}
|
| 211 |
+
|
| 212 |
+
/* bonus card */
|
| 213 |
+
.bonus{
|
| 214 |
+
background:linear-gradient(180deg,#f4faf6,#eaf3ed);
|
| 215 |
+
border:1px solid #d3e6da;border-radius:20px;padding:22px 24px;margin-top:8px;
|
| 216 |
+
display:flex;gap:16px;align-items:center;
|
| 217 |
+
}
|
| 218 |
+
.bonus .ic{font-size:2rem;flex:none;}
|
| 219 |
+
.bonus h3{margin:0 0 4px;font-size:1.15rem;}
|
| 220 |
+
.bonus p{margin:0;color:var(--ink-soft);font-size:1.02rem;}
|
| 221 |
+
|
| 222 |
+
/* download zone */
|
| 223 |
+
.download{
|
| 224 |
+
margin-top:30px;text-align:center;
|
| 225 |
+
border-top:1px solid var(--line);padding-top:30px;
|
| 226 |
+
}
|
| 227 |
+
.download .btn-primary{width:auto;display:inline-block;padding:18px 44px;font-size:1.12rem;}
|
| 228 |
+
.download .aside{font-family:system-ui,sans-serif;font-size:.92rem;color:var(--ink-soft);margin-top:14px;}
|
| 229 |
+
.download .aside a{color:var(--accent-deep);text-decoration:underline;cursor:pointer;}
|
| 230 |
+
|
| 231 |
+
.reset{display:block;margin:26px auto 0;background:none;border:none;color:var(--ink-soft);
|
| 232 |
+
font-family:system-ui,sans-serif;font-size:.85rem;cursor:pointer;text-decoration:underline;}
|
| 233 |
+
|
| 234 |
+
@media(max-width:560px){
|
| 235 |
+
.card{padding:32px 24px;}
|
| 236 |
+
h1{font-size:1.7rem;}
|
| 237 |
+
.lede{font-size:1.05rem;}
|
| 238 |
+
}
|
| 239 |
+
</style>
|
| 240 |
+
</head>
|
| 241 |
+
<body>
|
| 242 |
+
|
| 243 |
+
<div class="topbar">
|
| 244 |
+
<div class="brand">
|
| 245 |
+
<svg class="leaf" viewBox="0 0 40 40" fill="none">
|
| 246 |
+
<path d="M20 36C8 30 6 16 12 8c8 2 18 8 16 22-1 4-4 6-8 6z" fill="#e8f0ea" stroke="#7ba087" stroke-width="1.6"/>
|
| 247 |
+
<path d="M20 34c-1-8 0-16 6-22" stroke="#7ba087" stroke-width="1.6" stroke-linecap="round"/>
|
| 248 |
+
<path d="M18 24c-2-1-4-3-5-6M22 18c2 0 5 0 7-1" stroke="#7ba087" stroke-width="1.4" stroke-linecap="round"/>
|
| 249 |
+
</svg>
|
| 250 |
+
<div class="brand-name">ScrubData<small>tu lista, ordenada con calma</small></div>
|
| 251 |
+
</div>
|
| 252 |
+
<div class="lang" aria-label="idioma">
|
| 253 |
+
<button class="on">ES</button>
|
| 254 |
+
<button>EN</button>
|
| 255 |
+
</div>
|
| 256 |
+
</div>
|
| 257 |
+
|
| 258 |
+
<div class="stage">
|
| 259 |
+
|
| 260 |
+
<!-- ============ SCREEN 1 : WELCOME + DROP ============ -->
|
| 261 |
+
<section class="screen active" id="s1">
|
| 262 |
+
<div class="card">
|
| 263 |
+
<h1>Hola, Doña Lupe.<br/>Vamos a ordenar su lista, sin prisa.</h1>
|
| 264 |
+
<p class="lede">Suelte aquí su archivo y yo le echo un ojo. Usted no tiene que configurar nada.</p>
|
| 265 |
+
|
| 266 |
+
<div class="drop" onclick="goWork()">
|
| 267 |
+
<span class="basket">🧺</span>
|
| 268 |
+
<div class="big">Suelte su archivo aquí</div>
|
| 269 |
+
<div class="sub">o toque para buscarlo en su computadora · Excel o CSV</div>
|
| 270 |
+
<div class="filechip"><span class="dot"></span> ventas-del-mes.xlsx · listo para revisar</div>
|
| 271 |
+
</div>
|
| 272 |
+
|
| 273 |
+
<button class="btn btn-primary" onclick="goWork()">Ordénalo por mí</button>
|
| 274 |
+
|
| 275 |
+
<div class="safety">
|
| 276 |
+
<svg width="22" height="22" viewBox="0 0 24 24" fill="none"><path d="M12 2l8 3v6c0 5-3.5 8-8 9-4.5-1-8-4-8-9V5l8-3z" stroke="#5e8470" stroke-width="1.6"/><path d="M9 12l2 2 4-4" stroke="#5e8470" stroke-width="1.6" stroke-linecap="round" stroke-linejoin="round"/></svg>
|
| 277 |
+
<div><b>Su original se queda igualito.</b> Hago una copia limpia aparte. Nada sale de esta computadora.</div>
|
| 278 |
+
</div>
|
| 279 |
+
</div>
|
| 280 |
+
</section>
|
| 281 |
+
|
| 282 |
+
<!-- ============ SCREEN 2 : WORKING ============ -->
|
| 283 |
+
<section class="screen" id="s2">
|
| 284 |
+
<div class="card working">
|
| 285 |
+
<div class="breath"><span>🍃</span></div>
|
| 286 |
+
<h1>Trabajando aquí mismo, en su computadora…</h1>
|
| 287 |
+
<p class="lede" style="margin:8px auto 0;">Respire tranquila. Su original está a salvo. Esto toma un momentito.</p>
|
| 288 |
+
<ul class="steps" id="steps">
|
| 289 |
+
<li data-i="0"><span class="tick">✓</span> Leyendo su lista con cuidado</li>
|
| 290 |
+
<li data-i="1"><span class="tick">✓</span> Juntando los tacos que están escritos de varias formas</li>
|
| 291 |
+
<li data-i="2"><span class="tick">✓</span> Revisando teléfonos, fechas y espacios en blanco</li>
|
| 292 |
+
<li data-i="3"><span class="tick">✓</span> Apuntando lo que no estoy segura, para preguntarle</li>
|
| 293 |
+
</ul>
|
| 294 |
+
</div>
|
| 295 |
+
</section>
|
| 296 |
+
|
| 297 |
+
<!-- ============ SCREEN 3 : RESULT ============ -->
|
| 298 |
+
<section class="screen" id="s3">
|
| 299 |
+
<div class="card">
|
| 300 |
+
<div class="result-head">
|
| 301 |
+
<svg class="badge" viewBox="0 0 64 64" fill="none">
|
| 302 |
+
<circle cx="32" cy="32" r="29" fill="#fff" stroke="#d8a25e" stroke-width="2" stroke-dasharray="3 3"/>
|
| 303 |
+
<circle cx="32" cy="32" r="22" fill="#e8f0ea" stroke="#7ba087" stroke-width="1.6"/>
|
| 304 |
+
<path d="M24 33l5 5 11-12" stroke="#5e8470" stroke-width="2.6" stroke-linecap="round" stroke-linejoin="round"/>
|
| 305 |
+
</svg>
|
| 306 |
+
<div>
|
| 307 |
+
<div class="h-eyebrow">Listo · su resumen</div>
|
| 308 |
+
<h1 style="margin-top:2px;">Esto fue lo que encontré.</h1>
|
| 309 |
+
</div>
|
| 310 |
+
</div>
|
| 311 |
+
|
| 312 |
+
<div class="summary">
|
| 313 |
+
<p>Revisé su lista de <b>ventas-del-mes</b> con calma. Esto fue lo que arreglé:</p>
|
| 314 |
+
<p>· <span class="num">«Al pastor»</span> estaba escrito de 4 maneras distintas. Lo junté todo: <span class="num">1,204 vendidos</span>.</p>
|
| 315 |
+
<p>· <span class="num">23 personas</span> aparecían dos veces en su lista. Las reuní para que usted las mire.</p>
|
| 316 |
+
<p>· Puse todos los <span class="num">teléfonos</span> y las <span class="num">fechas</span> escritos igualito, fáciles de leer.</p>
|
| 317 |
+
<p>· <span class="num">14 espacios</span> decían «N/A» o solo un guion — los tomé como vacíos.</p>
|
| 318 |
+
<p style="font-family:system-ui,sans-serif;font-size:.98rem;color:var(--ink-soft);">Puede leerlo en voz alta a Yolanda o imprimirlo. Nada de esto tocó su archivo original.</p>
|
| 319 |
+
</div>
|
| 320 |
+
|
| 321 |
+
<!-- already-done change card (mechanical, safe) -->
|
| 322 |
+
<div class="section-title">Lo que ya dejé arreglado <span class="pill">hecho</span></div>
|
| 323 |
+
|
| 324 |
+
<div class="change">
|
| 325 |
+
<p class="lead">El mismo taco, escrito de varias formas — lo conté junto:</p>
|
| 326 |
+
<div class="ba">
|
| 327 |
+
<span class="chip">al pastor</span>
|
| 328 |
+
<span class="chip">Al Pastor</span>
|
| 329 |
+
<span class="chip">pastor</span>
|
| 330 |
+
<span class="chip">al pastór</span>
|
| 331 |
+
<span class="arrow">→</span>
|
| 332 |
+
<span class="chip after">Al pastor · 1,204</span>
|
| 333 |
+
</div>
|
| 334 |
+
</div>
|
| 335 |
+
|
| 336 |
+
<div class="change">
|
| 337 |
+
<p class="lead">Los teléfonos ahora se ven todos iguales:</p>
|
| 338 |
+
<div class="ba">
|
| 339 |
+
<span class="chip">55-1234 5678</span>
|
| 340 |
+
<span class="chip">5512345678</span>
|
| 341 |
+
<span class="arrow">→</span>
|
| 342 |
+
<span class="chip after">(55) 1234-5678</span>
|
| 343 |
+
</div>
|
| 344 |
+
</div>
|
| 345 |
+
|
| 346 |
+
<!-- gentle confirms (money / identity) -->
|
| 347 |
+
<div class="section-title">Antes de seguir, dos preguntitas <span class="pill" style="background:#f6ecd6;color:#a9742f;">usted decide</span></div>
|
| 348 |
+
|
| 349 |
+
<div class="ask">
|
| 350 |
+
<p class="q">Encontré <b>31 filas en $0.00</b> — parece un error del sistema.</p>
|
| 351 |
+
<p class="why">Si las dejo dentro, bajan su total del mes. ¿Las saco de la suma?</p>
|
| 352 |
+
<div class="row">
|
| 353 |
+
<!-- Inner quotes are written as &quot; so they do not terminate the onclick attribute. -->
<button class="btn btn-primary" style="width:auto;margin:0;" onclick="this.closest('.ask').style.opacity=.55;this.closest('.row').innerHTML='<span style=&quot;font-family:system-ui;color:#5e8470;font-weight:600&quot;>✓ Hecho — las dejé fuera del total.</span>'">Sí, sácalas</button>
|
| 354 |
+
<!-- Inner quotes are written as &quot; so they do not terminate the onclick attribute. -->
<button class="btn btn-ghost" onclick="this.closest('.row').innerHTML='<span style=&quot;font-family:system-ui;color:#6f675d&quot;>De acuerdo, las dejo en la suma.</span>'">No, déjalas</button>
|
| 355 |
+
</div>
|
| 356 |
+
</div>
|
| 357 |
+
|
| 358 |
+
<div class="ask">
|
| 359 |
+
<p class="q">Estas dos parecen <b>la misma persona</b>: «Yolanda R.» y «Yolanda Reyes».</p>
|
| 360 |
+
<p class="why">¿Las cuento como una sola, o son personas distintas?</p>
|
| 361 |
+
<div class="row">
|
| 362 |
+
<!-- Inner quotes are written as &quot; so they do not terminate the onclick attribute. -->
<button class="btn btn-primary" style="width:auto;margin:0;" onclick="this.closest('.row').innerHTML='<span style=&quot;font-family:system-ui;color:#5e8470;font-weight:600&quot;>✓ Las junté en una.</span>'">Sí, es la misma</button>
|
| 363 |
+
<!-- Inner quotes are written as &quot; so they do not terminate the onclick attribute. -->
<button class="btn btn-ghost" onclick="this.closest('.row').innerHTML='<span style=&quot;font-family:system-ui;color:#6f675d&quot;>Las dejé separadas.</span>'">Son distintas</button>
|
| 364 |
+
</div>
|
| 365 |
+
</div>
|
| 366 |
+
|
| 367 |
+
<!-- honest flags -->
|
| 368 |
+
<div class="section-title">No estuve segura de esto — se lo dejé a usted</div>
|
| 369 |
+
|
| 370 |
+
<div class="flag">
|
| 371 |
+
<span class="mark">🤔</span>
|
| 372 |
+
<p>Dos teléfonos tenían solo 7 dígitos. No quise inventar los que faltan.<br/>
|
| 373 |
+
<span class="small">Los dejé tal cual para que usted los revise contra su libreta.</span></p>
|
| 374 |
+
</div>
|
| 375 |
+
<div class="flag">
|
| 376 |
+
<span class="mark">🧮</span>
|
| 377 |
+
<p>El total de su caja dice <b>$48,920</b>, pero su lista suma <b>$48,655</b>.<br/>
|
| 378 |
+
<span class="small">No cuadran por $265 — aquí se lo marco para que lo compare con su efectivo.</span></p>
|
| 379 |
+
</div>
|
| 380 |
+
|
| 381 |
+
<!-- bonus -->
|
| 382 |
+
<div class="bonus">
|
| 383 |
+
<span class="ic">🌶️</span>
|
| 384 |
+
<div>
|
| 385 |
+
<h3>Ah, y una cosita más…</h3>
|
| 386 |
+
<p>Se le está acabando el <b>adobo de pastor</b> — fue el más vendido del mes. Quizá conviene pedir más antes del finde.</p>
|
| 387 |
+
</div>
|
| 388 |
+
</div>
|
| 389 |
+
|
| 390 |
+
<!-- download -->
|
| 391 |
+
<div class="download">
|
| 392 |
+
<button class="btn btn-primary">Descargar mi copia limpia</button>
|
| 393 |
+
<p class="aside">
|
| 394 |
+
Su original sigue a salvo en su USB. ·
|
| 395 |
+
<a onclick="alert('Su archivo original nunca se tocó — está justo donde lo dejó.')">Devolver todo como estaba</a><br/>
|
| 396 |
+
También puede <a onclick="window.print()">imprimir este resumen</a> para Yolanda.
|
| 397 |
+
</p>
|
| 398 |
+
</div>
|
| 399 |
+
|
| 400 |
+
<div class="safety" style="margin-top:30px;">
|
| 401 |
+
<svg width="22" height="22" viewBox="0 0 24 24" fill="none"><path d="M12 2l8 3v6c0 5-3.5 8-8 9-4.5-1-8-4-8-9V5l8-3z" stroke="#5e8470" stroke-width="1.6"/><path d="M9 12l2 2 4-4" stroke="#5e8470" stroke-width="1.6" stroke-linecap="round" stroke-linejoin="round"/></svg>
|
| 402 |
+
<div><b>Buen trabajo, Doña Lupe.</b> Su lista quedó en buen estado, y usted la revisó con sus propios ojos. Nada salió de esta computadora.</div>
|
| 403 |
+
</div>
|
| 404 |
+
|
| 405 |
+
<button class="reset" onclick="reset()">Empezar de nuevo con otro archivo</button>
|
| 406 |
+
</div>
|
| 407 |
+
</section>
|
| 408 |
+
|
| 409 |
+
</div>
|
| 410 |
+
|
| 411 |
+
<script>
|
| 412 |
+
function show(id){ // Make the element with this id the only '.screen' element carrying the 'active' class.
|
| 413 |
+
document.querySelectorAll('.screen').forEach(s=>s.classList.remove('active')); // clear 'active' from every screen first
|
| 414 |
+
document.getElementById(id).classList.add('active'); // throws a TypeError if no element has this id
|
| 415 |
+
window.scrollTo({top:0,behavior:'smooth'}); // scroll back to the top of the page for the newly activated screen
|
| 416 |
+
}
|
| 417 |
+
function goWork(){ // Activate screen 's2', mark each '#steps li' as 'done' one at a time, then activate screen 's3'.
|
| 418 |
+
show('s2');
|
| 419 |
+
const lis=document.querySelectorAll('#steps li');
|
| 420 |
+
lis.forEach(l=>l.classList.remove('done')); // reset any 'done' marks left over from a previous run
|
| 421 |
+
let i=0; // index of the next step to mark 'done'
|
| 422 |
+
const t=setInterval(()=>{ // one tick every 720 ms
|
| 423 |
+
if(i<lis.length){lis[i].classList.add('done');i++;}
|
| 424 |
+
else{clearInterval(t);setTimeout(()=>show('s3'),650);} // every step marked: stop the interval, wait 650 ms, then show 's3'
|
| 425 |
+
},720);
|
| 426 |
+
}
|
| 427 |
+
function reset(){show('s1');} // Go back to the first screen ('s1').
|
| 428 |
+
</script>
|
| 429 |
+
</body>
|
| 430 |
+
</html>
|
design/mockups/cozy/index.html
ADDED
|
@@ -0,0 +1,526 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="es">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 6 |
+
<title>ScrubData — tu ayudante de cocina para los números</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root{
|
| 9 |
+
--paper:#fbf4e7;
|
| 10 |
+
--paper-2:#f5ead4;
|
| 11 |
+
--card:#fffaf0;
|
| 12 |
+
--ink:#4a3a2c;
|
| 13 |
+
--ink-soft:#7a6a58;
|
| 14 |
+
--line:#e6d6ba;
|
| 15 |
+
--moss:#6f8f5a;
|
| 16 |
+
--moss-deep:#52733f;
|
| 17 |
+
--berry:#c4694e;
|
| 18 |
+
--gold:#d9a441;
|
| 19 |
+
--sky:#8aa9b8;
|
| 20 |
+
--shadow:0 10px 30px rgba(120,90,50,.12);
|
| 21 |
+
--shadow-soft:0 4px 14px rgba(120,90,50,.10);
|
| 22 |
+
--radius:22px;
|
| 23 |
+
}
|
| 24 |
+
*{box-sizing:border-box}
|
| 25 |
+
html,body{margin:0}
|
| 26 |
+
body{
|
| 27 |
+
font-family:"Iowan Old Style","Palatino Linotype","Book Antiqua",Georgia,"Segoe UI",serif;
|
| 28 |
+
color:var(--ink);
|
| 29 |
+
background:
|
| 30 |
+
radial-gradient(circle at 15% 8%, #fdf8ee 0%, transparent 40%),
|
| 31 |
+
radial-gradient(circle at 90% 92%, #f6ecd6 0%, transparent 45%),
|
| 32 |
+
var(--paper);
|
| 33 |
+
line-height:1.55;
|
| 34 |
+
-webkit-font-smoothing:antialiased;
|
| 35 |
+
min-height:100vh;
|
| 36 |
+
}
|
| 37 |
+
/* faint paper grain + dotted-trail texture */
|
| 38 |
+
body::before{
|
| 39 |
+
content:"";position:fixed;inset:0;pointer-events:none;z-index:0;opacity:.5;
|
| 40 |
+
background-image:radial-gradient(rgba(180,150,100,.10) 1px, transparent 1.4px);
|
| 41 |
+
background-size:22px 22px;
|
| 42 |
+
}
|
| 43 |
+
.wrap{position:relative;z-index:1;max-width:880px;margin:0 auto;padding:28px 20px 80px}
|
| 44 |
+
|
| 45 |
+
/* ---------- top bar ---------- */
|
| 46 |
+
.topbar{display:flex;align-items:center;justify-content:space-between;gap:12px;margin-bottom:14px}
|
| 47 |
+
.brand{display:flex;align-items:center;gap:12px}
|
| 48 |
+
.logo{width:46px;height:46px;flex:0 0 auto}
|
| 49 |
+
.brand h1{font-size:1.32rem;margin:0;letter-spacing:.2px}
|
| 50 |
+
.brand .tag{margin:0;font-size:.86rem;color:var(--ink-soft);font-style:italic}
|
| 51 |
+
.lang{display:flex;background:var(--card);border:1.5px solid var(--line);border-radius:999px;padding:3px;box-shadow:var(--shadow-soft)}
|
| 52 |
+
.lang button{border:0;background:transparent;font:inherit;font-size:.85rem;color:var(--ink-soft);padding:5px 13px;border-radius:999px;cursor:pointer}
|
| 53 |
+
.lang button.on{background:var(--moss);color:#fff;box-shadow:0 2px 6px rgba(80,110,60,.35)}
|
| 54 |
+
|
| 55 |
+
/* ---------- persistent safety ribbon ---------- */
|
| 56 |
+
.safe{
|
| 57 |
+
display:flex;align-items:center;gap:12px;
|
| 58 |
+
background:linear-gradient(180deg,#f2f7ec,#eaf2e0);
|
| 59 |
+
border:1.5px solid #d6e3c4;border-radius:16px;
|
| 60 |
+
padding:11px 16px;margin-bottom:22px;box-shadow:var(--shadow-soft);
|
| 61 |
+
}
|
| 62 |
+
.safe svg{flex:0 0 auto}
|
| 63 |
+
.safe p{margin:0;font-size:.92rem;color:var(--moss-deep)}
|
| 64 |
+
.safe b{color:var(--moss-deep)}
|
| 65 |
+
|
| 66 |
+
/* ---------- cards / screens ---------- */
|
| 67 |
+
.screen{display:none;animation:rise .5s ease both}
|
| 68 |
+
.screen.active{display:block}
|
| 69 |
+
@keyframes rise{from{opacity:0;transform:translateY(14px)}to{opacity:1;transform:none}}
|
| 70 |
+
|
| 71 |
+
.card{
|
| 72 |
+
background:var(--card);border:1.5px solid var(--line);
|
| 73 |
+
border-radius:var(--radius);box-shadow:var(--shadow);
|
| 74 |
+
padding:30px 30px 32px;position:relative;
|
| 75 |
+
}
|
| 76 |
+
.card + .card{margin-top:20px}
|
| 77 |
+
|
| 78 |
+
/* ---------- welcome / drop ---------- */
|
| 79 |
+
.hello{text-align:center}
|
| 80 |
+
.hello h2{font-size:1.75rem;margin:6px 0 6px}
|
| 81 |
+
.hello .sub{color:var(--ink-soft);font-size:1.05rem;margin:0 auto 24px;max-width:520px}
|
| 82 |
+
.drop{
|
| 83 |
+
border:2.5px dashed #d8b873;border-radius:20px;
|
| 84 |
+
background:linear-gradient(180deg,#fffdf6,#fdf3df);
|
| 85 |
+
padding:38px 24px;text-align:center;cursor:pointer;transition:.2s;
|
| 86 |
+
}
|
| 87 |
+
.drop:hover{border-color:var(--gold);background:#fff8e8;transform:translateY(-2px)}
|
| 88 |
+
.drop .basket{font-size:0;line-height:0;margin-bottom:10px}
|
| 89 |
+
.drop h3{margin:8px 0 4px;font-size:1.2rem}
|
| 90 |
+
.drop p{margin:0;color:var(--ink-soft);font-size:.95rem}
|
| 91 |
+
.or{color:var(--ink-soft);font-size:.85rem;margin:14px 0 4px}
|
| 92 |
+
.filechip{
|
| 93 |
+
display:inline-flex;align-items:center;gap:10px;background:#fff;border:1.5px solid var(--line);
|
| 94 |
+
border-radius:14px;padding:9px 14px;margin-top:6px;font-size:.92rem;box-shadow:var(--shadow-soft)
|
| 95 |
+
}
|
| 96 |
+
.filechip .x{color:var(--ink-soft);font-size:.8rem}
|
| 97 |
+
.btn{
|
| 98 |
+
border:0;font:inherit;cursor:pointer;border-radius:16px;font-size:1.06rem;
|
| 99 |
+
padding:14px 30px;font-weight:600;letter-spacing:.2px;transition:.16s;
|
| 100 |
+
}
|
| 101 |
+
.btn-go{background:var(--berry);color:#fff;box-shadow:0 6px 16px rgba(196,105,78,.35);margin-top:24px}
|
| 102 |
+
.btn-go:hover{transform:translateY(-2px);box-shadow:0 9px 22px rgba(196,105,78,.42)}
|
| 103 |
+
.btn-ghost{background:#fff;color:var(--ink);border:1.5px solid var(--line)}
|
| 104 |
+
.btn-ghost:hover{background:#fffdf6}
|
| 105 |
+
|
| 106 |
+
/* ---------- tidying ---------- */
|
| 107 |
+
.tidy{text-align:center;padding:54px 30px}
|
| 108 |
+
.tidy h2{font-size:1.5rem;margin:18px 0 6px}
|
| 109 |
+
.tidy p{color:var(--ink-soft);margin:0 auto;max-width:440px}
|
| 110 |
+
.scene{width:160px;height:120px;margin:0 auto 6px;position:relative}
|
| 111 |
+
.broom{position:absolute;left:46px;top:6px;transform-origin:78px 12px;animation:sweep 1.1s ease-in-out infinite}
|
| 112 |
+
@keyframes sweep{0%,100%{transform:rotate(-13deg)}50%{transform:rotate(13deg)}}
|
| 113 |
+
.spk{position:absolute;font-size:0;animation:twinkle 1.4s ease-in-out infinite}
|
| 114 |
+
.spk:nth-child(1){left:24px;top:70px;animation-delay:.1s} /* the three sparkles are children 1–3 of .scene; .broom is child 4 */
|
| 115 |
+
.spk:nth-child(2){left:120px;top:54px;animation-delay:.5s}
|
| 116 |
+
.spk:nth-child(3){left:70px;top:96px;animation-delay:.8s}
|
| 117 |
+
@keyframes twinkle{0%,100%{opacity:.2;transform:scale(.7)}50%{opacity:1;transform:scale(1.1)}}
|
| 118 |
+
.bar{height:12px;background:#efe2c8;border-radius:99px;overflow:hidden;max-width:340px;margin:22px auto 0;border:1px solid var(--line)}
|
| 119 |
+
.bar i{display:block;height:100%;width:0;background:linear-gradient(90deg,var(--moss),var(--gold));border-radius:99px;animation:fill 4.2s ease forwards}
|
| 120 |
+
@keyframes fill{to{width:100%}}
|
| 121 |
+
.tidy .micro{font-size:.85rem;color:var(--moss-deep);margin-top:14px}
|
| 122 |
+
|
| 123 |
+
/* ---------- result ---------- */
|
| 124 |
+
.result-head{text-align:center;margin-bottom:6px}
|
| 125 |
+
.badge-row{display:flex;justify-content:center;gap:10px;margin-bottom:8px}
|
| 126 |
+
.merit{display:flex;flex-direction:column;align-items:center;gap:4px;font-size:.72rem;color:var(--moss-deep);width:84px;text-align:center}
|
| 127 |
+
.result-head h2{font-size:1.6rem;margin:6px 0 2px}
|
| 128 |
+
.result-head .sub{color:var(--ink-soft);margin:0 0 6px}
|
| 129 |
+
|
| 130 |
+
.summary{background:linear-gradient(180deg,#fffdf6,#fbf3e0);border:1.5px solid var(--line)}
|
| 131 |
+
.summary h3{margin:0 0 4px;font-size:1.22rem}
|
| 132 |
+
.summary .read{font-size:.82rem;color:var(--ink-soft);font-style:italic;margin:0 0 14px}
|
| 133 |
+
.sline{display:flex;gap:13px;align-items:flex-start;padding:11px 0;border-top:1px dotted var(--line)}
|
| 134 |
+
.sline:first-of-type{border-top:0}
|
| 135 |
+
.sline .ic{flex:0 0 auto;margin-top:2px}
|
| 136 |
+
.sline p{margin:0;font-size:1.02rem}
|
| 137 |
+
.sline b{color:var(--moss-deep)}
|
| 138 |
+
|
| 139 |
+
.secttitle{font-size:1.05rem;color:var(--ink-soft);margin:26px 4px 10px;display:flex;align-items:center;gap:8px;font-style:italic}
|
| 140 |
+
|
| 141 |
+
/* change cards */
|
| 142 |
+
.chg{padding:18px 20px}
|
| 143 |
+
.chg.done{border-left:6px solid var(--moss)}
|
| 144 |
+
.chg.ask{border-left:6px solid var(--gold);background:linear-gradient(180deg,#fffdf3,#fdf6e2)}
|
| 145 |
+
.chg.flag{border-left:6px solid var(--sky)}
|
| 146 |
+
.chg h4{margin:0 0 10px;font-size:1.08rem;display:flex;align-items:center;gap:9px}
|
| 147 |
+
.chk{font-size:.72rem;background:#eaf2e0;color:var(--moss-deep);padding:2px 9px;border-radius:99px;font-weight:600;letter-spacing:.3px}
|
| 148 |
+
.pill-ask{font-size:.72rem;background:#f7ead0;color:#9a7a2e;padding:2px 9px;border-radius:99px;font-weight:600;letter-spacing:.3px}
|
| 149 |
+
.pill-flag{font-size:.72rem;background:#e3edf2;color:#5b7d8c;padding:2px 9px;border-radius:99px;font-weight:600;letter-spacing:.3px}
|
| 150 |
+
|
| 151 |
+
.ba{display:flex;gap:12px;align-items:stretch;flex-wrap:wrap}
|
| 152 |
+
.ba .col{flex:1 1 200px;border:1.5px solid var(--line);border-radius:14px;overflow:hidden;background:#fff}
|
| 153 |
+
.ba .col .ttl{font-size:.74rem;letter-spacing:.6px;text-transform:uppercase;color:var(--ink-soft);padding:7px 12px;background:#faf4e6;border-bottom:1px solid var(--line)}
|
| 154 |
+
.ba .col.after .ttl{background:#eef5e6;color:var(--moss-deep)}
|
| 155 |
+
.row{display:flex;justify-content:space-between;gap:10px;padding:7px 12px;font-size:.95rem;border-top:1px dashed #efe6d2}
|
| 156 |
+
.row:first-of-type{border-top:0}
|
| 157 |
+
.row .q{color:var(--ink-soft)}
|
| 158 |
+
.ba .col.before .was{color:#a9947d}
|
| 159 |
+
.ba .col.after .now{color:var(--moss-deep);font-weight:600}
|
| 160 |
+
.arrow{display:flex;align-items:center;color:var(--gold);font-size:1.3rem}
|
| 161 |
+
@media(max-width:560px){.arrow{transform:rotate(90deg)}}
|
| 162 |
+
|
| 163 |
+
.askbtns{display:flex;gap:10px;margin-top:14px;flex-wrap:wrap}
|
| 164 |
+
.askbtns .yes{background:var(--moss);color:#fff;padding:9px 18px;border-radius:13px;border:0;font:inherit;font-weight:600;cursor:pointer}
|
| 165 |
+
.askbtns .no{background:#fff;color:var(--ink);border:1.5px solid var(--line);padding:9px 18px;border-radius:13px;font:inherit;cursor:pointer}
|
| 166 |
+
.askbtns .yes:hover{background:var(--moss-deep)}
|
| 167 |
+
.answered{display:none;align-items:center;gap:8px;color:var(--moss-deep);font-size:.92rem;margin-top:12px;background:#eef5e6;padding:8px 12px;border-radius:11px}
|
| 168 |
+
|
| 169 |
+
/* bonus card */
|
| 170 |
+
.bonus{background:linear-gradient(135deg,#fdf6e6,#f6efe0);border:1.5px solid #ecd9b0}
|
| 171 |
+
.bonus h4{margin:0 0 6px;font-size:1.12rem;display:flex;align-items:center;gap:9px}
|
| 172 |
+
.bonus ul{margin:8px 0 0;padding-left:4px;list-style:none}
|
| 173 |
+
.bonus li{padding:5px 0;font-size:1rem;display:flex;gap:9px;align-items:center}
|
| 174 |
+
.bonus li .dot{width:9px;height:9px;border-radius:99px;background:var(--berry);flex:0 0 auto}
|
| 175 |
+
|
| 176 |
+
/* download footer */
|
| 177 |
+
.getit{text-align:center;background:linear-gradient(180deg,#f2f7ec,#e9f1de);border:1.5px solid #d6e3c4}
|
| 178 |
+
.getit h3{margin:0 0 4px;font-size:1.3rem;color:var(--moss-deep)}
|
| 179 |
+
.getit p{margin:0 0 18px;color:var(--ink-soft)}
|
| 180 |
+
.getit .btns{display:flex;gap:12px;justify-content:center;flex-wrap:wrap}
|
| 181 |
+
.btn-dl{background:var(--moss);color:#fff;box-shadow:0 6px 16px rgba(80,110,60,.32)}
|
| 182 |
+
.btn-dl:hover{transform:translateY(-2px)}
|
| 183 |
+
.undo{margin-top:18px;font-size:.9rem;color:var(--moss-deep)}
|
| 184 |
+
.undo a{color:var(--berry);text-decoration:underline;cursor:pointer}
|
| 185 |
+
|
| 186 |
+
.restart{display:block;margin:26px auto 0;color:var(--ink-soft);background:none;border:0;font:inherit;font-size:.85rem;text-decoration:underline;cursor:pointer}
|
| 187 |
+
.footnote{text-align:center;color:var(--ink-soft);font-size:.8rem;margin-top:30px;font-style:italic}
|
| 188 |
+
.es{display:none}
|
| 189 |
+
body.es-on .en{display:none}
|
| 190 |
+
body.es-on .es{display:inline}
|
| 191 |
+
body.es-on .es.block{display:block}
|
| 192 |
+
</style>
|
| 193 |
+
</head>
|
| 194 |
+
<body class="es-on">
|
| 195 |
+
<div class="wrap">
|
| 196 |
+
|
| 197 |
+
<!-- top bar -->
|
| 198 |
+
<div class="topbar">
|
| 199 |
+
<div class="brand">
|
| 200 |
+
<svg class="logo" viewBox="0 0 48 48" fill="none">
|
| 201 |
+
<path d="M24 4c7 0 12 4 12 4s-2 8-2 14c0 9-5 18-10 18S14 31 14 22c0-6-2-14-2-14s5-4 12-4z" fill="#7e9f63" stroke="#52733f" stroke-width="1.6"/>
|
| 202 |
+
<path d="M24 9v28" stroke="#52733f" stroke-width="1.4"/>
|
| 203 |
+
<path d="M24 18l6-5M24 24l-6-5M24 30l6-5" stroke="#52733f" stroke-width="1.3"/>
|
| 204 |
+
</svg>
|
| 205 |
+
<div>
|
| 206 |
+
<h1>ScrubData</h1>
|
| 207 |
+
<p class="tag"><span class="es">tu ayudante para ordenar tus listas</span><span class="en">your little helper for tidy lists</span></p>
|
| 208 |
+
</div>
|
| 209 |
+
</div>
|
| 210 |
+
<div class="lang">
|
| 211 |
+
<button id="bES" class="on" onclick="setLang('es')">Español</button>
|
| 212 |
+
<button id="bEN" onclick="setLang('en')">English</button>
|
| 213 |
+
</div>
|
| 214 |
+
</div>
|
| 215 |
+
|
| 216 |
+
<!-- persistent safety ribbon -->
|
| 217 |
+
<div class="safe">
|
| 218 |
+
<svg width="26" height="26" viewBox="0 0 24 24" fill="none"><path d="M12 2l8 3v6c0 5-3.5 9-8 11C7.5 20 4 16 4 11V5l8-3z" fill="#cfe0bd" stroke="#52733f" stroke-width="1.4"/><path d="M8.5 12l2.5 2.5L16 9" stroke="#52733f" stroke-width="1.7" stroke-linecap="round" stroke-linejoin="round"/></svg>
|
| 219 |
+
<p>
|
| 220 |
+
<span class="es"><b>Tu archivo original queda igualito.</b> Nada sale de esta computadora — todo se hace aquí mismo.</span>
|
| 221 |
+
<span class="en"><b>Your original stays exactly as it is.</b> Nothing leaves this computer — it all happens right here.</span>
|
| 222 |
+
</p>
|
| 223 |
+
</div>
|
| 224 |
+
|
| 225 |
+
<!-- ===================== SCREEN 1: WELCOME ===================== -->
|
| 226 |
+
<section id="s1" class="screen active">
|
| 227 |
+
<div class="card hello">
|
| 228 |
+
<div style="font-size:0;line-height:0">
|
| 229 |
+
<svg width="86" height="74" viewBox="0 0 86 74" fill="none" style="margin:0 auto">
|
| 230 |
+
<ellipse cx="43" cy="64" rx="30" ry="6" fill="#ead9b9"/>
|
| 231 |
+
<path d="M16 40h54l-5 22a4 4 0 0 1-4 3H25a4 4 0 0 1-4-3L16 40z" fill="#e7b86b" stroke="#b9863a" stroke-width="1.6"/>
|
| 232 |
+
<path d="M16 40h54" stroke="#b9863a" stroke-width="1.6"/>
|
| 233 |
+
<path d="M22 40c0-12 9-21 21-21s21 9 21 21" stroke="#b9863a" stroke-width="1.6" fill="#f3d9a3"/>
|
| 234 |
+
<circle cx="34" cy="33" r="3" fill="#c4694e"/><circle cx="50" cy="31" r="3" fill="#6f8f5a"/><circle cx="43" cy="36" r="3" fill="#d9a441"/>
|
| 235 |
+
</svg>
|
| 236 |
+
</div>
|
| 237 |
+
<h2>
|
| 238 |
+
<span class="es">Hola, Doña Lupe. ¿Le ayudo con su lista?</span>
|
| 239 |
+
<span class="en">Hi, Lupe. Want a hand with your list?</span>
|
| 240 |
+
</h2>
|
| 241 |
+
<p class="sub">
|
| 242 |
+
<span class="es">Suéltela aquí y yo la reviso con calma — sin botones raros ni cosas que configurar.</span>
|
| 243 |
+
<span class="en">Drop it here and I'll look it over, calmly — no strange buttons, nothing to set up.</span>
|
| 244 |
+
</p>
|
| 245 |
+
|
| 246 |
+
<div class="drop" role="button" tabindex="0" onclick="pick()" onkeydown="if(event.key==='Enter'||event.key===' '){event.preventDefault();pick()}"><!-- focusable + Enter/Space so the file picker can be reached without a mouse -->
|
| 247 |
+
<div class="basket">
|
| 248 |
+
<svg width="58" height="50" viewBox="0 0 58 50" fill="none" style="margin:0 auto">
|
| 249 |
+
<path d="M6 22h46l-4 22a3 3 0 0 1-3 3H13a3 3 0 0 1-3-3L6 22z" fill="#f3e3c2" stroke="#c79a52" stroke-width="1.6"/>
|
| 250 |
+
<path d="M6 22h46M16 22l3 25M40 22l-3 25M29 22v25" stroke="#c79a52" stroke-width="1.2"/>
|
| 251 |
+
<path d="M16 22c0-9 6-15 13-15s13 6 13 15" stroke="#c79a52" stroke-width="1.6"/>
|
| 252 |
+
</svg>
|
| 253 |
+
</div>
|
| 254 |
+
<h3><span class="es">Suelte su archivo aquí</span><span class="en">Drop your file here</span></h3>
|
| 255 |
+
<p><span class="es">Excel o CSV — yo me encargo del resto.</span><span class="en">Excel or CSV — I'll handle the rest.</span></p>
|
| 256 |
+
<p class="or"><span class="es">— o —</span><span class="en">— or —</span></p>
|
| 257 |
+
<span class="filechip">
|
| 258 |
+
<svg width="16" height="16" viewBox="0 0 24 24" fill="none"><path d="M6 3h8l5 5v13a1 1 0 0 1-1 1H6a1 1 0 0 1-1-1V4a1 1 0 0 1 1-1z" fill="#f3e3c2" stroke="#c79a52" stroke-width="1.4"/></svg>
|
| 259 |
+
<span class="es">elegir de mi computadora</span><span class="en">choose from my computer</span>
|
| 260 |
+
</span>
|
| 261 |
+
</div>
|
| 262 |
+
|
| 263 |
+
<div style="margin-top:8px">
|
| 264 |
+
<span class="filechip" style="border-color:#cfe0bd;background:#f2f7ec">
|
| 265 |
+
<svg width="15" height="15" viewBox="0 0 24 24"><circle cx="12" cy="12" r="10" fill="#cfe0bd"/><path d="M8 12l3 3 5-6" stroke="#52733f" stroke-width="2" fill="none" stroke-linecap="round"/></svg>
|
| 266 |
+
ventas-mayo.xlsx <span class="x">· 1,431 <span class="es">renglones</span><span class="en">lines</span></span>
|
| 267 |
+
</span>
|
| 268 |
+
</div>
|
| 269 |
+
|
| 270 |
+
<button class="btn btn-go" onclick="go()">
|
| 271 |
+
<span class="es">Vamos a ordenarla ✦</span><span class="en">Let's tidy it up ✦</span>
|
| 272 |
+
</button>
|
| 273 |
+
</div>
|
| 274 |
+
</section>
|
| 275 |
+
|
| 276 |
+
<!-- ===================== SCREEN 2: TIDYING ===================== -->
|
| 277 |
+
<section id="s2" class="screen">
|
| 278 |
+
<div class="card tidy">
|
| 279 |
+
<div class="scene">
|
| 280 |
+
<span class="spk"><svg width="14" height="14" viewBox="0 0 24 24"><path d="M12 2l2 8 8 2-8 2-2 8-2-8-8-2 8-2 2-8z" fill="#d9a441"/></svg></span>
|
| 281 |
+
<span class="spk"><svg width="11" height="11" viewBox="0 0 24 24"><path d="M12 2l2 8 8 2-8 2-2 8-2-8-8-2 8-2 2-8z" fill="#6f8f5a"/></svg></span>
|
| 282 |
+
<span class="spk"><svg width="13" height="13" viewBox="0 0 24 24"><path d="M12 2l2 8 8 2-8 2-2 8-2-8-8-2 8-2 2-8z" fill="#c4694e"/></svg></span>
|
| 283 |
+
<div class="broom">
|
| 284 |
+
<svg width="64" height="110" viewBox="0 0 64 110" fill="none">
|
| 285 |
+
<rect x="30" y="2" width="5" height="64" rx="2.5" fill="#b9863a"/>
|
| 286 |
+
<path d="M18 64h28l6 38c0 3-3 4-6 4H18c-3 0-6-1-6-4l6-38z" fill="#e7b86b" stroke="#b9863a" stroke-width="1.6"/>
|
| 287 |
+
<path d="M22 78v24M30 78v26M38 78v24M46 78v22" stroke="#b9863a" stroke-width="1.3"/>
|
| 288 |
+
</svg>
|
| 289 |
+
</div>
|
| 290 |
+
</div>
|
| 291 |
+
<h2><span class="es">Ordenando con cuidado…</span><span class="en">Tidying up, gently…</span></h2>
|
| 292 |
+
<p>
|
| 293 |
+
<span class="es">Estoy aquí mismo en su computadora, sin prisas. Su archivo original sigue a salvo.</span>
|
| 294 |
+
<span class="en">I'm right here on your computer, taking my time. Your original is safe.</span>
|
| 295 |
+
</p>
|
| 296 |
+
<div class="bar"><i></i></div>
|
| 297 |
+
<p class="micro" id="step">
|
| 298 |
+
<span class="es">Juntando los tacos que están escritos de varias maneras…</span>
|
| 299 |
+
<span class="en">Gathering the items written a few different ways…</span>
|
| 300 |
+
</p>
|
| 301 |
+
</div>
|
| 302 |
+
</section>
|
| 303 |
+
|
| 304 |
+
<!-- ===================== SCREEN 3: RESULT ===================== -->
|
| 305 |
+
<section id="s3" class="screen">
|
| 306 |
+
|
| 307 |
+
<!-- merit + hero -->
|
| 308 |
+
<div class="card result-head">
|
| 309 |
+
<div class="badge-row">
|
| 310 |
+
<div class="merit">
|
| 311 |
+
<svg width="52" height="52" viewBox="0 0 52 52"><circle cx="26" cy="26" r="22" fill="#eef5e6" stroke="#6f8f5a" stroke-width="2"/><path d="M26 6l4 5 6-2-1 6 6 3-5 4 2 6-6-1-3 6-3-6-6 1 2-6-5-4 6-3-1-6 6 2 4-5z" fill="#cfe0bd"/><path d="M20 26l4 4 8-9" stroke="#52733f" stroke-width="2.4" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg>
|
| 312 |
+
<span><span class="es">lista ordenada</span><span class="en">tidy list</span></span>
|
| 313 |
+
</div>
|
| 314 |
+
<div class="merit">
|
| 315 |
+
<svg width="52" height="52" viewBox="0 0 52 52"><circle cx="26" cy="26" r="22" fill="#fdf2dc" stroke="#d9a441" stroke-width="2"/><path d="M26 14a8 8 0 0 1 8 8c0 5-8 12-8 12s-8-7-8-12a8 8 0 0 1 8-8z" fill="#f3d9a3" stroke="#b9863a" stroke-width="1.4"/><circle cx="26" cy="22" r="3" fill="#c4694e"/></svg>
|
| 316 |
+
<span><span class="es">nada se subió</span><span class="en">nothing uploaded</span></span>
|
| 317 |
+
</div>
|
| 318 |
+
</div>
|
| 319 |
+
<h2><span class="es">Listo. Esto fue lo que encontré 🌿</span><span class="en">All done. Here's what I found 🌿</span></h2>
|
| 320 |
+
<p class="sub"><span class="es">Léalo con calma. Usted decide lo que toca el dinero.</span><span class="en">Read it calmly. You decide anything that touches money.</span></p>
|
| 321 |
+
</div>
|
| 322 |
+
|
| 323 |
+
<!-- THE SUMMARY (hero) -->
|
| 324 |
+
<div class="card summary">
|
| 325 |
+
<h3><span class="es">Su resumen, en palabras sencillas</span><span class="en">Your summary, in plain words</span></h3>
|
| 326 |
+
<p class="read"><span class="es">— puede leerlo en voz alta a Yolanda, o imprimirlo.</span><span class="en">— you can read it aloud to Yolanda, or print it.</span></p>
|
| 327 |
+
|
| 328 |
+
<div class="sline">
|
| 329 |
+
<span class="ic"><svg width="22" height="22" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#eef5e6"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg></span>
|
| 330 |
+
<p class="es">El <b>al pastor</b> estaba escrito de <b>4 maneras</b> (al pastor, Al Pastor, pastor, "al pstr"). Los junté todos: <b>1,204 vendidos</b>.</p>
|
| 331 |
+
<p class="en"><b>Al pastor</b> was written <b>4 ways</b> (al pastor, Al Pastor, pastor, "al pstr"). I counted them together: <b>1,204 sold</b>.</p>
|
| 332 |
+
</div>
|
| 333 |
+
<div class="sline">
|
| 334 |
+
<span class="ic"><svg width="22" height="22" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#eef5e6"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg></span>
|
| 335 |
+
<p class="es">Unos espacios estaban en blanco (escritos como <b>"N/A"</b> o solo una raya). Los traté como vacíos.</p>
|
| 336 |
+
<p class="en">Some spots were left blank (written as <b>"N/A"</b> or just a dash). I treated those as empty.</p>
|
| 337 |
+
</div>
|
| 338 |
+
<div class="sline">
|
| 339 |
+
<span class="ic"><svg width="22" height="22" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#eef5e6"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg></span>
|
| 340 |
+
<p class="es">Puse todos los <b>teléfonos</b> y las <b>fechas</b> igualitos, para que se lean fácil.</p>
|
| 341 |
+
<p class="en">I made all the <b>phone numbers</b> and <b>dates</b> match, so they're easy to read.</p>
|
| 342 |
+
</div>
|
| 343 |
+
<div class="sline">
|
| 344 |
+
<span class="ic"><svg width="22" height="22" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#fdf2dc"/><path d="M12 6v7M12 16.5v.5" stroke="#b9863a" stroke-width="2.2" fill="none" stroke-linecap="round"/></svg></span>
|
| 345 |
+
<p class="es">Hay <b>2 cositas</b> que prefiero <b>preguntarle</b> antes de tocar — porque tienen que ver con dinero. Están abajo. 👇</p>
|
| 346 |
+
<p class="en">There are <b>2 things</b> I'd rather <b>ask you</b> about before touching — because they involve money. They're below. 👇</p>
|
| 347 |
+
</div>
|
| 348 |
+
</div>
|
| 349 |
+
|
| 350 |
+
<!-- DONE change card with before/after -->
|
| 351 |
+
<div class="secttitle">
|
| 352 |
+
<svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="10" fill="#cfe0bd"/><path d="M8 12l3 3 5-6" stroke="#52733f" stroke-width="2" fill="none" stroke-linecap="round"/></svg>
|
| 353 |
+
<span class="es">Lo que ya dejé arregladito</span><span class="en">What I already tidied for you</span>
|
| 354 |
+
</div>
|
| 355 |
+
|
| 356 |
+
<div class="card chg done">
|
| 357 |
+
<h4><span class="es">El mismo taco, contado junto</span><span class="en">The same taco, counted together</span> <span class="chk"><span class="es">YA HECHO</span><span class="en">DONE</span></span></h4>
|
| 358 |
+
<div class="ba">
|
| 359 |
+
<div class="col before">
|
| 360 |
+
<div class="ttl"><span class="es">Antes</span><span class="en">Before</span></div>
|
| 361 |
+
<div class="row"><span class="q was">al pastor</span><span class="was">312</span></div>
|
| 362 |
+
<div class="row"><span class="q was">Al Pastor</span><span class="was">520</span></div>
|
| 363 |
+
<div class="row"><span class="q was">pastor</span><span class="was">301</span></div>
|
| 364 |
+
<div class="row"><span class="q was">al pstr</span><span class="was">71</span></div>
|
| 365 |
+
</div>
|
| 366 |
+
<div class="arrow">➜</div>
|
| 367 |
+
<div class="col after">
|
| 368 |
+
<div class="ttl"><span class="es">Después</span><span class="en">After</span></div>
|
| 369 |
+
<div class="row"><span class="q">Al pastor</span><span class="now">1,204</span></div>
|
| 370 |
+
<div class="row" style="color:var(--ink-soft)"><span class="q" style="font-style:italic"><span class="es">una sola fila, bien clara</span><span class="en">one tidy line</span></span><span></span></div>
|
| 371 |
+
</div>
|
| 372 |
+
</div>
|
| 373 |
+
</div>
|
| 374 |
+
|
| 375 |
+
<div class="card chg done">
|
| 376 |
+
<h4><span class="es">Los blancos disfrazados</span><span class="en">The disguised blanks</span> <span class="chk"><span class="es">YA HECHO</span><span class="en">DONE</span></span></h4>
|
| 377 |
+
<div class="ba">
|
| 378 |
+
<div class="col before">
|
| 379 |
+
<div class="ttl"><span class="es">Antes</span><span class="en">Before</span></div>
|
| 380 |
+
<div class="row"><span class="q was">tel.</span><span class="was">N/A</span></div>
|
| 381 |
+
<div class="row"><span class="q was">notas</span><span class="was">—</span></div>
|
| 382 |
+
<div class="row"><span class="q was">extra</span><span class="was">none</span></div>
|
| 383 |
+
</div>
|
| 384 |
+
<div class="arrow">➜</div>
|
| 385 |
+
<div class="col after">
|
| 386 |
+
<div class="ttl"><span class="es">Después</span><span class="en">After</span></div>
|
| 387 |
+
<div class="row"><span class="q">tel.</span><span class="now"><span class="es">(vacío)</span><span class="en">(empty)</span></span></div>
|
| 388 |
+
<div class="row"><span class="q">notas</span><span class="now"><span class="es">(vacío)</span><span class="en">(empty)</span></span></div>
|
| 389 |
+
<div class="row"><span class="q">extra</span><span class="now"><span class="es">(vacío)</span><span class="en">(empty)</span></span></div>
|
| 390 |
+
</div>
|
| 391 |
+
</div>
|
| 392 |
+
</div>
|
| 393 |
+
|
| 394 |
+
<!-- ASK cards (money / identity) -->
|
| 395 |
+
<div class="secttitle">
|
| 396 |
+
<svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="10" fill="#f3d9a3"/><path d="M12 7v6M12 16v.5" stroke="#b9863a" stroke-width="2" fill="none" stroke-linecap="round"/></svg>
|
| 397 |
+
<span class="es">Aquí mejor le pregunto a usted</span><span class="en">Here I'd better ask you</span>
|
| 398 |
+
</div>
|
| 399 |
+
|
| 400 |
+
<div class="card chg ask">
|
| 401 |
+
<h4><span class="es">31 renglones marcaron $0.00</span><span class="en">31 lines showed $0.00</span> <span class="pill-ask"><span class="es">¿ME DICE?</span><span class="en">YOUR CALL</span></span></h4>
|
| 402 |
+
<p style="margin:0 0 4px">
|
| 403 |
+
<span class="es">Encontré <b>31 ventas en $0.00</b> — eso casi siempre es una falla de la caja, no una venta de verdad. ¿Quiere que las <b>deje fuera del total</b> del mes?</span>
|
| 404 |
+
<span class="en">I found <b>31 sales at $0.00</b> — that's usually a register glitch, not a real sale. Want me to <b>leave them out of the month's total</b>?</span>
|
| 405 |
+
</p>
|
| 406 |
+
<div class="askbtns">
|
| 407 |
+
<button class="yes" onclick="answer(this)"><span class="es">Sí, déjalas fuera</span><span class="en">Yes, leave them out</span></button>
|
| 408 |
+
<button class="no" onclick="answer(this)"><span class="es">No, déjalas</span><span class="en">No, keep them</span></button>
|
| 409 |
+
</div>
|
| 410 |
+
<div class="answered">
|
| 411 |
+
<svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#cfe0bd"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round"/></svg>
|
| 412 |
+
<span class="es">Listo — usted decidió. Lo anoté en su resumen.</span><span class="en">Done — your call. I noted it in your summary.</span>
|
| 413 |
+
</div>
|
| 414 |
+
</div>
|
| 415 |
+
|
| 416 |
+
<div class="card chg ask">
|
| 417 |
+
<h4><span class="es">Dos clientes parecen el mismo</span><span class="en">Two customers look like the same one</span> <span class="pill-ask"><span class="es">¿ME DICE?</span><span class="en">YOUR CALL</span></span></h4>
|
| 418 |
+
<p style="margin:0 0 4px">
|
| 419 |
+
<span class="es"><b>"Yolanda R."</b> y <b>"Yolanda Reyes"</b> tienen el mismo teléfono. ¿Los <b>cuento como una sola persona</b>?</span>
|
| 420 |
+
<span class="en"><b>"Yolanda R."</b> and <b>"Yolanda Reyes"</b> share the same phone. Should I <b>count them as one person</b>?</span>
|
| 421 |
+
</p>
|
| 422 |
+
<div class="askbtns">
|
| 423 |
+
<button class="yes" onclick="answer(this)"><span class="es">Sí, es la misma</span><span class="en">Yes, same person</span></button>
|
| 424 |
+
<button class="no" onclick="answer(this)"><span class="es">No, son distintas</span><span class="en">No, keep both</span></button>
|
| 425 |
+
</div>
|
| 426 |
+
<div class="answered">
|
| 427 |
+
<svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="11" fill="#cfe0bd"/><path d="M7 12l3.5 3.5L17 8" stroke="#52733f" stroke-width="2.2" fill="none" stroke-linecap="round"/></svg>
|
| 428 |
+
<span class="es">Listo — usted decidió.</span><span class="en">Done — your call.</span>
|
| 429 |
+
</div>
|
| 430 |
+
</div>
|
| 431 |
+
|
| 432 |
+
<!-- HONEST FLAGS -->
|
| 433 |
+
<div class="secttitle">
|
| 434 |
+
<svg width="18" height="18" viewBox="0 0 24 24"><circle cx="12" cy="12" r="10" fill="#e3edf2"/><path d="M12 7v5M12 15v.5" stroke="#5b7d8c" stroke-width="2" fill="none" stroke-linecap="round"/></svg>
|
| 435 |
+
<span class="es">No estuve segura de esto — lo dejé para usted</span><span class="en">I wasn't sure about these — I left them for you</span>
|
| 436 |
+
</div>
|
| 437 |
+
<div class="card chg flag">
|
| 438 |
+
<h4><span class="es">Dos teléfonos raros y una nota de catering</span><span class="en">Two odd phones and a catering note</span> <span class="pill-flag"><span class="es">PARA REVISAR</span><span class="en">FOR YOU</span></span></h4>
|
| 439 |
+
<p style="margin:0">
|
| 440 |
+
<span class="es">Dos teléfonos tienen muy pocos números, y una nota dice "evento — preguntar a Memo". No quise adivinar, así que <b>los dejé tal cual</b> para que usted los vea con calma.</span>
|
| 441 |
+
<span class="en">Two phones have too few digits, and one note says "event — ask Memo." I didn't want to guess, so I <b>left them exactly as they were</b> for you to peek at.</span>
|
| 442 |
+
</p>
|
| 443 |
+
</div>
|
| 444 |
+
|
| 445 |
+
<!-- BONUS -->
|
| 446 |
+
<div class="card bonus">
|
| 447 |
+
<h4>
|
| 448 |
+
<svg width="24" height="24" viewBox="0 0 24 24"><path d="M5 9h14l-1.3 9.2A2 2 0 0 1 15.7 20H8.3a2 2 0 0 1-2-1.8L5 9z" fill="#f3d9a3" stroke="#b9863a" stroke-width="1.3"/><path d="M8 9a4 4 0 0 1 8 0" stroke="#b9863a" stroke-width="1.3" fill="none"/></svg>
|
| 449 |
+
<span class="es">De pilón: lo que se le está acabando</span><span class="en">A little bonus: what you're running low on</span>
|
| 450 |
+
</h4>
|
| 451 |
+
<p style="margin:0;color:var(--ink-soft)">
|
| 452 |
+
<span class="es">Ya que andábamos en sus números, le aparté esto para el pedido:</span>
|
| 453 |
+
<span class="en">While I was in your numbers, I set this aside for your reorder:</span>
|
| 454 |
+
</p>
|
| 455 |
+
<ul>
|
| 456 |
+
<li><span class="dot"></span><span class="es"><b>Marinada de pastor</b> — para ~3 días. Tal vez pedir el lunes.</span><span class="en"><b>Pastor marinade</b> — about 3 days left. Maybe order Monday.</span></li>
|
| 457 |
+
<li><span class="dot"></span><span class="es"><b>Tortillas</b> — bajando rápido este fin de semana.</span><span class="en"><b>Tortillas</b> — going fast this weekend.</span></li>
|
| 458 |
+
</ul>
|
| 459 |
+
</div>
|
| 460 |
+
|
| 461 |
+
<!-- GET MY CLEAN COPY -->
|
| 462 |
+
<div class="card getit">
|
| 463 |
+
<svg width="58" height="58" viewBox="0 0 58 58" style="margin:0 auto 6px"><circle cx="29" cy="29" r="26" fill="#dcebcb" stroke="#6f8f5a" stroke-width="2"/><path d="M29 16v18M22 28l7 7 7-7" stroke="#52733f" stroke-width="3" fill="none" stroke-linecap="round" stroke-linejoin="round"/><path d="M19 40h20" stroke="#52733f" stroke-width="3" stroke-linecap="round"/></svg>
|
| 464 |
+
<h3><span class="es">¡Quedó preciosa, Doña Lupe!</span><span class="en">It looks lovely, Lupe!</span></h3>
|
| 465 |
+
<p><span class="es">Aquí está su copia limpia y su resumen para imprimir o mandar por correo.</span><span class="en">Here's your clean copy and your summary to print or email.</span></p>
|
| 466 |
+
<div class="btns">
|
| 467 |
+
<button class="btn btn-dl"><span class="es">Bajar mi copia limpia</span><span class="en">Get my clean copy</span></button>
|
| 468 |
+
<button class="btn btn-ghost"><span class="es">Imprimir el resumen</span><span class="en">Print the summary</span></button>
|
| 469 |
+
</div>
|
| 470 |
+
<p class="undo">
|
| 471 |
+
<svg width="15" height="15" viewBox="0 0 24 24" style="vertical-align:-2px"><path d="M12 5V2L7 7l5 5V8a6 6 0 1 1-6 6" stroke="#52733f" stroke-width="1.8" fill="none" stroke-linecap="round" stroke-linejoin="round"/></svg>
|
| 472 |
+
<span class="es">Su archivo original sigue a salvo. <a>Déjelo como estaba</a> cuando quiera.</span>
|
| 473 |
+
<span class="en">Your original is safe. <a>Put it back the way it was</a> any time.</span>
|
| 474 |
+
</p>
|
| 475 |
+
</div>
|
| 476 |
+
|
| 477 |
+
<button class="restart" onclick="reset()"><span class="es">↺ empezar de nuevo con otro archivo</span><span class="en">↺ start over with another file</span></button>
|
| 478 |
+
</section>
|
| 479 |
+
|
| 480 |
+
<p class="footnote">
|
| 481 |
+
<span class="es">Hecho con cariño para una hora tranquila en la mesa de la cocina · funciona sin internet</span>
|
| 482 |
+
<span class="en">Made with care for a quiet hour at the kitchen table · works without internet</span>
|
| 483 |
+
</p>
|
| 484 |
+
</div>
|
| 485 |
+
|
| 486 |
+
<script>
|
| 487 |
+
/**
 * Switch the mockup between its Spanish and English copy.
 *
 * Toggles the `es-on` class on <body>, highlights the matching language
 * button (#bES / #bEN) with the `on` class, and records the language code
 * on the root element's `lang` attribute.
 *
 * @param {string} l Language code. Only 'es' selects Spanish; every other
 *                   value highlights the English button.
 */
function setLang(l){
  const spanish = (l === 'es');
  const esButton = document.getElementById('bES');
  const enButton = document.getElementById('bEN');

  document.body.classList.toggle('es-on', spanish);
  esButton.classList.toggle('on', spanish);
  enButton.classList.toggle('on', !spanish);

  // The code is stored verbatim, whatever value was passed in.
  document.documentElement.lang = l;
}
|
| 493 |
+
/**
 * Make one `.screen` element the visible one.
 *
 * Removes the `active` class from every `.screen`, adds it to the element
 * whose id is given, then smooth-scrolls the window back to the top.
 *
 * @param {string} id Element id of the screen to activate.
 */
function show(id){
  for (const screen of document.querySelectorAll('.screen')) {
    screen.classList.remove('active');
  }

  const target = document.getElementById(id);
  target.classList.add('active');

  window.scrollTo({top:0,behavior:'smooth'});
}
|
| 498 |
+
/**
 * File-picker hook for the mockup. Intentionally a no-op.
 */
function pick(){
  /* mock: file already shown as chosen */
}
|
| 499 |
+
/**
 * Play the mocked "tidying" sequence.
 *
 * Switches to screen `s2`, cycles the #step caption through four progress
 * messages (one every 1050 ms) and moves on to screen `s3` after 4400 ms.
 * The caption language is fixed to whichever language is active when the
 * run starts (presence of `es-on` on <body>).
 *
 * Timer handles are kept on the function object (`go.ticker`, `go.finish`)
 * so that a new run can cancel a run that is still in flight.
 */
function go(){
  show('s2');

  const isES = document.body.classList.contains('es-on');
  const steps = isES ? [
    'Juntando los tacos que están escritos de varias maneras…',
    'Emparejando los teléfonos y las fechas…',
    'Buscando blancos disfrazados como "N/A" o una raya…',
    'Apartando lo que mejor le pregunto a usted…'
  ] : [
    'Gathering the items written a few different ways…',
    'Matching up the phone numbers and dates…',
    'Looking for blanks disguised as "N/A" or a dash…',
    'Setting aside the things I should ask you about…'
  ];

  // Cancel timers from an unfinished earlier run so two runs never race to
  // update the caption or call show('s3') twice. Clearing an undefined
  // handle is a harmless no-op on the very first run.
  clearInterval(go.ticker);
  clearTimeout(go.finish);

  const el = document.getElementById('step');

  // Always start from the first caption. Without this, a rerun keeps showing
  // the last caption written by the previous run until the first tick fires.
  el.textContent = steps[0];

  let i = 0;
  const ticker = setInterval(() => {
    i++;
    // The interval can tick once more than there are captions; ignore that tick.
    if (i < steps.length) {
      el.textContent = steps[i];
    }
  }, 1050);

  go.ticker = ticker;
  go.finish = setTimeout(() => {
    clearInterval(ticker);
    show('s3');
  }, 4400);
}
|
| 518 |
+
/**
 * Mark an "ask" card as answered.
 *
 * Finds the enclosing `.chg` card of the clicked button, hides its
 * `.askbtns` row and reveals its `.answered` confirmation as a flex row.
 * Which button was clicked is not recorded.
 *
 * @param {Element} btn The button that was clicked inside the card.
 */
function answer(btn){
  const card = btn.closest('.chg');
  const buttonRow = card.querySelector('.askbtns');
  const confirmation = card.querySelector('.answered');

  buttonRow.style.display = 'none';
  confirmation.style.display = 'flex';
}
|
| 523 |
+
/**
 * Start the walkthrough over from the first screen.
 *
 * Besides switching back to screen `s1`, this restores every "ask" card to
 * its unanswered state. answer() hides `.askbtns` and reveals `.answered`
 * through inline `display` styles; unless those are cleared, the next run
 * reaches the results screen with the questions already answered and no
 * buttons left to press.
 */
function reset(){
  // An empty string removes the inline override, so the stylesheet decides again.
  document.querySelectorAll('.chg .askbtns, .chg .answered').forEach((node) => {
    node.style.display = '';
  });
  show('s1');
}
|
| 524 |
+
</script>
|
| 525 |
+
</body>
|
| 526 |
+
</html>
|
design/mockups/helper/index.html
ADDED
|
@@ -0,0 +1,517 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="es">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 6 |
+
<title>ScrubData — tu ayudante de listas</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root{
|
| 9 |
+
--paper:#fbf4e7;
|
| 10 |
+
--paper-2:#f4e9d4;
|
| 11 |
+
--card:#fffdf8;
|
| 12 |
+
--ink:#4a3b2e;
|
| 13 |
+
--ink-soft:#6f5d49;
|
| 14 |
+
--line:#e6d6b8;
|
| 15 |
+
--accent:#e07a3f; /* warm terracotta */
|
| 16 |
+
--accent-soft:#f6c89a;
|
| 17 |
+
--leaf:#6e8a5a; /* trail green */
|
| 18 |
+
--leaf-soft:#dfe7cf;
|
| 19 |
+
--sky:#8fb0c4;
|
| 20 |
+
--shadow:0 10px 28px rgba(120,90,50,.14);
|
| 21 |
+
--shadow-sm:0 4px 12px rgba(120,90,50,.10);
|
| 22 |
+
--radius:22px;
|
| 23 |
+
--font: "Segoe UI", "Helvetica Neue", system-ui, -apple-system, "Trebuchet MS", sans-serif;
|
| 24 |
+
}
|
| 25 |
+
*{box-sizing:border-box;}
|
| 26 |
+
html,body{margin:0;padding:0;}
|
| 27 |
+
body{
|
| 28 |
+
font-family:var(--font);
|
| 29 |
+
color:var(--ink);
|
| 30 |
+
background:
|
| 31 |
+
radial-gradient(1200px 600px at 80% -10%, #fdf6e8 0%, rgba(253,246,232,0) 60%),
|
| 32 |
+
radial-gradient(900px 500px at 0% 100%, #f6ecd6 0%, rgba(246,236,214,0) 55%),
|
| 33 |
+
var(--paper);
|
| 34 |
+
-webkit-font-smoothing:antialiased;
|
| 35 |
+
line-height:1.5;
|
| 36 |
+
min-height:100vh;
|
| 37 |
+
}
|
| 38 |
+
/* tiny hand-drawn paper texture via repeating soft dots */
|
| 39 |
+
body::before{
|
| 40 |
+
content:"";position:fixed;inset:0;pointer-events:none;z-index:0;
|
| 41 |
+
background-image:radial-gradient(rgba(180,150,100,.06) 1px, transparent 1px);
|
| 42 |
+
background-size:22px 22px;
|
| 43 |
+
}
|
| 44 |
+
.wrap{position:relative;z-index:1;max-width:880px;margin:0 auto;padding:26px 20px 80px;}
|
| 45 |
+
|
| 46 |
+
/* ---- top bar ---- */
|
| 47 |
+
.topbar{display:flex;align-items:center;justify-content:space-between;margin-bottom:18px;}
|
| 48 |
+
.brand{display:flex;align-items:center;gap:11px;font-weight:800;font-size:20px;letter-spacing:.2px;}
|
| 49 |
+
.brand .logo{
|
| 50 |
+
width:40px;height:40px;border-radius:14px;
|
| 51 |
+
background:linear-gradient(150deg,var(--accent),#f0a05f);
|
| 52 |
+
display:grid;place-items:center;color:#fff;font-size:20px;
|
| 53 |
+
box-shadow:var(--shadow-sm);transform:rotate(-4deg);
|
| 54 |
+
}
|
| 55 |
+
.brand small{display:block;font-weight:600;font-size:12px;color:var(--ink-soft);letter-spacing:0;}
|
| 56 |
+
.lang{
|
| 57 |
+
display:flex;background:var(--card);border:1.5px solid var(--line);
|
| 58 |
+
border-radius:999px;padding:4px;box-shadow:var(--shadow-sm);font-weight:700;font-size:13px;
|
| 59 |
+
}
|
| 60 |
+
.lang button{
|
| 61 |
+
border:0;background:transparent;color:var(--ink-soft);
|
| 62 |
+
padding:6px 14px;border-radius:999px;cursor:pointer;font:inherit;font-weight:700;
|
| 63 |
+
}
|
| 64 |
+
.lang button.on{background:var(--accent);color:#fff;}
|
| 65 |
+
|
| 66 |
+
/* ---- persistent safety ribbon ---- */
|
| 67 |
+
.safety{
|
| 68 |
+
display:flex;align-items:center;gap:10px;
|
| 69 |
+
background:var(--leaf-soft);color:#41522f;
|
| 70 |
+
border:1.5px solid #cdd9bb;border-radius:999px;
|
| 71 |
+
padding:9px 16px;font-size:14px;font-weight:600;margin-bottom:24px;
|
| 72 |
+
box-shadow:var(--shadow-sm);
|
| 73 |
+
}
|
| 74 |
+
.safety .dot{font-size:16px;}
|
| 75 |
+
|
| 76 |
+
/* ---- card base ---- */
|
| 77 |
+
.card{
|
| 78 |
+
background:var(--card);border:1.5px solid var(--line);
|
| 79 |
+
border-radius:var(--radius);box-shadow:var(--shadow);
|
| 80 |
+
padding:30px;margin-bottom:22px;
|
| 81 |
+
}
|
| 82 |
+
h1{font-size:30px;margin:.1em 0 .25em;line-height:1.2;}
|
| 83 |
+
h2{font-size:22px;margin:.1em 0 .5em;}
|
| 84 |
+
.lead{font-size:18px;color:var(--ink-soft);margin:0 0 6px;}
|
| 85 |
+
|
| 86 |
+
/* ---- screen toggling ---- */
|
| 87 |
+
.screen{display:none;}
|
| 88 |
+
.screen.active{display:block;animation:fade .5s ease;}
|
| 89 |
+
@keyframes fade{from{opacity:0;transform:translateY(8px);}to{opacity:1;transform:none;}}
|
| 90 |
+
|
| 91 |
+
/* ---- step pills ---- */
|
| 92 |
+
.steps{display:flex;gap:8px;justify-content:center;margin-bottom:20px;flex-wrap:wrap;}
|
| 93 |
+
.steps .pill{
|
| 94 |
+
font-size:12.5px;font-weight:700;color:var(--ink-soft);
|
| 95 |
+
background:var(--paper-2);border:1.5px solid var(--line);
|
| 96 |
+
padding:6px 13px;border-radius:999px;cursor:pointer;transition:.2s;
|
| 97 |
+
}
|
| 98 |
+
.steps .pill.on{background:var(--accent);color:#fff;border-color:var(--accent);}
|
| 99 |
+
|
| 100 |
+
/* ---- drop zone ---- */
|
| 101 |
+
.drop{
|
| 102 |
+
border:2.5px dashed var(--accent-soft);border-radius:26px;
|
| 103 |
+
background:linear-gradient(180deg,#fffdf7,#fdf3e2);
|
| 104 |
+
padding:46px 24px;text-align:center;cursor:pointer;transition:.2s;
|
| 105 |
+
}
|
| 106 |
+
.drop:hover{border-color:var(--accent);transform:translateY(-2px);box-shadow:var(--shadow);}
|
| 107 |
+
.drop .big{font-size:54px;line-height:1;margin-bottom:10px;}
|
| 108 |
+
.drop .title{font-size:21px;font-weight:800;margin-bottom:4px;}
|
| 109 |
+
.drop .sub{color:var(--ink-soft);font-size:15px;}
|
| 110 |
+
.file-chip{
|
| 111 |
+
display:inline-flex;align-items:center;gap:9px;margin-top:18px;
|
| 112 |
+
background:var(--leaf-soft);border:1.5px solid #cdd9bb;border-radius:14px;
|
| 113 |
+
padding:9px 15px;font-weight:700;font-size:14.5px;color:#41522f;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
/* ---- big friendly button ---- */
|
| 117 |
+
.btn{
|
| 118 |
+
border:0;cursor:pointer;font:inherit;font-weight:800;font-size:18px;
|
| 119 |
+
background:linear-gradient(150deg,var(--accent),#ef9a55);color:#fff;
|
| 120 |
+
padding:16px 30px;border-radius:18px;box-shadow:0 8px 18px rgba(224,122,63,.30);
|
| 121 |
+
transition:.15s;display:inline-flex;align-items:center;gap:10px;
|
| 122 |
+
}
|
| 123 |
+
.btn:hover{transform:translateY(-2px);box-shadow:0 12px 22px rgba(224,122,63,.38);}
|
| 124 |
+
.btn.ghost{
|
| 125 |
+
background:var(--card);color:var(--ink);border:1.5px solid var(--line);
|
| 126 |
+
box-shadow:var(--shadow-sm);font-size:15px;padding:12px 20px;
|
| 127 |
+
}
|
| 128 |
+
.btn.ghost:hover{box-shadow:var(--shadow-sm);}
|
| 129 |
+
.center{text-align:center;}
|
| 130 |
+
.mt{margin-top:22px;}
|
| 131 |
+
|
| 132 |
+
/* ---- working state ---- */
|
| 133 |
+
.work{text-align:center;padding:20px 10px 6px;}
|
| 134 |
+
.pot{font-size:64px;display:inline-block;animation:stir 1.6s ease-in-out infinite;}
|
| 135 |
+
@keyframes stir{0%,100%{transform:rotate(-6deg);}50%{transform:rotate(6deg);}}
|
| 136 |
+
.progress{height:14px;background:var(--paper-2);border-radius:999px;overflow:hidden;margin:22px auto;max-width:430px;border:1.5px solid var(--line);}
|
| 137 |
+
.progress > i{display:block;height:100%;width:0;background:linear-gradient(90deg,var(--accent),var(--leaf));border-radius:999px;animation:fill 3.4s ease forwards;}
|
| 138 |
+
@keyframes fill{to{width:100%;}}
|
| 139 |
+
.work-note{color:var(--ink-soft);font-size:15px;min-height:22px;}
|
| 140 |
+
|
| 141 |
+
/* ---- summary hero ---- */
|
| 142 |
+
.badge-row{display:flex;align-items:center;gap:16px;flex-wrap:wrap;margin-bottom:6px;}
|
| 143 |
+
.merit{
|
| 144 |
+
width:78px;height:78px;flex:none;border-radius:50%;
|
| 145 |
+
background:radial-gradient(circle at 50% 35%,#fbe2c2,#f0b277);
|
| 146 |
+
border:3px dashed #d98b4e;display:grid;place-items:center;
|
| 147 |
+
color:#7a4a1f;font-size:30px;box-shadow:var(--shadow-sm);transform:rotate(-5deg);
|
| 148 |
+
}
|
| 149 |
+
.summary-list{margin:18px 0 4px;padding:0;list-style:none;display:grid;gap:12px;}
|
| 150 |
+
.summary-list li{
|
| 151 |
+
display:flex;gap:13px;align-items:flex-start;font-size:16.5px;
|
| 152 |
+
background:var(--paper);border:1.5px solid var(--line);border-radius:16px;padding:13px 16px;
|
| 153 |
+
}
|
| 154 |
+
.summary-list .ic{font-size:22px;flex:none;line-height:1.2;}
|
| 155 |
+
.summary-list b{color:var(--ink);}
|
| 156 |
+
|
| 157 |
+
/* ---- change cards (before/after) ---- */
|
| 158 |
+
.change{
|
| 159 |
+
border:1.5px solid var(--line);border-radius:18px;background:var(--card);
|
| 160 |
+
padding:18px 18px 16px;margin-bottom:16px;box-shadow:var(--shadow-sm);
|
| 161 |
+
}
|
| 162 |
+
.change .head{font-weight:800;font-size:17px;margin-bottom:4px;display:flex;align-items:center;gap:9px;}
|
| 163 |
+
.change .say{color:var(--ink-soft);font-size:14.5px;margin-bottom:14px;}
|
| 164 |
+
.ba{display:grid;grid-template-columns:1fr auto 1fr;gap:12px;align-items:center;}
|
| 165 |
+
.ba .col{background:var(--paper);border:1.5px solid var(--line);border-radius:14px;padding:12px 14px;}
|
| 166 |
+
.ba .lab{font-size:11.5px;font-weight:800;letter-spacing:.5px;text-transform:uppercase;color:var(--ink-soft);margin-bottom:7px;}
|
| 167 |
+
.ba .col.after{background:var(--leaf-soft);border-color:#cdd9bb;}
|
| 168 |
+
.ba .row{font-size:15px;padding:3px 0;color:var(--ink);}
|
| 169 |
+
.ba .row.dim{color:#a98f6e;}
|
| 170 |
+
.ba .arrow{font-size:26px;color:var(--accent);text-align:center;}
|
| 171 |
+
|
| 172 |
+
/* gentle confirm card */
|
| 173 |
+
.ask{
|
| 174 |
+
border:1.5px solid var(--accent-soft);background:linear-gradient(180deg,#fffaf2,#fdf1e0);
|
| 175 |
+
border-radius:18px;padding:18px;margin-bottom:16px;box-shadow:var(--shadow-sm);
|
| 176 |
+
}
|
| 177 |
+
.ask .q{font-weight:800;font-size:17px;margin-bottom:5px;display:flex;gap:9px;align-items:center;}
|
| 178 |
+
.ask .detail{color:var(--ink-soft);font-size:14.5px;margin-bottom:14px;}
|
| 179 |
+
.ask .actions{display:flex;gap:10px;flex-wrap:wrap;}
|
| 180 |
+
.yes{background:var(--leaf);color:#fff;border:0;font-weight:800;border-radius:13px;padding:11px 20px;cursor:pointer;font:inherit;font-weight:800;}
|
| 181 |
+
.no{background:var(--card);color:var(--ink);border:1.5px solid var(--line);font-weight:700;border-radius:13px;padding:11px 20px;cursor:pointer;font:inherit;font-weight:700;}
|
| 182 |
+
.answered{font-weight:800;color:var(--leaf);font-size:15px;display:none;align-items:center;gap:8px;margin-top:4px;}
|
| 183 |
+
|
| 184 |
+
/* honest flags */
|
| 185 |
+
.flags{background:#fcf6ea;border:1.5px dashed #e0c9a0;border-radius:18px;padding:18px;margin-bottom:16px;}
|
| 186 |
+
.flags .q{font-weight:800;font-size:16.5px;margin-bottom:8px;display:flex;gap:9px;align-items:center;}
|
| 187 |
+
.flags ul{margin:6px 0 0;padding-left:4px;list-style:none;}
|
| 188 |
+
.flags li{font-size:14.5px;color:var(--ink-soft);padding:6px 0;border-top:1px dashed #e7d6b6;}
|
| 189 |
+
.flags li:first-child{border-top:0;}
|
| 190 |
+
|
| 191 |
+
/* bonus card */
|
| 192 |
+
.bonus{
|
| 193 |
+
background:linear-gradient(150deg,#eef3e3,#e3ecd2);border:1.5px solid #cdd9bb;
|
| 194 |
+
border-radius:18px;padding:20px;margin-bottom:16px;display:flex;gap:15px;align-items:center;
|
| 195 |
+
}
|
| 196 |
+
.bonus .em{font-size:42px;flex:none;}
|
| 197 |
+
.bonus .t{font-weight:800;font-size:17px;color:#3f5230;margin-bottom:3px;}
|
| 198 |
+
.bonus .d{color:#4f6240;font-size:14.5px;}
|
| 199 |
+
|
| 200 |
+
/* download band */
|
| 201 |
+
.download{
|
| 202 |
+
text-align:center;background:linear-gradient(180deg,#fffdf7,#fdf2e1);
|
| 203 |
+
border:1.5px solid var(--line);border-radius:20px;padding:26px 20px;margin-bottom:8px;
|
| 204 |
+
}
|
| 205 |
+
.download .small{color:var(--ink-soft);font-size:13.5px;margin-top:12px;}
|
| 206 |
+
|
| 207 |
+
.section-title{font-size:14px;font-weight:800;letter-spacing:.6px;text-transform:uppercase;color:var(--ink-soft);margin:26px 4px 12px;}
|
| 208 |
+
|
| 209 |
+
.footnote{text-align:center;color:var(--ink-soft);font-size:13px;margin-top:30px;}
|
| 210 |
+
@media(max-width:560px){
|
| 211 |
+
.ba{grid-template-columns:1fr;}
|
| 212 |
+
.ba .arrow{transform:rotate(90deg);}
|
| 213 |
+
h1{font-size:25px;}
|
| 214 |
+
}
|
| 215 |
+
</style>
|
| 216 |
+
</head>
|
| 217 |
+
<body>
|
| 218 |
+
<div class="wrap">
|
| 219 |
+
|
| 220 |
+
<!-- TOP BAR -->
|
| 221 |
+
<div class="topbar">
|
| 222 |
+
<div class="brand">
|
| 223 |
+
<span class="logo">🧺</span>
|
| 224 |
+
<span>ScrubData<small data-es="tu ayudante de listas" data-en="your list helper">tu ayudante de listas</small></span>
|
| 225 |
+
</div>
|
| 226 |
+
<div class="lang">
|
| 227 |
+
<button class="on" onclick="setLang('es',this)">Español</button>
|
| 228 |
+
<button onclick="setLang('en',this)">English</button>
|
| 229 |
+
</div>
|
| 230 |
+
</div>
|
| 231 |
+
|
| 232 |
+
<!-- PERSISTENT SAFETY RIBBON -->
|
| 233 |
+
<div class="safety">
|
| 234 |
+
<span class="dot">🌿</span>
|
| 235 |
+
<span data-es="Tu archivo original no se toca. Nada sale de esta computadora. Siempre puedes dejarlo como estaba."
|
| 236 |
+
data-en="Your original file stays exactly as it is. Nothing leaves this computer. You can always put it back the way it was.">
|
| 237 |
+
Tu archivo original no se toca. Nada sale de esta computadora. Siempre puedes dejarlo como estaba.
|
| 238 |
+
</span>
|
| 239 |
+
</div>
|
| 240 |
+
|
| 241 |
+
<!-- STEP PILLS (let reviewer walk the arc) -->
|
| 242 |
+
<div class="steps">
|
| 243 |
+
<span class="pill on" onclick="go(0,this)" data-es="1 · Bienvenida" data-en="1 · Welcome">1 · Bienvenida</span>
|
| 244 |
+
<span class="pill" onclick="go(1,this)" data-es="2 · Acomodando" data-en="2 · Tidying">2 · Acomodando</span>
|
| 245 |
+
<span class="pill" onclick="go(2,this)" data-es="3 · Lo que encontré" data-en="3 · What I found">3 · Lo que encontré</span>
|
| 246 |
+
</div>
|
| 247 |
+
|
| 248 |
+
<!-- ============ SCREEN 1 — WELCOME + DROP ============ -->
|
| 249 |
+
<section class="screen active" id="s0">
|
| 250 |
+
<div class="card">
|
| 251 |
+
<h1 data-es="Hola, Lupita. Vamos a poner tu lista bonita. 🌼"
|
| 252 |
+
data-en="Hi, Lupita. Let's make your list nice and tidy. 🌼">
|
| 253 |
+
Hola, Lupita. Vamos a poner tu lista bonita. 🌼
|
| 254 |
+
</h1>
|
| 255 |
+
<p class="lead" data-es="Sube tu archivo y yo lo reviso contigo, despacito. Sin botones raros, sin configurar nada."
|
| 256 |
+
data-en="Drop your file and I'll look through it with you, nice and slow. No strange buttons, nothing to set up.">
|
| 257 |
+
Sube tu archivo y yo lo reviso contigo, despacito. Sin botones raros, sin configurar nada.
|
| 258 |
+
</p>
|
| 259 |
+
|
| 260 |
+
<div class="drop" onclick="go(1)">
|
| 261 |
+
<div class="big">📂</div>
|
| 262 |
+
<div class="title" data-es="Suelta tu archivo aquí — yo le echo un ojo."
|
| 263 |
+
data-en="Drop your file here — I'll take a look.">Suelta tu archivo aquí — yo le echo un ojo.</div>
|
| 264 |
+
<div class="sub" data-es="Excel o CSV está bien. Tu original se queda igualito."
|
| 265 |
+
data-en="Excel or CSV is fine. Your original stays exactly as it is.">Excel o CSV está bien. Tu original se queda igualito.</div>
|
| 266 |
+
<div class="file-chip">📄 resumen-del-mes-mayo.xlsx</div>
|
| 267 |
+
</div>
|
| 268 |
+
|
| 269 |
+
<div class="center mt">
|
| 270 |
+
<button class="btn" onclick="go(1)">
|
| 271 |
+
<span>🧽</span><span data-es="Acomódalo por mí" data-en="Clean it up">Acomódalo por mí</span>
|
| 272 |
+
</button>
|
| 273 |
+
</div>
|
| 274 |
+
</div>
|
| 275 |
+
</section>
|
| 276 |
+
|
| 277 |
+
<!-- ============ SCREEN 2 — WORKING ============ -->
|
| 278 |
+
<section class="screen" id="s1">
|
| 279 |
+
<div class="card work">
|
| 280 |
+
<div class="pot">🍲</div>
|
| 281 |
+
<h2 data-es="Estoy acomodando tu lista…" data-en="I'm tidying your list…">Estoy acomodando tu lista…</h2>
|
| 282 |
+
<div class="progress"><i></i></div>
|
| 283 |
+
<p class="work-note" id="workNote"
|
| 284 |
+
data-es="Trabajando aquí mismo, en tu computadora. Tu original está a salvo."
|
| 285 |
+
data-en="Working right here on your computer. Your original is safe.">
|
| 286 |
+
Trabajando aquí mismo, en tu computadora. Tu original está a salvo.
|
| 287 |
+
</p>
|
| 288 |
+
<div class="center mt">
|
| 289 |
+
<button class="btn ghost" onclick="go(2)" data-es="Ver lo que encontré →" data-en="See what I found →">Ver lo que encontré →</button>
|
| 290 |
+
</div>
|
| 291 |
+
</div>
|
| 292 |
+
</section>
|
| 293 |
+
|
| 294 |
+
<!-- ============ SCREEN 3 — RESULT ============ -->
|
| 295 |
+
<section class="screen" id="s2">
|
| 296 |
+
|
| 297 |
+
<!-- SUMMARY HERO -->
|
| 298 |
+
<div class="card">
|
| 299 |
+
<div class="badge-row">
|
| 300 |
+
<div class="merit">🏅</div>
|
| 301 |
+
<div>
|
| 302 |
+
<h1 style="margin:0" data-es="¡Listo! Tu lista quedó bien bonita."
|
| 303 |
+
data-en="All done! Your list is in great shape.">¡Listo! Tu lista quedó bien bonita.</h1>
|
| 304 |
+
<p class="lead" style="margin:2px 0 0" data-es="Esto fue lo que acomodé por ti — léelo en voz alta a Yolanda si quieres."
|
| 305 |
+
data-en="Here's what I tidied for you — read it out loud to Yolanda if you like.">
|
| 306 |
+
Esto fue lo que acomodé por ti — léelo en voz alta a Yolanda si quieres.
|
| 307 |
+
</p>
|
| 308 |
+
</div>
|
| 309 |
+
</div>
|
| 310 |
+
|
| 311 |
+
<ul class="summary-list">
|
| 312 |
+
<li><span class="ic">🌮</span><span data-es="<b>“Al pastor”</b> estaba escrito de 4 maneras. Los junté: <b>1,204 vendidos</b> en mayo."
|
| 313 |
+
data-en="<b>“Al pastor”</b> was written 4 different ways. I counted them together: <b>1,204 sold</b> in May.">
|
| 314 |
+
<b>“Al pastor”</b> estaba escrito de 4 maneras. Los junté: <b>1,204 vendidos</b> en mayo.</span></li>
|
| 315 |
+
<li><span class="ic">👥</span><span data-es="<b>3 clientes</b> aparecían dos veces. Los reuní para que los revises."
|
| 316 |
+
data-en="<b>3 customers</b> showed up twice. I gathered each one for you to check.">
|
| 317 |
+
<b>3 clientes</b> aparecían dos veces. Los reuní para que los revises.</span></li>
|
| 318 |
+
<li><span class="ic">📞</span><span data-es="Acomodé <b>todos los teléfonos</b> para que se lean igualito."
|
| 319 |
+
data-en="I made <b>all the phone numbers</b> match so they're easy to read.">
|
| 320 |
+
Acomodé <b>todos los teléfonos</b> para que se lean igualito.</span></li>
|
| 321 |
+
<li><span class="ic">🗓️</span><span data-es="Puse <b>todas las fechas</b> escritas de la misma forma."
|
| 322 |
+
data-en="I made <b>all the dates</b> written the same way.">
|
| 323 |
+
Puse <b>todas las fechas</b> escritas de la misma forma.</span></li>
|
| 324 |
+
<li><span class="ic">⬜</span><span data-es="Algunos espacios decían “N/A” o un guion. Los dejé como <b>vacíos</b>."
|
| 325 |
+
data-en="Some spots said “N/A” or just a dash. I treated those as <b>empty</b>.">
|
| 326 |
+
Algunos espacios decían “N/A” o un guion. Los dejé como <b>vacíos</b>.</span></li>
|
| 327 |
+
</ul>
|
| 328 |
+
</div>
|
| 329 |
+
|
| 330 |
+
<!-- CHANGE CARDS (story, not diff) -->
|
| 331 |
+
<div class="section-title" data-es="Aquí está lo que cambió — antes y después" data-en="Here's what changed — before and after">
|
| 332 |
+
Aquí está lo que cambió — antes y después
|
| 333 |
+
</div>
|
| 334 |
+
|
| 335 |
+
<div class="change">
|
| 336 |
+
<div class="head">🌮 <span data-es="El mismo taco, escrito de varias maneras" data-en="The same taco, written a few ways">El mismo taco, escrito de varias maneras</span></div>
|
| 337 |
+
<div class="say" data-es="La computadora por fin entiende que es el mismo taco. Los conté juntos."
|
| 338 |
+
data-en="The computer finally understands it's the same taco. I counted them together.">
|
| 339 |
+
La computadora por fin entiende que es el mismo taco. Los conté juntos.</div>
|
| 340 |
+
<div class="ba">
|
| 341 |
+
<div class="col">
|
| 342 |
+
<div class="lab" data-es="Antes" data-en="Before">Antes</div>
|
| 343 |
+
<div class="row dim">al pastor</div>
|
| 344 |
+
<div class="row dim">Al Pastor</div>
|
| 345 |
+
<div class="row dim">pastor</div>
|
| 346 |
+
<div class="row dim">tacos al pastor</div>
|
| 347 |
+
</div>
|
| 348 |
+
<div class="arrow">→</div>
|
| 349 |
+
<div class="col after">
|
| 350 |
+
<div class="lab" data-es="Después" data-en="After">Después</div>
|
| 351 |
+
<div class="row"><b>Al pastor</b></div>
|
| 352 |
+
<div class="row" data-es="1,204 vendidos" data-en="1,204 sold">1,204 vendidos</div>
|
| 353 |
+
</div>
|
| 354 |
+
</div>
|
| 355 |
+
</div>
|
| 356 |
+
|
| 357 |
+
<div class="change">
|
| 358 |
+
<div class="head">📞 <span data-es="Los teléfonos, todos parejitos" data-en="Phone numbers, all matching">Los teléfonos, todos parejitos</span></div>
|
| 359 |
+
<div class="say" data-es="Los dejé escritos igual para que sean fáciles de leer y marcar."
|
| 360 |
+
data-en="I made them all match so they're easy to read and dial.">
|
| 361 |
+
Los dejé escritos igual para que sean fáciles de leer y marcar.</div>
|
| 362 |
+
<div class="ba">
|
| 363 |
+
<div class="col">
|
| 364 |
+
<div class="lab" data-es="Antes" data-en="Before">Antes</div>
|
| 365 |
+
<div class="row dim">55-1234.5678</div>
|
| 366 |
+
<div class="row dim">(55) 12345678</div>
|
| 367 |
+
<div class="row dim">5512345678</div>
|
| 368 |
+
</div>
|
| 369 |
+
<div class="arrow">→</div>
|
| 370 |
+
<div class="col after">
|
| 371 |
+
<div class="lab" data-es="Después" data-en="After">Después</div>
|
| 372 |
+
<div class="row"><b>55 1234 5678</b></div>
|
| 373 |
+
</div>
|
| 374 |
+
</div>
|
| 375 |
+
</div>
|
| 376 |
+
|
| 377 |
+
<!-- GENTLE CONFIRM — money -->
|
| 378 |
+
<div class="ask" id="ask1">
|
| 379 |
+
<div class="q">💵 <span data-es="¿Dejo fuera del total las filas de $0.00?" data-en="Leave the $0.00 rows out of the total?">¿Dejo fuera del total las filas de $0.00?</span></div>
|
| 380 |
+
<div class="detail" data-es="Encontré <b>31 filas que marcan $0.00</b>. Eso parece un error del sistema, no una venta. Tú decides — yo no toco el dinero sin preguntarte."
|
| 381 |
+
data-en="I found <b>31 rows showing $0.00</b>. That looks like a glitch, not a sale. You decide — I won't touch money without asking.">
|
| 382 |
+
Encontré <b>31 filas que marcan $0.00</b>. Eso parece un error del sistema, no una venta. Tú decides — yo no toco el dinero sin preguntarte.</div>
|
| 383 |
+
<div class="actions">
|
| 384 |
+
<button class="yes" onclick="answer('ask1')" data-es="Sí, déjalas fuera" data-en="Yes, leave them out">Sí, déjalas fuera</button>
|
| 385 |
+
<button class="no" onclick="answer('ask1')" data-es="No, déjalas" data-en="No, keep them">No, déjalas</button>
|
| 386 |
+
</div>
|
| 387 |
+
<div class="answered" id="ans-ask1">✓ <span data-es="Anotado. Tú mandas." data-en="Got it. You're in charge.">Anotado. Tú mandas.</span></div>
|
| 388 |
+
</div>
|
| 389 |
+
|
| 390 |
+
<!-- GENTLE CONFIRM — duplicates -->
|
| 391 |
+
<div class="ask" id="ask2">
|
| 392 |
+
<div class="q">👥 <span data-es="¿Estos dos son la misma persona?" data-en="Are these two the same person?">¿Estos dos son la misma persona?</span></div>
|
| 393 |
+
<div class="detail" data-es="<b>“Yolanda Pérez”</b> y <b>“Yola Perez”</b> tienen el mismo teléfono. ¿Los junto en uno solo?"
|
| 394 |
+
data-en="<b>“Yolanda Pérez”</b> and <b>“Yola Perez”</b> share the same phone. Shall I count them as one?">
|
| 395 |
+
<b>“Yolanda Pérez”</b> y <b>“Yola Perez”</b> tienen el mismo teléfono. ¿Los junto en uno solo?</div>
|
| 396 |
+
<div class="actions">
|
| 397 |
+
<button class="yes" onclick="answer('ask2')" data-es="Sí, es la misma" data-en="Yes, same person">Sí, es la misma</button>
|
| 398 |
+
<button class="no" onclick="answer('ask2')" data-es="No, déjalas aparte" data-en="No, keep separate">No, déjalas aparte</button>
|
| 399 |
+
</div>
|
| 400 |
+
<div class="answered" id="ans-ask2">✓ <span data-es="Listo, como tú digas." data-en="Done, as you say.">Listo, como tú digas.</span></div>
|
| 401 |
+
</div>
|
| 402 |
+
|
| 403 |
+
<!-- HONEST FLAGS -->
|
| 404 |
+
<div class="flags">
|
| 405 |
+
<div class="q">🤔 <span data-es="De estas no estuve segura — te las dejé para que las veas" data-en="I wasn't sure about these — I left them for you">De estas no estuve segura — te las dejé para que las veas</span></div>
|
| 406 |
+
<ul>
|
| 407 |
+
<li data-es="Dos teléfonos tienen muy pocos números. No los cambié por si tú los conoces."
|
| 408 |
+
data-en="Two phone numbers have too few digits. I didn't change them in case you know them.">
|
| 409 |
+
Dos teléfonos tienen muy pocos números. No los cambié por si tú los conoces.</li>
|
| 410 |
+
<li data-es="Las notas del catering (“fiesta Sra. Mendoza”) no las entendí bien. Las dejé tal cual."
|
| 411 |
+
data-en="The catering notes (“Mrs. Mendoza's party”) I didn't quite understand. I left them as they were.">
|
| 412 |
+
Las notas del catering (“fiesta Sra. Mendoza”) no las entendí bien. Las dejé tal cual.</li>
|
| 413 |
+
<li data-es="El total de mayo y la suma de las filas no cuadran por $84. Aquí te lo marco para que lo cheques con tu caja."
|
| 414 |
+
data-en="May's total and the rows add up $84 apart. I'm flagging it so you can check it against your cash.">
|
| 415 |
+
El total de mayo y la suma de las filas no cuadran por $84. Aquí te lo marco para que lo cheques con tu caja.</li>
|
| 416 |
+
</ul>
|
| 417 |
+
</div>
|
| 418 |
+
|
| 419 |
+
<!-- BONUS CARD -->
|
| 420 |
+
<div class="bonus">
|
| 421 |
+
<div class="em">🫙</div>
|
| 422 |
+
<div>
|
| 423 |
+
<div class="t" data-es="De pasada: se te está acabando el adobo de pastor"
|
| 424 |
+
data-en="By the way: you're running low on pastor marinade">De pasada: se te está acabando el adobo de pastor</div>
|
| 425 |
+
<div class="d" data-es="Con lo que vendiste, te alcanza para unos 6 días. Buen momento para pedir más."
|
| 426 |
+
data-en="At this pace you have about 6 days left. Good time to reorder.">
|
| 427 |
+
Con lo que vendiste, te alcanza para unos 6 días. Buen momento para pedir más.</div>
|
| 428 |
+
</div>
|
| 429 |
+
</div>
|
| 430 |
+
|
| 431 |
+
<!-- DOWNLOAD BAND -->
|
| 432 |
+
<div class="download">
|
| 433 |
+
<button class="btn" onclick="return false">
|
| 434 |
+
<span>💾</span><span data-es="Dame mi copia limpia" data-en="Get my clean copy">Dame mi copia limpia</span>
|
| 435 |
+
</button>
|
| 436 |
+
<div style="margin-top:14px;">
|
| 437 |
+
<button class="btn ghost" onclick="return false" data-es="🖨️ Imprimir el resumen en palabras sencillas" data-en="🖨️ Print the plain-words summary">
|
| 438 |
+
🖨️ Imprimir el resumen en palabras sencillas</button>
|
| 439 |
+
</div>
|
| 440 |
+
<div class="small" data-es="Tu original (resumen-del-mes-mayo.xlsx) sigue intacto. Esto es una copia nueva."
|
| 441 |
+
data-en="Your original (resumen-del-mes-mayo.xlsx) is untouched. This is a fresh new copy.">
|
| 442 |
+
Tu original (resumen-del-mes-mayo.xlsx) sigue intacto. Esto es una copia nueva.</div>
|
| 443 |
+
</div>
|
| 444 |
+
|
| 445 |
+
<!-- REVERSIBILITY -->
|
| 446 |
+
<div class="center mt">
|
| 447 |
+
<button class="btn ghost" onclick="return false" data-es="↩️ Mejor déjalo como estaba" data-en="↩️ Put it back the way it was">↩️ Mejor déjalo como estaba</button>
|
| 448 |
+
</div>
|
| 449 |
+
|
| 450 |
+
<div class="footnote" data-es="Lo hiciste tú misma, y está bien. 🌙 Buenas noches, Lupita."
|
| 451 |
+
data-en="You did it yourself, and it's right. 🌙 Goodnight, Lupita.">
|
| 452 |
+
Lo hiciste tú misma, y está bien. 🌙 Buenas noches, Lupita.
|
| 453 |
+
</div>
|
| 454 |
+
</section>
|
| 455 |
+
|
| 456 |
+
</div>
|
| 457 |
+
|
| 458 |
+
<script>
|
| 459 |
+
var screens = ['s0','s1','s2'];
var pills = document.querySelectorAll('.steps .pill');

/**
 * Switch the mockup to screen number `i`.
 *
 * Exactly one element whose id is listed in `screens` ends up with the
 * `active` class, and the `.steps .pill` element at the same position gets
 * the `on` class. The page then scrolls smoothly back to the top, and
 * entering screen 1 starts the working-state animation via runWork().
 *
 * @param {number} i  Index into `screens` of the screen to show.
 * @param {*} [el]    Accepted for the caller's convenience; never read here.
 */
function go(i, el){
  // Screens: only the one at position i stays active.
  for (var sIdx = 0; sIdx < screens.length; sIdx++) {
    var screenEl = document.getElementById(screens[sIdx]);
    screenEl.classList.toggle('active', sIdx === i);
  }

  // Step pills mirror the screen selection by position.
  for (var pIdx = 0; pIdx < pills.length; pIdx++) {
    pills[pIdx].classList.toggle('on', pIdx === i);
  }

  window.scrollTo({top:0,behavior:'smooth'});

  if (i === 1) {
    runWork();
  }
}
|
| 470 |
+
|
| 471 |
+
// working state: cycle reassuring notes, then auto-advance
var workTimers = [];

/**
 * Play the "working" screen: show the first reassuring note immediately,
 * swap in each following note every 900 ms, and after 3700 ms advance to
 * screen 2 — but only if screen `s1` is still the active one.
 *
 * Timers from a previous run are cancelled first, so calling this again
 * restarts the sequence instead of stacking a second one on top.
 *
 * The note language follows the global `lang` at the moment each note is
 * shown, so toggling the language while the sequence is running takes
 * effect on the next note. Any value other than 'es' falls back to English.
 */
function runWork(){
  workTimers.forEach(clearTimeout);
  workTimers = [];

  var note = document.getElementById('workNote');
  var notes = {
    es: [
      "Trabajando aquí mismo, en tu computadora. Tu original está a salvo.",
      "Estoy juntando los tacos que están escritos de varias maneras…",
      "Acomodando teléfonos y fechas para que se lean igualito…",
      "Casi listo — guardando una copia nueva, sin tocar tu original."
    ],
    en: [
      "Working right here on your computer. Your original is safe.",
      "Gathering the tacos that are written a few different ways…",
      "Tidying phone numbers and dates so they're easy to read…",
      "Almost there — saving a fresh copy, leaving your original untouched."
    ]
  };

  // Resolve the language per tick rather than once up front; otherwise a
  // language switch mid-run keeps printing notes in the old language.
  function showNote(idx){
    var list = (lang === 'es') ? notes.es : notes.en;
    note.textContent = list[idx];
  }

  showNote(0);
  for (var s = 1; s < notes.es.length; s++) {
    // bind() freezes the current index for the delayed call.
    workTimers.push(setTimeout(showNote.bind(null, s), s * 900));
  }

  // Auto-advance only if the user has not navigated away in the meantime.
  workTimers.push(setTimeout(function(){
    if (document.getElementById('s1').classList.contains('active')) go(2);
  }, 3700));
}
|
| 496 |
+
|
| 497 |
+
/**
 * Mark a confirmation card as answered.
 *
 * Hides the `.actions` button row inside the card with the given id and
 * reveals the matching acknowledgement element, whose id is the card id
 * prefixed with `ans-`. Which button was pressed is not recorded.
 *
 * @param {string} id  Element id of the card (e.g. 'ask1').
 */
function answer(id){
  var buttonRow = document.getElementById(id).querySelector('.actions');
  var acknowledgement = document.getElementById('ans-' + id);

  buttonRow.style.display = 'none';
  acknowledgement.style.display = 'flex';
}
|
| 502 |
+
|
| 503 |
+
// language toggle
var lang = 'es';

/**
 * Switch the page language.
 *
 * Records the choice in the global `lang`, moves the `on` highlight to the
 * pressed toggle button, updates the document's `lang` attribute, and
 * replaces the content of every element carrying a `data-es` attribute with
 * the text stored in its `data-<l>` attribute. Elements that have no
 * translation for `l` are left as they are.
 *
 * @param {string} l     Language code; selects the `data-<l>` attributes.
 * @param {Element} [el] Toggle button to highlight. Optional: without it the
 *                       text is still switched and no button is highlighted.
 */
function setLang(l, el){
  lang = l;

  document.querySelectorAll('.lang button').forEach(function(b){
    b.classList.remove('on');
  });
  // Guard: a call without a button used to throw here, after `lang` and the
  // highlights had already changed, leaving the page half-switched.
  if (el) {
    el.classList.add('on');
  }

  document.documentElement.lang = l;

  var attrName = 'data-' + l;
  document.querySelectorAll('[data-es]').forEach(function(node){
    var translated = node.getAttribute(attrName);
    // innerHTML, not textContent: some translations embed markup such as <b>.
    if (translated != null) node.innerHTML = translated;
  });
}
|
| 515 |
+
</script>
|
| 516 |
+
</body>
|
| 517 |
+
</html>
|
design/mockups/office/index.html
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 6 |
+
<title>ScrubData — clean spreadsheets, with the receipts</title>
|
| 7 |
+
<style>
|
| 8 |
+
:root{
|
| 9 |
+
--paper:#faf7f2; --card:#fffdfa; --ink:#23201c; --ink-soft:#6b6359;
|
| 10 |
+
--line:#ece5da; --accent:#2f6f5e; --accent-soft:#e7f1ec;
|
| 11 |
+
--done:#3f7d5f; --done-bg:#eef5ef; --done-line:#cfe3d4;
|
| 12 |
+
--call:#b06a1f; --call-bg:#fbf1e2; --call-line:#f0dcbf;
|
| 13 |
+
--flag:#7a7367; --flag-bg:#f3efe8;
|
| 14 |
+
--shadow:0 1px 2px rgba(40,30,20,.04),0 8px 24px rgba(40,30,20,.06);
|
| 15 |
+
--r:15px;
|
| 16 |
+
}
|
| 17 |
+
*{box-sizing:border-box}
|
| 18 |
+
body{margin:0;background:var(--paper);color:var(--ink);
|
| 19 |
+
font-family:Inter,-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,sans-serif;
|
| 20 |
+
line-height:1.5;-webkit-font-smoothing:antialiased}
|
| 21 |
+
.wrap{max-width:760px;margin:0 auto;padding:0 22px}
|
| 22 |
+
a{color:var(--accent)}
|
| 23 |
+
|
| 24 |
+
/* privacy ribbon */
|
| 25 |
+
.ribbon{background:var(--accent-soft);color:#234e42;font-size:13.5px;
|
| 26 |
+
text-align:center;padding:9px 16px;border-bottom:1px solid #d6e7df}
|
| 27 |
+
.ribbon b{font-weight:600}
|
| 28 |
+
|
| 29 |
+
/* header */
|
| 30 |
+
header{padding:40px 0 8px}
|
| 31 |
+
.logo{display:flex;align-items:center;gap:9px;font-weight:700;font-size:18px;letter-spacing:-.2px}
|
| 32 |
+
.logo .mark{width:26px;height:26px;border-radius:8px;background:var(--accent);
|
| 33 |
+
display:grid;place-items:center;color:#fff;font-size:15px}
|
| 34 |
+
h1{font-size:30px;line-height:1.15;letter-spacing:-.6px;margin:22px 0 8px;font-weight:740}
|
| 35 |
+
.sub{color:var(--ink-soft);font-size:16.5px;max-width:560px}
|
| 36 |
+
|
| 37 |
+
/* file chip */
|
| 38 |
+
.filebar{display:flex;align-items:center;gap:12px;margin:26px 0 6px;
|
| 39 |
+
background:var(--card);border:1px solid var(--line);border-radius:var(--r);
|
| 40 |
+
padding:14px 16px;box-shadow:var(--shadow)}
|
| 41 |
+
.fileicon{width:34px;height:34px;border-radius:9px;background:#eef4f1;color:var(--accent);
|
| 42 |
+
display:grid;place-items:center;font-size:16px;flex:none}
|
| 43 |
+
.filebar .nm{font-weight:600}
|
| 44 |
+
.filebar .meta{color:var(--ink-soft);font-size:13.5px}
|
| 45 |
+
.filebar .spacer{flex:1}
|
| 46 |
+
.pill-done-mini{font-size:12px;font-weight:600;color:var(--done);
|
| 47 |
+
background:var(--done-bg);border:1px solid var(--done-line);padding:3px 9px;border-radius:20px}
|
| 48 |
+
|
| 49 |
+
/* summary */
|
| 50 |
+
section{margin:34px 0}
|
| 51 |
+
.eyebrow{font-size:12.5px;font-weight:700;letter-spacing:.06em;text-transform:uppercase;
|
| 52 |
+
color:var(--ink-soft);margin-bottom:13px}
|
| 53 |
+
.result-h{font-size:22px;font-weight:720;letter-spacing:-.3px;margin:0 0 4px}
|
| 54 |
+
.result-sub{color:var(--ink-soft);margin:0 0 4px}
|
| 55 |
+
.summary{background:var(--card);border:1px solid var(--line);border-radius:var(--r);
|
| 56 |
+
padding:6px 20px;box-shadow:var(--shadow)}
|
| 57 |
+
.summary li{list-style:none;padding:14px 0;border-bottom:1px solid var(--line);
|
| 58 |
+
display:flex;gap:13px;align-items:flex-start;font-size:15.5px}
|
| 59 |
+
.summary li:last-child{border-bottom:0}
|
| 60 |
+
.summary .ic{flex:none;margin-top:1px;font-size:16px}
|
| 61 |
+
.summary b{font-weight:650}
|
| 62 |
+
.handoff{color:var(--call)}
|
| 63 |
+
|
| 64 |
+
/* change cards */
|
| 65 |
+
.card{background:var(--card);border:1px solid var(--line);border-left-width:4px;
|
| 66 |
+
border-radius:var(--r);padding:17px 19px;margin:13px 0;box-shadow:var(--shadow)}
|
| 67 |
+
.card.done{border-left-color:var(--done)}
|
| 68 |
+
.card.call{border-left-color:var(--call)}
|
| 69 |
+
.card.flag{border-left-color:#cdbfa6}
|
| 70 |
+
.card-top{display:flex;align-items:center;gap:10px;margin-bottom:4px}
|
| 71 |
+
.card-title{font-weight:650;font-size:15.5px}
|
| 72 |
+
.pill{font-size:11.5px;font-weight:700;letter-spacing:.04em;padding:3px 9px;border-radius:20px;margin-left:auto;flex:none}
|
| 73 |
+
.pill.done{color:var(--done);background:var(--done-bg);border:1px solid var(--done-line)}
|
| 74 |
+
.pill.call{color:var(--call);background:var(--call-bg);border:1px solid var(--call-line)}
|
| 75 |
+
.pill.flag{color:var(--flag);background:var(--flag-bg);border:1px solid #e2d9c9}
|
| 76 |
+
.card-body{color:var(--ink-soft);font-size:14.5px}
|
| 77 |
+
|
| 78 |
+
/* before/after */
|
| 79 |
+
.ba{display:grid;grid-template-columns:1fr auto 1fr;gap:10px;align-items:center;margin-top:13px}
|
| 80 |
+
.ba .col{background:#fbf9f5;border:1px solid var(--line);border-radius:11px;padding:11px 13px}
|
| 81 |
+
.ba .lab{font-size:11px;text-transform:uppercase;letter-spacing:.05em;color:var(--ink-soft);margin-bottom:6px}
|
| 82 |
+
.ba .val{font-size:13.5px;font-family:"SF Mono",ui-monospace,Menlo,monospace}
|
| 83 |
+
.ba .was{color:#9a8d7c}
|
| 84 |
+
.ba .arrow{color:var(--accent);font-size:18px;text-align:center}
|
| 85 |
+
.ba .ann{color:var(--done);font-weight:600;font-size:12.5px}
|
| 86 |
+
.strike{text-decoration:line-through;text-decoration-color:#c9bcab;color:#9a8d7c}
|
| 87 |
+
|
| 88 |
+
/* your-call buttons */
|
| 89 |
+
.actions{display:flex;gap:9px;margin-top:14px}
|
| 90 |
+
.btn{font:inherit;font-size:14px;font-weight:600;padding:9px 15px;border-radius:10px;cursor:pointer;border:1px solid var(--line);background:#fff;color:var(--ink)}
|
| 91 |
+
.btn.primary{background:var(--accent);border-color:var(--accent);color:#fff}
|
| 92 |
+
.btn.ghost{background:transparent}
|
| 93 |
+
|
| 94 |
+
/* download */
|
| 95 |
+
.download{background:linear-gradient(180deg,#fffdfa,#f7f2ea);border:1px solid var(--line);
|
| 96 |
+
border-radius:18px;padding:26px;text-align:center;box-shadow:var(--shadow)}
|
| 97 |
+
.download h3{margin:0 0 4px;font-size:19px;font-weight:720}
|
| 98 |
+
.download p{margin:0 0 18px;color:var(--ink-soft);font-size:14.5px}
|
| 99 |
+
.dl-row{display:flex;gap:11px;justify-content:center;flex-wrap:wrap}
|
| 100 |
+
.btn.big{padding:12px 22px;font-size:15px}
|
| 101 |
+
.revert{margin-top:16px;font-size:13px;color:var(--ink-soft)}
|
| 102 |
+
|
| 103 |
+
footer{padding:30px 0 50px;text-align:center;color:#9a8d7c;font-size:13px;border-top:1px solid var(--line);margin-top:36px}
|
| 104 |
+
.restart{display:inline-block;margin-top:22px;font-size:14px;color:var(--accent);font-weight:600;text-decoration:none}
|
| 105 |
+
</style>
|
| 106 |
+
</head>
|
| 107 |
+
<body>
|
| 108 |
+
|
| 109 |
+
<div class="ribbon">🔒 <b>Runs entirely on your machine.</b> Your original file is untouched — nothing is uploaded.</div>
|
| 110 |
+
|
| 111 |
+
<div class="wrap">
|
| 112 |
+
<header>
|
| 113 |
+
<div class="logo"><span class="mark">✦</span> ScrubData</div>
|
| 114 |
+
<h1>Done. Here's what changed.</h1>
|
| 115 |
+
<p class="sub">I did the tedious part — matching spellings, fixing formats, finding the blanks. Everything below is reversible, and I left the judgment calls for you.</p>
|
| 116 |
+
</header>
|
| 117 |
+
|
| 118 |
+
<div class="filebar">
|
| 119 |
+
<div class="fileicon">▦</div>
|
| 120 |
+
<div>
|
| 121 |
+
<div class="nm">crm-export-may.csv</div>
|
| 122 |
+
<div class="meta">3,840 rows · 11 columns · cleaned in 4.2s, locally</div>
|
| 123 |
+
</div>
|
| 124 |
+
<div class="spacer"></div>
|
| 125 |
+
<div class="pill-done-mini">6 fixes applied</div>
|
| 126 |
+
</div>
|
| 127 |
+
|
| 128 |
+
<!-- SUMMARY -->
|
| 129 |
+
<section>
|
| 130 |
+
<div class="eyebrow">The summary, in plain English</div>
|
| 131 |
+
<ul class="summary">
|
| 132 |
+
<li><span class="ic">🗂️</span><div><b>Unified 4 spellings of "United States"</b> (US, U.S., usa, United States) into one. 2,108 rows affected.</div></li>
|
| 133 |
+
<li><span class="ic">🏷️</span><div><b>Merged 4 ways of writing the same deal stage</b> ("Closed Won", "closed-won", "Won", "CW") into one. 1,204 rows.</div></li>
|
| 134 |
+
<li><span class="ic">⬜</span><div><b>Treated 47 disguised blanks</b> ("N/A", "none", "—") as empty, so your counts and filters behave.</div></li>
|
| 135 |
+
<li><span class="ic">📅</span><div><b>Standardized all dates to YYYY-MM-DD</b> and phone numbers to one format.</div></li>
|
| 136 |
+
<li class="handoff"><span class="ic">✋</span><div><b>2 changes touch money or identity, so I didn't make them.</b> They're below for your call.</div></li>
|
| 137 |
+
</ul>
|
| 138 |
+
</section>
|
| 139 |
+
|
| 140 |
+
<!-- DONE -->
|
| 141 |
+
<section>
|
| 142 |
+
<div class="eyebrow">Handled — already applied (and reversible)</div>
|
| 143 |
+
|
| 144 |
+
<div class="card done">
|
| 145 |
+
<div class="card-top"><span class="card-title">Same country, counted as one</span><span class="pill done">DONE</span></div>
|
| 146 |
+
<div class="card-body">Four spellings were splitting your "United States" rows across the report.</div>
|
| 147 |
+
<div class="ba">
|
| 148 |
+
<div class="col"><div class="lab">Before</div>
|
| 149 |
+
<div class="val was">US · U.S. · usa<br>United States</div></div>
|
| 150 |
+
<div class="arrow">→</div>
|
| 151 |
+
<div class="col"><div class="lab">After</div>
|
| 152 |
+
<div class="val">United States</div><div class="ann">one value · 2,108 rows</div></div>
|
| 153 |
+
</div>
|
| 154 |
+
</div>
|
| 155 |
+
|
| 156 |
+
<div class="card done">
|
| 157 |
+
<div class="card-top"><span class="card-title">Phone numbers, one format</span><span class="pill done">DONE</span></div>
|
| 158 |
+
<div class="card-body">Mixed formats standardized so lookups and dedupes line up.</div>
|
| 159 |
+
<div class="ba">
|
| 160 |
+
<div class="col"><div class="lab">Before</div>
|
| 161 |
+
<div class="val was">(415) 555.0192<br>415-555-0147<br>+1 415 555 0188</div></div>
|
| 162 |
+
<div class="arrow">→</div>
|
| 163 |
+
<div class="col"><div class="lab">After</div>
|
| 164 |
+
<div class="val">(415) 555-0192<br>(415) 555-0147<br>(415) 555-0188</div></div>
|
| 165 |
+
</div>
|
| 166 |
+
</div>
|
| 167 |
+
</section>
|
| 168 |
+
|
| 169 |
+
<!-- YOUR CALL -->
|
| 170 |
+
<section>
|
| 171 |
+
<div class="eyebrow">Needs your call — I didn't touch these</div>
|
| 172 |
+
|
| 173 |
+
<div class="card call">
|
| 174 |
+
<div class="card-top"><span class="card-title">31 deals show $0.00</span><span class="pill call">YOUR CALL</span></div>
|
| 175 |
+
<div class="card-body">Usually a sync glitch, not a real deal. Leaving them in drags your win total down. Exclude them from the total?</div>
|
| 176 |
+
<div class="actions">
|
| 177 |
+
<button class="btn primary">Leave them out</button>
|
| 178 |
+
<button class="btn ghost">Keep them</button>
|
| 179 |
+
</div>
|
| 180 |
+
</div>
|
| 181 |
+
|
| 182 |
+
<div class="card call">
|
| 183 |
+
<div class="card-top"><span class="card-title">Possible duplicate contact</span><span class="pill call">YOUR CALL</span></div>
|
| 184 |
+
<div class="card-body">"Yolanda R." and "Yolanda Reyes" share an email (y.reyes@northwind.co). Count them as one contact?</div>
|
| 185 |
+
<div class="actions">
|
| 186 |
+
<button class="btn primary">Merge them</button>
|
| 187 |
+
<button class="btn ghost">Keep both</button>
|
| 188 |
+
</div>
|
| 189 |
+
</div>
|
| 190 |
+
</section>
|
| 191 |
+
|
| 192 |
+
<!-- FLAGGED -->
|
| 193 |
+
<section>
|
| 194 |
+
<div class="eyebrow">Worth a look — left exactly as they were</div>
|
| 195 |
+
<div class="card flag">
|
| 196 |
+
<div class="card-top"><span class="card-title">3 cells I wouldn't guess at</span><span class="pill flag">FLAGGED</span></div>
|
| 197 |
+
<div class="card-body">Two phone numbers have too few digits, and one note reads <span style="font-family:ui-monospace,monospace;font-size:13px">"follow up?? — check w/ Dana"</span>. I didn't guess. Left them untouched for you to check.</div>
|
| 198 |
+
</div>
|
| 199 |
+
</section>
|
| 200 |
+
|
| 201 |
+
<!-- DOWNLOAD -->
|
| 202 |
+
<section>
|
| 203 |
+
<div class="download">
|
| 204 |
+
<h3>Your clean copy is ready</h3>
|
| 205 |
+
<p>Take the cleaned file and the change log. Both are yours to keep.</p>
|
| 206 |
+
<div class="dl-row">
|
| 207 |
+
<button class="btn primary big">↓ Download clean file</button>
|
| 208 |
+
<button class="btn big">Export change log</button>
|
| 209 |
+
</div>
|
| 210 |
+
<div class="revert">Your original is untouched. Revert any change — or all of them — whenever you want.</div>
|
| 211 |
+
</div>
|
| 212 |
+
<div style="text-align:center"><a class="restart" href="#">← Clean another file</a></div>
|
| 213 |
+
</section>
|
| 214 |
+
</div>
|
| 215 |
+
|
| 216 |
+
<footer>Runs locally. Nothing leaves your machine, ever.</footer>
|
| 217 |
+
|
| 218 |
+
</body>
|
| 219 |
+
</html>
|
docs/DATASETS.md
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dataset inventory — every source the system trains on, evaluates on, or must clean
|
| 2 |
+
|
| 3 |
+
Stage-3 consolidated registry (2026-06-11). Assignment discipline: a source is
|
| 4 |
+
TRAIN, EVAL, or BENCH — never on both sides of the train/eval split.
|
| 5 |
+
|
| 6 |
+
## Paired dirty/clean (27 — eval/paired_bench.py → docs/PAIRED_BENCH.md)
|
| 7 |
+
|
| 8 |
+
| source | origin | license | assignment | notes |
|
| 9 |
+
|---|---|---|---|---|
|
| 10 |
+
| hospital, beers, movies_1 | Raha (BigDaMa) | Apache-2.0 | TRAIN | champion mix since v6 |
|
| 11 |
+
| flights, rayyan | Raha | Apache-2.0 | EVAL (GEN) | held-out real errors |
|
| 12 |
+
| tax | Raha | Apache-2.0 | unused | numeric-heavy, huge |
|
| 13 |
+
| ed2_restaurants | BigDaMa ED2 | research | EVAL (GEN) | real NYC variants; errors past row 2k |
|
| 14 |
+
| fodors_zagats | Magellan EM | BSD-ish data | TRAIN | variant-masked EM table |
|
| 15 |
+
| dblp_acm, dblp_scholar | Magellan EM | research | BENCH only | out-of-regime (unique titles / convention-mismatch gold) |
|
| 16 |
+
| cleanml_company, cleanml_movie | CleanML | research | TRAIN | Company = org canon |
|
| 17 |
+
| gidcl_imdb | SICS-FRC GIDCL | none stated | TRAIN (v9+) | 1M-row pair; 57k errors; subset 86k rows |
|
| 18 |
+
| zeroed_billionaire, zeroed_tax100k | WelkinNi/ZeroED | none stated | BENCH | injected; rich categoricals |
|
| 19 |
+
| dgov_* (5 tables) | LUH-DBS Matelda | Apache-2.0 | BENCH | real data.gov tables, injected typos (6,692 more available) |
|
| 20 |
+
| tt_* (8 tables) | ToughTables 2T_WD | CC-BY-4.0 | BENCH | gold-anchored entity misspellings, 370–33.5k corrections each |
|
| 21 |
+
|
| 22 |
+
## Wild messy tables (35 — eval/wild_bench.py → docs/WILD_BENCH.md)
|
| 23 |
+
|
| 24 |
+
24 portal tables (training/unpaired_sources.json cache: NYC/Chicago/SF/LA/Seattle/TX/WA
|
| 25 |
+
portals, spotify, billboard, titanic, worldcities, airlines) + 12 stage-3 additions
|
| 26 |
+
(training/harvest_wild.py): bx_books (mojibake), salary_survey, fec_indiv80 (PII,
|
| 27 |
+
headerless), acnc_charities (AU), uk_price_paid (headerless UK), irs_eo1,
|
| 28 |
+
glassdoor_jobs (multiline cells), paris_trees (FR), online_retail, bl_flickr_books,
|
| 29 |
+
open_food_facts (211 cols), ct_real_estate. Backlog: CMS doctors (API 400), NHTSA
|
| 30 |
+
FLAT_CMPL (multi-GB), Canada contracts (627MB).
|
| 31 |
+
|
| 32 |
+
## Alias vocabularies (training generator material)
|
| 33 |
+
|
| 34 |
+
| vocab | size | license | regime |
|
| 35 |
+
|---|---|---|---|
|
| 36 |
+
| toughtables_aliases | 49,629 | CC-BY-4.0 | real entity misspellings (gold-anchored) |
|
| 37 |
+
| musicbrainz_hint_aliases | 34,017 | CC0 | community-recorded artist misspellings |
|
| 38 |
+
| rxnorm_aliases | 17,701 | public domain | drug name synonyms |
|
| 39 |
+
| ror_aliases | 73k orgs | CC0 | research orgs |
|
| 40 |
+
| geonames_city_aliases | 80k cities | CC-BY | city aliases |
|
| 41 |
+
| wikidata_company_aliases | 10.2k | CC0 | company aliases |
|
| 42 |
+
| onet_jobtitle_aliases | 1,016 | CC-BY-4.0 | job titles |
|
| 43 |
+
| nickname_aliases | 555 | Apache-2.0 | first names |
|
| 44 |
+
| openflights_airports | 7,698 | ODbL/DbCL | airports reference |
|
| 45 |
+
| libpostal_aliases | — | MIT | address abbreviations |
|
| 46 |
+
|
| 47 |
+
## Measured conclusions that govern future widening
|
| 48 |
+
|
| 49 |
+
1. Pre-paired corpus discovery is SATURATED (3 verified hunts) — synthesis from
|
| 50 |
+
vocabularies is the widening path.
|
| 51 |
+
2. Pair volume / vocab training does NOT move held-out generalization (v7–v9, 4
|
| 52 |
+
retrains + tt-transfer test): the planner's value_counts cap (80) structurally
|
| 53 |
+
hides high-cardinality dirty cells. The unlock is architectural: error-suspect /
|
| 54 |
+
windowed profiling and cross-row entity voting.
|
| 55 |
+
3. The deterministic side (grounding + ops + verifier union) carries never-seen
|
| 56 |
+
tables today; every op added from a measured regime (normalize_punctuation)
|
| 57 |
+
moved GEN; convention/encoding ops are the cheapest remaining wins.
|
docs/DEGENERATE_BASELINES.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Degenerate baselines + cost-weighted damage (W4.3 + W4.4)
|
| 2 |
+
|
| 3 |
+
Same 42 dirty/clean pairs as `eval/paired_bench.py`, scored with `run_real_multi.score()` (churn-neutral F1 + damage). The degenerate policies pin
|
| 4 |
+
the metric: no-op = floor (F1 0, damage 0), oracle = ceiling (F1 1, damage 0),
|
| 5 |
+
random-edit (seeded, 5% of cells) = vandalism the metric must punish. Abstain-all
|
| 6 |
+
is score-identical to no-op — the repair metric is flag-blind by design.
|
| 7 |
+
|
| 8 |
+
| policy | macro F1 | macro P | macro R | macro damage | fixed | damage cells |
|
| 9 |
+
|---|---|---|---|---|---|---|
|
| 10 |
+
| no-op | 0.000 | 1.000 | 0.000 | 0.0000 | 0 | 0 |
|
| 11 |
+
| abstain-all | 0.000 | 1.000 | 0.000 | 0.0000 | 0 | 0 |
|
| 12 |
+
| random-edit | 0.000 | 0.001 | 0.001 | 0.0485 | 39 | 80042 |
|
| 13 |
+
| oracle | 1.000 | 1.000 | 1.000 | 0.0000 | 163607 | 0 |
|
| 14 |
+
| shipped | 0.343 | 0.576 | 0.308 | 0.0229 | 83543 | 61679 |
|
| 15 |
+
|
| 16 |
+
## Cost-weighted scores (Effective-Reliability style, W4.4)
|
| 17 |
+
|
| 18 |
+
score_c = fixes − c·damage_cells, micro-summed over all pairs; per-error =
|
| 19 |
+
score_c / 163607 total benchmark errors.
|
| 20 |
+
|
| 21 |
+
| policy | c=1 (per-error) | c=5 (per-error) | c=10 (per-error) |
|
| 22 |
+
|---|---|---|---|
|
| 23 |
+
| no-op | 0 (+0.000) | 0 (+0.000) | 0 (+0.000) |
|
| 24 |
+
| abstain-all | 0 (+0.000) | 0 (+0.000) | 0 (+0.000) |
|
| 25 |
+
| random-edit | -80003 (-0.489) | -400171 (-2.446) | -800381 (-4.892) |
|
| 26 |
+
| oracle | 163607 (+1.000) | 163607 (+1.000) | 163607 (+1.000) |
|
| 27 |
+
| shipped | 21864 (+0.134) | -224852 (-1.374) | -533247 (-3.259) |
|
| 28 |
+
|
| 29 |
+
Acceptance: oracle F1 = 1.0 on all pairs: **True** · no-op damage = 0.0 on all pairs: **True**
|
| 30 |
+
Repro: `uv run python -m eval.degenerate` (seed 7, edit fraction 0.05).
|
docs/FIELD_NOTES.md
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Field notes — building ScrubData small, on purpose
|
| 2 |
+
|
| 3 |
+
*Build Small Hackathon, June 2026. A ≤4B model, a Gradio Space, and two weeks of
|
| 4 |
+
finding out what "small but honest" actually costs.*
|
| 5 |
+
|
| 6 |
+
## The bet
|
| 7 |
+
|
| 8 |
+
The person who most needs data cleaning — the ops coordinator with a messy CRM export
|
| 9 |
+
and a Monday deadline — will never write a pandas script, and shouldn't have to ship
|
| 10 |
+
her customer data to a frontier API either. The bet: a 4B model running locally is
|
| 11 |
+
enough, **if you stop asking it to edit data and start asking it to plan**.
|
| 12 |
+
|
| 13 |
+
So the model never touches a cell. It reads an aggregated profile (per-value frequency
|
| 14 |
+
counts — so the model sees a bounded, fixed-size summary whether the table has a hundred
|
| 15 |
+
rows or a million) and emits a JSON plan; deterministic pandas executes it. Every change is named, reversible, and logged. Silent edits are
|
| 16 |
+
impossible by construction. That decomposition turned out to be the whole project.
|
| 17 |
+
|
| 18 |
+
## Things that broke, in order
|
| 19 |
+
|
| 20 |
+
**The fine-tune that aced the test and failed the job.** v4 hit canonicalization F1
|
| 21 |
+
0.90 on held-out synthetic data — and scored exactly 0.000 on real hospital typos. It
|
| 22 |
+
had never seen a high-cardinality real column. Fix: derive training pairs from real
|
| 23 |
+
dirty/clean benchmark tables by cell alignment, keeping only *learnable*
|
| 24 |
+
canonicalizations (a surface form that's a string variant of its target and never a
|
| 25 |
+
legitimate value elsewhere). Real repair recall: 0.00 → 0.42. Synthetic data teaches
|
| 26 |
+
the format; real data teaches the job.
|
| 27 |
+
|
| 28 |
+
**The GGUF that lobotomized the model.** Same adapter, two exports: Q8_0 worked
|
| 29 |
+
perfectly, Q4_K_M degenerated into `<tool_call>` loops. Hours of template debugging
|
| 30 |
+
later: the quantization itself was corrupting the export. Then the bf16 path had its
|
| 31 |
+
own version — training converged (loss 0.16) but free-running generation *still*
|
| 32 |
+
emitted tool-call loops, because Qwen3's tool-calling prior dominates the first token.
|
| 33 |
+
The fix is two tokens long: `suppress_tokens=[151657, 151658]`.
|
| 34 |
+
|
| 35 |
+
**The model that invented cities.** Asked for canonical forms, a generative model
|
| 36 |
+
generates — including `guntxrsvillx → huntsville` (wrong town). Frequency clustering
|
| 37 |
+
can't fix this either: a lone column has no signal to vote against the error (GARF
|
| 38 |
+
proves this structurally). The fix came from the literature: never free-generate a
|
| 39 |
+
canonical. Retrieve candidates from a reference taxonomy (GeoNames, ISO), require a
|
| 40 |
+
similarity threshold *and* an ambiguity margin, and **abstain** when unsure. `boxz` is
|
| 41 |
+
equally close to `Box` and `Boaz` — so the system declines and asks. We measured the
|
| 42 |
+
abstention: precision rises monotonically with the threshold (90% at the default, 95%
|
| 43 |
+
at 0.91). Knowing when not to act turned out to be the most valuable feature.
|
| 44 |
+
|
| 45 |
+
**The eval that graded itself too kindly — twice.** Our own ablations caught two metric
|
| 46 |
+
artifacts: (1) convention-tolerant scoring counted bulk case-rewrites as "good
|
| 47 |
+
changes," inflating precision — removing case-matching *gained* +0.12 until we made
|
| 48 |
+
the metric churn-neutral; (2) our adversarial traps included `Boazz`, which grounding
|
| 49 |
+
correctly maps to the real city Boaz — the trap was punishing correct behavior. Both
|
| 50 |
+
fixes are reported in the paper as results, because an eval you haven't tried to break
|
| 51 |
+
is an eval you can't trust.
|
| 52 |
+
|
| 53 |
+
**The honest negative result.** On *injected* typos, classical frequency clustering
|
| 54 |
+
remains a strong baseline — by construction: injection puts the canonical in the
|
| 55 |
+
column, which is clustering's ideal regime. Grounding's edge is real errors, tail
|
| 56 |
+
entities, and not wrong-merging. We report both slices separately rather than
|
| 57 |
+
averaging the difference away.
|
| 58 |
+
|
| 59 |
+
**The verifier that made the model shippable.** The fine-tune's hospital numbers told
|
| 60 |
+
an awkward story: recall 0.475 (best we'd measured for a local model) at precision
|
| 61 |
+
0.185 — it fixed errors *and* invented merges. Instead of retraining, we scored every
|
| 62 |
+
proposed mapping with three deterministic gates distilled from its actual failures: a
|
| 63 |
+
value occurring ≥3 times is data, not a typo (*errors are rare*); a repair target must
|
| 64 |
+
dominate its source in frequency (no mapping one typo onto another); digit-bearing
|
| 65 |
+
codes only repair when the letter part is near-identical (`amix-2 → ami-2` yes,
|
| 66 |
+
`ak_ → al_` no). The gated model plan alone: **0.993 precision at 0.287 coverage** —
|
| 67 |
+
146 of 147 changes correct. Union it with the grounded heuristic and you get **0.905
|
| 68 |
+
precision at 0.413 coverage** on hospital's 509 real errors. Every dropped mapping
|
| 69 |
+
becomes a review flag, not a silent skip. That composition — verify the model's
|
| 70 |
+
output, never trust it — is what the app now ships as its default planner.
|
| 71 |
+
|
| 72 |
+
## The PII turn
|
| 73 |
+
|
| 74 |
+
A friend pointed at the OpenMed project (small Apache-2.0 token classifiers; their
|
| 75 |
+
paper is the sister result to our thesis — small specialized beats big generic). Their
|
| 76 |
+
44M PII model, trained on clinical *sentences*, turned out to transfer perfectly to
|
| 77 |
+
bare CSV cells: 100% on names and addresses, no prompt template needed. We put it
|
| 78 |
+
behind a sensitive-type allowlist and a column-level vote, added a deterministic
|
| 79 |
+
checksum tier (Luhn, IBAN mod-97 — math, not vibes), and made masking an executor
|
| 80 |
+
operation. Leak test: 0/360 residual detectable PII after masking. OOD type detection:
|
| 81 |
+
5/5 with 0/7 false positives. The privacy ribbon at the top of the app — "nothing
|
| 82 |
+
leaves this machine" — now describes the PII handling too, not just the inference.
|
| 83 |
+
|
| 84 |
+
## The word that broke the demo
|
| 85 |
+
|
| 86 |
+
We shipped the engine, then sent the live Space to people who actually have messy
|
| 87 |
+
spreadsheets and aren't data people. The most useful feedback wasn't a bug report — it
|
| 88 |
+
was that the word **"cleaning" didn't mean anything to them**. One tester read "clean my
|
| 89 |
+
Excel" as *deleting* data: *"¿Te refieres a que elimine algo de algún archivo?"* ("you
|
| 90 |
+
mean it removes something from the file?"). Another didn't know where to start: *"¿eso
|
| 91 |
+
del Excel te lo subimos ahí o cómo?"* ("the Excel thing — do we upload it there, or
|
| 92 |
+
how?"). The clearest explanation of the whole product turned out to be a sentence we
|
| 93 |
+
typed by hand in a chat reply — *"it fixes text errors: names, phones, emails, cities"* —
|
| 94 |
+
and that sentence was nowhere in the app.
|
| 95 |
+
|
| 96 |
+
The engine was fine. The *framing* was the failure. So we changed the product to **show**
|
| 97 |
+
what cleaning is instead of naming it: the hero now opens with a literal before→after
|
| 98 |
+
strip (`nigeia → Nigeria`, `Calfornia → California`) before any upload, the headline is
|
| 99 |
+
the sentence that worked in chat ("Fix the messy text in your spreadsheet"), the copy
|
| 100 |
+
says plainly "I never delete your data," jargon labels are gone ("with PII" → "with
|
| 101 |
+
sensitive data"), and a one-click "watch it run on a sample" path removes the "where do I
|
| 102 |
+
even start" wall. One honesty footnote from the rewrite: our first before→after example
|
| 103 |
+
added a `+52` country code to a phone number — which the executor doesn't actually do — so
|
| 104 |
+
we cut it. The demo strip can only show what the engine truly does.
|
| 105 |
+
|
| 106 |
+
The sample was small and informal (~3 people we know), so this isn't a usability study. But you
|
| 107 |
+
only need to watch one person mistake your tool for a delete button to learn the lesson:
|
| 108 |
+
the people who most need the tool don't share your vocabulary, and the demo has to teach
|
| 109 |
+
the concept before it can show the feature.
|
| 110 |
+
|
| 111 |
+
## What we'd tell the next person
|
| 112 |
+
|
| 113 |
+
1. **Planner/executor is the trust unlock.** Auditability isn't a feature you add;
|
| 114 |
+
it's a decomposition you choose.
|
| 115 |
+
2. **Verify supervision by executing it.** Every training example we kept provably
|
| 116 |
+
recovers the clean table. Bad plans can't become labels.
|
| 117 |
+
3. **Ground generation in references and budget for abstention.** A small model that
|
| 118 |
+
declines correctly beats a big model that guesses confidently.
|
| 119 |
+
4. **Attack your own eval before reviewers do.** Both of our metric bugs were found by
|
| 120 |
+
ablations we almost didn't run.
|
| 121 |
+
5. **Small models are enough more often than you think** — and roughly $35 of GPU
|
| 122 |
+
credit covers an embarrassing number of mistakes if each one teaches you something.
|
| 123 |
+
6. **Test the framing on someone outside your vocabulary.** The engine can be correct and
|
| 124 |
+
the product still unusable if the first screen assumes a word — "cleaning" — that your
|
| 125 |
+
user doesn't have. Show the concept before you name the feature.
|
| 126 |
+
|
| 127 |
+
— Built with a ≤4B planner, a 44M PII classifier, checksums, and a reference gazetteer.
|
| 128 |
+
Total model weight: under 4.1B parameters. Total cloud spend: about $35.
|
docs/GITTABLES_AUDIT.md
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GitTables N=250 audit — trust contract at scale
|
| 2 |
+
|
| 3 |
+
We ran the shipped pipeline over 239 real GitHub tables (Matelda GitTables-subsets,
|
| 4 |
+
Apache-2.0). IMPORTANT framing: this subset is a CLEAN LAKE (dirty == clean for
|
| 5 |
+
238/239 tables), so the repair-F1 dimension is void and `macro_damage` is NOT
|
| 6 |
+
damage — it is an INTERVENTION-RATE upper bound (any semantic normalization the
|
| 7 |
+
pipeline performs counts against gold=input, including intended format parsing).
|
| 8 |
+
What this audit certifies: robustness (0 pipeline failures), schema validity
|
| 9 |
+
(239/239), and ZERO silent edits across 239 arbitrary real-world tables — the
|
| 10 |
+
trust contract at scale. The ~5.5% intervention rate (43 tables untouched) is
|
| 11 |
+
the conservative measure of how much the pipeline chooses to act on arbitrary
|
| 12 |
+
tables.
|
| 13 |
+
|
| 14 |
+
| metric | value |
|
| 15 |
+
|---|---|
|
| 16 |
+
| tables_audited | 239 |
|
| 17 |
+
| pipeline_failures | 0 |
|
| 18 |
+
| plan_valid | 239 |
|
| 19 |
+
| tables_with_silent_edits | 0 |
|
| 20 |
+
| tables_with_errors | 1 |
|
| 21 |
+
| macro_f1_on_errored | 0.0 |
|
| 22 |
+
| macro_damage | 0.055 |
|
| 23 |
+
| zero_damage_tables | 43 |
|
| 24 |
+
| seconds | 796.9 |
|
docs/PAIRED_BENCH.md
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Paired Bench — shipped system on every cell-aligned pair
|
| 2 |
+
|
| 3 |
+
Churn-neutral repairs metric + variant-class recall (the `VR` column); `seen` = source fed
|
| 4 |
+
the champion's training mix (flagged, not hidden).
|
| 5 |
+
|
| 6 |
+
| dataset | seen | rows×cols | errors | variant | F1 | precision | recall | VR | damage |
|
| 7 |
+
|---|---|---|---|---|---|---|---|---|---|
|
| 8 |
+
| dgov_2_10_budget_presentation_award_summary | | 16×6 | 9 | 9 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
|
| 9 |
+
| dgov_emergency_operating_center_tools | | 7×3 | 4 | 3 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
|
| 10 |
+
| dgov_illinois_obesity_by_county | | 102×5 | 17 | 17 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
|
| 11 |
+
| fodors_zagats | ✓ | 112×6 | 206 | 206 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0536 |
|
| 12 |
+
| rayyan | | 1000×11 | 948 | 171 | 0.0 | 0.0 | 0.0 | 0.0 | 0.1178 |
|
| 13 |
+
| zeroed_tax100k | | 20000×15 | 952 | 117 | 0.0 | 0.0 | 0.006 | 0.051 | 0.0822 |
|
| 14 |
+
| ed2_restaurants | | 20000×15 | 309 | 76 | 0.001 | 0.0 | 0.026 | 0.105 | 0.0718 |
|
| 15 |
+
| dblp_acm | | 2224×4 | 2128 | 2128 | 0.003 | 0.273 | 0.001 | 0.001 | 0.001 |
|
| 16 |
+
| cleanml_movie | ✓ | 9329×8 | 4779 | 8 | 0.008 | 0.019 | 0.005 | 0.0 | 0.0172 |
|
| 17 |
+
| dblp_scholar | | 2408×4 | 3099 | 3099 | 0.008 | 0.012 | 0.006 | 0.006 | 0.233 |
|
| 18 |
+
| tt_cn5wvwhh | | 8302×5 | 370 | 370 | 0.021 | 0.046 | 0.014 | 0.014 | 0.0025 |
|
| 19 |
+
| beers | ✓ | 2410×11 | 4362 | 693 | 0.026 | 0.042 | 0.019 | 0.117 | 0.0044 |
|
| 20 |
+
| dgov_mva_vehicle_sales_counts_by_month_for_ca | | 248×6 | 43 | 24 | 0.042 | 0.2 | 0.023 | 0.042 | 0.0 |
|
| 21 |
+
| zeroed_billionaire | | 2614×22 | 5248 | 1146 | 0.103 | 0.232 | 0.067 | 0.305 | 0.0042 |
|
| 22 |
+
| dgov_field_listings | | 122×20 | 317 | 250 | 0.106 | 0.133 | 0.088 | 0.112 | 0.0523 |
|
| 23 |
+
| flights | | 2376×7 | 4920 | 1049 | 0.164 | 0.265 | 0.119 | 0.247 | 0.0839 |
|
| 24 |
+
| dgov_grocery_stores_2013 | | 506×17 | 420 | 332 | 0.21 | 0.265 | 0.174 | 0.193 | 0.0192 |
|
| 25 |
+
| cleanml_company | ✓ | 20000×9 | 65 | 65 | 0.243 | 0.147 | 0.708 | 0.708 | 0.0015 |
|
| 26 |
+
| dgov_median_household_income | | 174×19 | 138 | 83 | 0.25 | 0.579 | 0.159 | 0.265 | 0.0 |
|
| 27 |
+
| hospital | ✓ | 1000×20 | 509 | 379 | 0.258 | 0.169 | 0.542 | 0.607 | 0.0662 |
|
| 28 |
+
| dgov_louisville_metro_ky_inspection_results_p | | 521×18 | 1126 | 1044 | 0.31 | 0.933 | 0.186 | 0.2 | 0.0002 |
|
| 29 |
+
| dgov_la_county_covid_cases | | 975×14 | 579 | 579 | 0.34 | 0.983 | 0.206 | 0.206 | 0.0 |
|
| 30 |
+
| dgov_allegheny_county_tobacco_vendors | | 1248×12 | 2392 | 2109 | 0.343 | 0.882 | 0.213 | 0.242 | 0.0008 |
|
| 31 |
+
| dgov_legislative_bridge_names | | 252×16 | 415 | 396 | 0.358 | 0.614 | 0.253 | 0.265 | 0.0091 |
|
| 32 |
+
| tt_co23z7go | | 15477×4 | 33542 | 33542 | 0.36 | 0.929 | 0.223 | 0.223 | 0.0004 |
|
| 33 |
+
| dgov_louisville_metro_ky_permitted_hotels_and | | 131×13 | 191 | 182 | 0.424 | 0.898 | 0.277 | 0.291 | 0.0007 |
|
| 34 |
+
| dgov_health_conditions_among_children_under_a | | 2744×16 | 2900 | 2844 | 0.426 | 0.357 | 0.528 | 0.539 | 0.0569 |
|
| 35 |
+
| gidcl_imdb | ✓ | 20000×6 | 13320 | 7890 | 0.438 | 0.489 | 0.396 | 0.669 | 0.0297 |
|
| 36 |
+
| tt_uma1dnf6 | | 8302×5 | 5080 | 5080 | 0.442 | 0.911 | 0.292 | 0.292 | 0.0026 |
|
| 37 |
+
| dgov_medicare_part_d_opioid_prescribing_rates | | 677×17 | 547 | 547 | 0.447 | 0.775 | 0.314 | 0.314 | 0.0026 |
|
| 38 |
+
| dgov_access_control | | 4928×13 | 4180 | 4161 | 0.551 | 0.933 | 0.391 | 0.392 | 0.0 |
|
| 39 |
+
| dgov_3_09_census_acs_post_secondary_education | | 53×17 | 82 | 82 | 0.552 | 0.941 | 0.39 | 0.39 | 0.0 |
|
| 40 |
+
| dgov_305b_assessed_lake_2020 | | 182×23 | 442 | 424 | 0.556 | 0.766 | 0.437 | 0.455 | 0.0139 |
|
| 41 |
+
| dgov_ah_provisional_diabetes_death_counts_for | | 226×16 | 142 | 141 | 0.571 | 0.951 | 0.408 | 0.411 | 0.0 |
|
| 42 |
+
| dgov_jefferson_county_ky_post_offices | | 32×9 | 26 | 26 | 0.651 | 0.824 | 0.538 | 0.538 | 0.0115 |
|
| 43 |
+
| dgov_national_obesity_by_state_1 | | 52×5 | 13 | 13 | 0.7 | 1.0 | 0.538 | 0.538 | 0.0 |
|
| 44 |
+
| movies_1 | ✓ | 7390×17 | 7006 | 5567 | 0.705 | 0.639 | 0.786 | 0.989 | 0.0226 |
|
| 45 |
+
| tt_3n6s2fcx | | 9396×3 | 9510 | 9510 | 0.955 | 0.998 | 0.916 | 0.916 | 0.0 |
|
| 46 |
+
| tt_2zwsmotj | | 10855×3 | 10977 | 10977 | 0.956 | 0.997 | 0.918 | 0.918 | 0.0 |
|
| 47 |
+
| tt_8yinkydr | | 14008×3 | 14188 | 14188 | 0.956 | 0.997 | 0.918 | 0.918 | 0.0 |
|
| 48 |
+
| tt_dvnkv0xu | | 15477×4 | 15676 | 15676 | 0.956 | 0.997 | 0.919 | 0.919 | 0.0 |
|
| 49 |
+
| tt_00e2h310 | | 12285×3 | 12433 | 12433 | 0.957 | 0.998 | 0.919 | 0.919 | 0.0 |
|
docs/PAPER.md
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
> **SUPERSEDED SCAFFOLD (2026-06-12).** The paper was reframed; current title:
|
| 2 |
+
> "Verified Cleaning Plans: Plan-Level Selective Prediction Turns Local LLM
|
| 3 |
+
> Planners into Trustworthy Table Cleaners". This file is the original outline,
|
| 4 |
+
> kept for history. The live paper is docs/paper/main.tex.
|
| 5 |
+
|
| 6 |
+
# ScrubData — paper scaffold & related-work map
|
| 7 |
+
|
| 8 |
+
**Working title:** *Small fine-tuned planners with execution-verified data and calibrated
|
| 9 |
+
abstention match larger models on tabular canonicalization.*
|
| 10 |
+
|
| 11 |
+
**One-line claim (measured):** a ≤4B fine-tune that emits a *cleaning plan* (not edited cells)
|
| 12 |
+
reaches `canon_f1 0.86` on alias-level canonicalization vs `0.45` for a large generic model and
|
| 13 |
+
`0.13` for a rule heuristic — and, with reference grounding + calibrated abstention, beats the
|
| 14 |
+
tool people actually use (OpenRefine) on a wide validation suite at far lower damage.
|
| 15 |
+
|
| 16 |
+
## Contributions (the combination is the novelty — not "LLM cleans data")
|
| 17 |
+
1. **Planner/executor decomposition.** The model proposes a structured JSON plan; deterministic
|
| 18 |
+
pandas executes it. Auditable, reversible, **no silent edits** (`observability.py`,
|
| 19 |
+
`trace.py`). This is the trust/monitorability contract.
|
| 20 |
+
2. **Execution-self-verified synthetic SFT.** Every training example's plan is checked to
|
| 21 |
+
actually recover the known-clean original by *running the executor* (`training/build_dataset.py`).
|
| 22 |
+
A clean, citable data-generation method (drops non-recovering examples).
|
| 23 |
+
3. **Reference grounding + calibrated abstention.** Canonicalization is reconciled against a
|
| 24 |
+
type-scoped taxonomy (GeoNames/pycountry; `reconcile.py`, `grounded.py`); the system ABSTAINS
|
| 25 |
+
under ambiguity instead of hallucinating a canonical (`eval/calibration.py`: risk-coverage +
|
| 26 |
+
ECE). Structural fix for the over-correction larger models also exhibit.
|
| 27 |
+
4. **Aggregation + column-batching.** Prompt size scales with *distinct values*, not rows
|
| 28 |
+
(`profiler.py` value_counts + `model_planner.make_batched_planner`).
|
| 29 |
+
|
| 30 |
+
## Related work (position against — reviewers know this field)
|
| 31 |
+
- **Error detection/repair:** Raha & Baran (Mahdavi et al.), HoloClean (Rekatsinas et al. 2017,
|
| 32 |
+
`arXiv 1702.00820`), GARF — we *use* their hospital/beers/flights/rayyan as OOD eval and cite
|
| 33 |
+
GARF as the frequency-only baseline our grounding beats (it cannot supply a canonical for a lone
|
| 34 |
+
column).
|
| 35 |
+
- **LLMs for data wrangling:** "Can Foundation Models Wrangle Your Data?" (Narayan et al. 2022),
|
| 36 |
+
Jellyfish, Table-GPT/TableLlama (`2311.09206`), RetClean (`2303.16909`). We differ by being a
|
| 37 |
+
*small fine-tuned planner* + grounding + abstain, not a large zero-shot value-editor.
|
| 38 |
+
- **Grounding / entity disambiguation:** RACOON (`2409.14556`), TURL (`2006.14806`), Belotti et al.
|
| 39 |
+
table-EL (`2408.06423`), MTab — motivate retrieval-then-abstain and warn against memorizing
|
| 40 |
+
canonicals into weights (TURL ~40% OOD collapse). See `taxonomy-grounding.md`.
|
| 41 |
+
- **The tool we beat:** **OpenRefine** clustering — fingerprint (key collision) + nearest-neighbor
|
| 42 |
+
(kNN/edit-distance), reimplemented as `scrubdata/baselines.py` for head-to-head.
|
| 43 |
+
- **Selective prediction:** calibrated abstention / risk-coverage (El-Yaniv & Wiener; Geifman &
|
| 44 |
+
El-Yaniv) — our ECE/AURC study; also the AI-safety monitorability framing.
|
| 45 |
+
|
| 46 |
+
## Experiments
|
| 47 |
+
- **Headline:** canon_f1 vs large-generic vs heuristic on frozen synthetic gold (Layer 1).
|
| 48 |
+
- **Wide north-star (`eval/run_real_multi.py`):** double-macro (error-type × domain) F1 + damage +
|
| 49 |
+
abstain over Raha real-error sets **+ seeded error-injection** on 20+ harvested gov/GitHub clean
|
| 50 |
+
domains (`eval/inject.py`); multi-seed 95% CIs. Hospital is 1 dataset of many.
|
| 51 |
+
- **Money result:** grounded vs OpenRefine fingerprint & kNN on the same suite (grounded wins F1 +
|
| 52 |
+
damage; kNN over-merges — higher recall, low precision, high damage).
|
| 53 |
+
- **Calibration (`eval/calibration.py`):** risk-coverage, AURC, ECE; operating point for ≥95%
|
| 54 |
+
precision via the abstain threshold.
|
| 55 |
+
- **Ablations to add:** −grounding, −abstain, −execution-verification, −aggregation.
|
| 56 |
+
|
| 57 |
+
## Honest limitations (the integrity reviewers reward)
|
| 58 |
+
- Reference *coverage* is the recall ceiling (Belotti) — uncovered entities abstain by design.
|
| 59 |
+
- Convention vs error: standardization (date→ISO, `%`→fraction) is product value, not damage —
|
| 60 |
+
the metric is case/whitespace-normalized but a format-aware variant is future work.
|
| 61 |
+
- ECE shows mild over-confidence (difflib-ratio scores) — temperature/Platt scaling is future work.
|
| 62 |
+
- Some benchmark sources are gated (CleanML/TableEG behind Dropbox/Drive; licenses noted).
|
| 63 |
+
|
| 64 |
+
## To-do before submission
|
| 65 |
+
multi-seed CIs (running) · −ablations · OpenRefine table with CIs · cs.DB endorser (primary cs.DB, cross-list cs.CL+cs.LG; endorser targets = the data-cleaning authors we cite) ·
|
| 66 |
+
selective-prediction figure · keep the eval README's convention-vs-error honesty.
|
docs/SCALING_ARM.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# W1.c — ≤32B Zero-Label Repair Scaling Arm (multi-family, zero-shot)
|
| 2 |
+
|
| 3 |
+
First scaling measurement for the verified-union planner: vanilla (NOT fine-tuned)
|
| 4 |
+
20–31B open-weights models dropped into the EXACT hospital pipeline the 4B fine-tune
|
| 5 |
+
gate used — batched raw planner (batch_size=4, same `scrubdata/prompt.py` contract,
|
| 6 |
+
temperature 0) → `verify_plan(tau=0.5)` → union with the grounded heuristic
|
| 7 |
+
(`mock_plan`). Scored against hospital's 509 real errors with the
|
| 8 |
+
`eval/precision_curve.py` repairs-only churn-neutral protocol. Protocol parity was
|
| 9 |
+
verified by re-scoring the captured v6 plan through the same scorer: it reproduces the
|
| 10 |
+
prior gate numbers exactly (gated 0.993/0.287, union 0.905/0.413).
|
| 11 |
+
|
| 12 |
+
Disclosure: ≤32B open-weights models measured via hosted inference for speed; all are
|
| 13 |
+
locally deployable in principle.
|
| 14 |
+
|
| 15 |
+
| model | params (B) | family | gated P @ C | union P @ C | validity | kept/dropped | runtime (s) |
|
| 16 |
+
|---|---|---|---|---|---|---|---|
|
| 17 |
+
| scrubdata-ft-v6 (Qwen3-4B fine-tune) | 4 | qwen3 (fine-tuned) | **0.993** @ 0.287 | 0.905 @ 0.413 | — | 132/38 | — (prior measurement) |
|
| 18 |
+
| gpt-oss:20b | 20 | openai/gpt-oss | 1.0 @ 0.000* | 0.845 @ 0.257* | 0.0 | 0/0 | 360 |
|
| 19 |
+
| devstral-small-2:24b | 24 | mistral/devstral | 0.943 @ 0.426 | 0.915 @ **0.485** | 1.0 | 208/87 | 135 |
|
| 20 |
+
| nemotron-3-nano:30b | 30 | nvidia/nemotron | 1.0 @ 0.138 | 0.877 @ 0.336 | 0.4 | 63/6 | 114 |
|
| 21 |
+
| gemma4:31b | 31 | google/gemma | 0.943 @ 0.426 | **0.915 @ 0.485** | 1.0 | 209/28 | 104 |
|
| 22 |
+
|
| 23 |
+
\* gpt-oss:20b is a serving-path failure, not a measured capability: the model
|
| 24 |
+
generated ~4.8k tokens per planning call (`done_reason=stop`) but the Ollama Cloud
|
| 25 |
+
proxy returned empty `content` and empty `thinking` on all 5 calls at both
|
| 26 |
+
num_predict=4000 and 8000 (simple prompts work) — its "gated" point is the degenerate
|
| 27 |
+
empty plan and its "union" point is the heuristic backstop alone. nemotron-3-nano
|
| 28 |
+
produced valid JSON on only 2/5 batch calls at num_predict=8000 (long-thinking
|
| 29 |
+
truncation); validity is part of the measurement.
|
| 30 |
+
|
| 31 |
+
**Interpretation.** Zero-shot capability at 24–31B does close the gap to the 4B
|
| 32 |
+
fine-tune — and slightly exceeds it — inside the same verifier harness: devstral-24B and
|
| 33 |
+
gemma4-31B both land at union 0.915 precision @ 0.485 coverage vs the fine-tune's
|
| 34 |
+
0.905 @ 0.413, though the fine-tune remains the most precise gated planner
|
| 35 |
+
(0.993 vs 0.943) and the only ≤4B point, while two of the four bigger families
|
| 36 |
+
(gpt-oss, nemotron) fail on plan-schema validity before capability even gets
|
| 37 |
+
measured. Gemma4-31B is the best family on balance: same gate point as devstral but
|
| 38 |
+
cleaner raw plans (verifier dropped 28 entries vs devstral's 87 — vs 38 for the 4B
|
| 39 |
+
fine-tune) and the fastest wall-clock (104s). The union still dominates everywhere:
|
| 40 |
+
every model's union point adds coverage over its gated point at gate-passing
|
| 41 |
+
precision, and it floors even the broken planners (nemotron 0.877 @ 0.336) because
|
| 42 |
+
the grounded heuristic covers whatever the model misses.
|
| 43 |
+
|
| 44 |
+
Artifacts: `eval/results/scaling_arm.json` (rows + provenance),
|
| 45 |
+
`eval/results/scaling_<model>_hospital_raw_plan.json` (captured raw plans),
|
| 46 |
+
runner: `eval/scaling_arm.py`.
|
docs/TOOL_REFERENCE.md
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ScrubData — The Profound Tool Reference
|
| 2 |
+
|
| 3 |
+
> The single local document that explains the whole system: what it is, why every
|
| 4 |
+
> piece exists, where every number comes from, and what we learned building it.
|
| 5 |
+
> Written at the close of the research domain (2026-06-12). The paper
|
| 6 |
+
> (`docs/paper/main.tex`) is the citable account; THIS file is the operational one.
|
| 7 |
+
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
## 1. What ScrubData is
|
| 11 |
+
|
| 12 |
+
ScrubData is a **zero-config, zero-label, local** tabular data-cleaning system built
|
| 13 |
+
around one architectural commitment: **the model never touches data**.
|
| 14 |
+
|
| 15 |
+
A profiler aggregates each column into a bounded value-frequency profile; a small
|
| 16 |
+
(≤4B, locally-run) fine-tuned planner *proposes* a JSON cleaning plan; a
|
| 17 |
+
deterministic pandas executor *applies* it. The plan is the complete, inspectable,
|
| 18 |
+
reversible specification of every change. Three consequences define the product:
|
| 19 |
+
|
| 20 |
+
1. **No silent edits by construction** — every changed cell traces to a named,
|
| 21 |
+
logged operation (verified at scale: 0 silent edits across 35 wild tables and a
|
| 22 |
+
239-table GitTables trust audit).
|
| 23 |
+
2. **Abstention is first-class** — anything below the confidence threshold becomes a review flag
|
| 24 |
+
("YOUR CALL" card in the UI), never a quiet skip and never a guess.
|
| 25 |
+
3. **Profile-not-rows scaling** — the prompt scales with *distinct values*, not
|
| 26 |
+
rows; a million-row table profiles like a hundred-row one, and no cell values
|
| 27 |
+
leave the machine.
|
| 28 |
+
|
| 29 |
+
### The central finding (load-bearing, repeatedly measured)
|
| 30 |
+
|
| 31 |
+
**Model weights contribute approximately nothing to never-seen-table
|
| 32 |
+
generalization in this protocol class.** Five SFT retrains (v7–v10 + mixes, 109k
|
| 33 |
+
harvested real alias pairs) and a three-arm GRPO pilot (executor as verifiable
|
| 34 |
+
reward, including a random-reward control that reproduced the same format drift)
|
| 35 |
+
all failed to move held-out generalization. Every measured gain came from
|
| 36 |
+
**deterministic machinery gated by the plan-level verifier** (§5). Corroborated
|
| 37 |
+
independently by Spreadsheet-RL, arXiv:2601.05009, and arXiv:2606.02866.
|
| 38 |
+
Practical corollary: *to improve ScrubData, write a deterministic capability and
|
| 39 |
+
gate it with the verifier; do not collect more training data.*
|
| 40 |
+
|
| 41 |
+
---
|
| 42 |
+
|
| 43 |
+
## 2. The shipped pipeline (`scrubdata/active.py::get_planner`)
|
| 44 |
+
|
| 45 |
+
```
|
| 46 |
+
┌──────────────────────────────────────────────┐
|
| 47 |
+
df ──► profiler ──► │ model path (only if SCRUBDATA_MODEL is set) │
|
| 48 |
+
(bounded │ batched (4 cols/call) local Ollama planner │
|
| 49 |
+
profile incl. │ → per-batch fallback to heuristic on error │
|
| 50 |
+
suspects) │ → grounded (reference taxonomies, RACOON) │
|
| 51 |
+
│ → verify_plan(tau=SCRUBDATA_TAU, def 0.5) │
|
| 52 |
+
└───────────────┬──────────────────────────────┘
|
| 53 |
+
│ union_plans (model wins per surface;
|
| 54 |
+
│ inherits deterministic ops + table ops)
|
| 55 |
+
heuristic mock_plan ───────────┘
|
| 56 |
+
▼
|
| 57 |
+
executor.apply_plan → (clean_df, change_log)
|
| 58 |
+
▼
|
| 59 |
+
report.render_report · trace.log_run · observability
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
- **No model configured** → `mock_plan` (grounded deterministic heuristic) alone.
|
| 63 |
+
The app always produces a plan; the model is an upgrade, never a dependency.
|
| 64 |
+
- **Measured operating point** (hospital, 509 real errors): union **0.905
|
| 65 |
+
precision @ 0.413 coverage**; gated model alone 0.993 @ 0.287; 3-seed
|
| 66 |
+
0.891±0.012 @ 0.396±0.025. Precision flat 0.89–0.91 for τ∈[0.2,0.8].
|
| 67 |
+
|
| 68 |
+
Entry points: `uv run server.py` (FastAPI + UI), `app.py` (HF Space/Gradio),
|
| 69 |
+
`scrubdata/cli.py` (`scrubdata <file.csv> -o out.csv --report r.md --plan p.json`).
|
| 70 |
+
|
| 71 |
+
### Environment variables
|
| 72 |
+
|
| 73 |
+
| Var | Default | Meaning |
|
| 74 |
+
|---|---|---|
|
| 75 |
+
| `SCRUBDATA_MODEL` | unset | local Ollama model id (e.g. `scrubdata-ft-v6`); unset = heuristic only |
|
| 76 |
+
| `SCRUBDATA_TAU` | `0.5` | per-entry verifier threshold on model mappings |
|
| 77 |
+
| `SCRUBDATA_HC_TAU` | `0.8` | stricter bar for heuristic suspect-mappings (no model cross-check there) |
|
| 78 |
+
| `SCRUBDATA_PAIR_PROFILES` | off | WS2 candidate-constrained planning (measured redundant with verifier; off by default) |
|
| 79 |
+
| `SCRUBDATA_PII_NER` | off | OpenMed-PII 44M NER tier on top of deterministic validators |
|
| 80 |
+
|
| 81 |
+
---
|
| 82 |
+
|
| 83 |
+
## 3. Module map (`scrubdata/`)
|
| 84 |
+
|
| 85 |
+
| Module | Role | Key facts |
|
| 86 |
+
|---|---|---|
|
| 87 |
+
| `profiler.py` | column → bounded profile | `VALUE_COUNTS_CAP=80` (high-card cols: top-8 only) + `suspect_values` section (the visibility fix); `truncated_values` count keeps honesty about what's hidden |
|
| 88 |
+
| `detect.py` | typing + issue predicates | `detect_semantic_type` (zip/ZCTA/Excel-serial guards), `date_formats_consistent` (collapses digit AND alpha runs; 90% dominant-shape), `percent_formats_consistent` (90%), `has_mojibake`, `is_missing` |
|
| 89 |
+
| `planner.py` | deterministic heuristic planner | `mock_plan`, `_column_operations`, `_suspect_canonicalize` (τ_hc=0.8), `detect_entity_groups` (cross-row voting detection), emits `fix_encoding` BEFORE `strip_whitespace` (order-critical), `off_convention_dates` visible-abstention flags |
|
| 90 |
+
| `executor.py` | the only thing that touches cells | op dispatch (§4); unknown ops are no-ops (forward-compatible); returns `(df, change_log)`; `resolve_by_majority` table op lives here |
|
| 91 |
+
| `verifier.py` | WS1 selective prediction | `entry_confidence` (3 hard gates, §5.0), `verify_plan` (also enforces convention gates on MODEL-emitted parse_date/parse_percent — the model path otherwise bypasses them), `union_plans` (order-preserving op inheritance via `reversed(inherit)`) |
|
| 92 |
+
| `reconcile.py` | reference grounding | `ReferenceIndex`, `default_index()` loads toughtables_ref (contamination-guarded: excludes the 8 benchmark tables) + MusicBrainz hints + Wikidata companies + ROR; `infer_reference_type` needs **≥20% exact entity hits** (over-fire guard); falls back to `training/harvests/` for Space/clone parity |
|
| 93 |
+
| `grounded.py` | RACOON wrapper | model never free-generates a canonical for a reference-typed column |
|
| 94 |
+
| `pair_profile.py` | suspects + WS2 candidates | `suspects_for_column` (≤25/col, bounded: 4k rare cap + cheap prefilters before SequenceMatcher — 40min→24s fix), `candidate_pairs`, `constrain_plan` |
|
| 95 |
+
| `model_planner.py` | Ollama backends | `make_local_ollama_planner`, `make_batched_planner(batch_size=4)`, JSON extraction |
|
| 96 |
+
| `prompt.py` | prompt/training contract | `_profile_for_prompt` (compact suspects), `build_chat_example` (training-data side of the same contract — change one, regenerate the other) |
|
| 97 |
+
| `pii.py` | PII second task | deterministic validators (Luhn, IBAN, phone) + allowlist + coverage vote; optional 44M NER; `mask/hash/pseudonymize` |
|
| 98 |
+
| `active.py` | THE composition | `get_planner()` — §2 |
|
| 99 |
+
| `cli.py` / `report.py` / `trace.py` / `observability.py` | UX + audit | CLI, markdown report, JSONL traces, monitor summary/OTel span |
|
| 100 |
+
| `baselines.py` | OpenRefine kNN/fingerprint reimplementations | the zero-config comparison class |
|
| 101 |
+
| `refdata/cities.txt` | seed gazetteer | plus everything in `training/harvests/*.jsonl` |
|
| 102 |
+
|
| 103 |
+
---
|
| 104 |
+
|
| 105 |
+
## 4. Operation vocabulary (the executor's closed set)
|
| 106 |
+
|
| 107 |
+
**Column ops** (`_apply_column_op`): `strip_whitespace`, `normalize_punctuation`,
|
| 108 |
+
`fix_encoding` (lossless cp1252/latin-1↔utf8 round-trip, mojibake-marker-reduction
|
| 109 |
+
gated), `normalize_disguised_nulls`, `parse_currency`, `parse_number`,
|
| 110 |
+
`parse_percent` (abstains on bare values — no /100 corruption),
|
| 111 |
+
`parse_date`, `standardize_boolean`, `standardize_phone` (7-digit → `DDD-DDDD`),
|
| 112 |
+
`normalize_email`, `standardize_case`, `canonicalize_categories` (mapping-driven;
|
| 113 |
+
the verifier's subject), `flag_pii` (log-only), `mask_pii`, `hash_pii`,
|
| 114 |
+
`pseudonymize_pii`. Unknown op → no-op.
|
| 115 |
+
|
| 116 |
+
**Table ops**: `drop_empty_columns`, `drop_empty_rows`, `drop_exact_duplicates`,
|
| 117 |
+
`resolve_by_majority` (§5.3).
|
| 118 |
+
|
| 119 |
+
Op-order invariant: **`fix_encoding` must precede whitespace/punctuation ops** —
|
| 120 |
+
those ops destroy the UTF-8 byte patterns that repair needs (grader-reproduced bug; fixed in
|
| 121 |
+
both heuristic emission and union inheritance).
|
| 122 |
+
|
| 123 |
+
---
|
| 124 |
+
|
| 125 |
+
## 5. The five deterministic capabilities (what actually generalizes)
|
| 126 |
+
|
| 127 |
+
### 5.0 Plan-level verifier (WS1) — `verifier.entry_confidence`
|
| 128 |
+
Every non-grounded `canonicalize_categories` entry `raw→canon` is scored with
|
| 129 |
+
three HARD gates, each killing a measured hospital failure class:
|
| 130 |
+
- **errors are rare**: `freq(raw) ≥ 3` → 0.0 (frequent = legit data; "de kalb"×92)
|
| 131 |
+
- **repair to dominance only**: `freq(canon) < max(2, 2·freq(raw))` → 0.0
|
| 132 |
+
("yex→yexu", typo mapped to a worse typo)
|
| 133 |
+
- **code discipline**: digit-bearing values repair only if letter-part similarity
|
| 134 |
+
≥0.85 AND digits identical (allows `amix-2→ami-2`, blocks `ak_→al_`)
|
| 135 |
+
Survivors score `sim × (0.5 + 0.5·support)`; below-τ entries become review flags.
|
| 136 |
+
|
| 137 |
+
### 5.1 Suspect surfacing (visibility) — `pair_profile.suspects_for_column`
|
| 138 |
+
The 80-value profile cap structurally hides high-cardinality dirty cells from ANY
|
| 139 |
+
planner (proved by the v8/v9 retrains: more data couldn't fix what the model
|
| 140 |
+
couldn't see). Every text-ish column profile now carries ≤25 `suspect_values`:
|
| 141 |
+
rare surfaces + evidence-backed candidates (frequency dominance, edit similarity,
|
| 142 |
+
reference membership). The heuristic maps suspects clearing `entry_confidence ≥
|
| 143 |
+
SCRUBDATA_HC_TAU=0.8`; the rest become flags.
|
| 144 |
+
|
| 145 |
+
### 5.2 Generic entity reference — `reconcile.default_index`
|
| 146 |
+
Open vocabularies (ToughTables-derived ref [8 bench tables excluded], MusicBrainz
|
| 147 |
+
search-hint misspellings, RxNorm, Wikidata companies, ROR, GeoNames, OpenFlights,
|
| 148 |
+
O*NET, nicknames) serve as a pluggable reference type. Typing requires **≥20% exact
|
| 149 |
+
hits** of distinct values (fuzzy coverage alone over-fires on name-like columns —
|
| 150 |
+
measured). Cracked the all-unique regime: 5 ToughTables tables **0 → 0.955–0.957
|
| 151 |
+
F1 at 0.0000 damage** (~62k corrections) — where no in-column frequency signal
|
| 152 |
+
exists at all.
|
| 153 |
+
|
| 154 |
+
### 5.3 Cross-row majority voting — `planner.detect_entity_groups` + `resolve_by_majority`
|
| 155 |
+
Tables repeating a real-world entity across rows (flights reported by many
|
| 156 |
+
sources) carry their own repair signal. Detection: compact-token key columns,
|
| 157 |
+
median multiplicity 3–30, ≥2 votable string columns with majority-bearing
|
| 158 |
+
disagreement + ≥2 distinct majorities, date-share ≤0.3 guard. Execution: resolve
|
| 159 |
+
thin dissenting minorities to group majority; skips missing-like keys;
|
| 160 |
+
min_share/min_group clamped. **False-consensus guard**: mean minority share ≥0.25
|
| 161 |
+
→ decline (legitimate correlated updates, not reporting errors — a flat volume cap
|
| 162 |
+
was measured to destroy the legitimate regime and was replaced). Measured: flights
|
| 163 |
+
heuristic 0.044→**0.164** F1; hospital heuristic 0.092→**0.186**.
|
| 164 |
+
|
| 165 |
+
### 5.4 Convention conservatism — `detect.*_formats_consistent` + `verify_plan`
|
| 166 |
+
Never re-format an internally consistent column: date/percent ops gated on
|
| 167 |
+
dominant-shape inconsistency (digit+alpha runs collapsed, 90% rule); zip/postal
|
| 168 |
+
names are never typed as phone/date; Excel-serial typing needs a date-suggestive name.
|
| 169 |
+
Suppressed minorities surface as `off_convention_dates` flags. The verifier
|
| 170 |
+
enforces the same gates on model plans at the verification boundary (the model
|
| 171 |
+
path otherwise bypasses heuristic emission gates entirely).
|
| 172 |
+
|
| 173 |
+
---
|
| 174 |
+
|
| 175 |
+
## 6. Evaluation (how every number regenerates)
|
| 176 |
+
|
| 177 |
+
One scoring contract — `eval/run_real_multi.py::score()` — **churn-neutral,
|
| 178 |
+
convention-tolerant**: sem-equal = numeric-tolerant OR strip+casefold equal; pure
|
| 179 |
+
case/whitespace churn counts as nothing; a fix requires acting; **damage** =
|
| 180 |
+
clean cells corrupted / clean cells; **silent edits** = changed columns minus
|
| 181 |
+
log-attributed columns (must be 0).
|
| 182 |
+
|
| 183 |
+
| Harness | Command | What it measures | Current numbers |
|
| 184 |
+
|---|---|---|---|
|
| 185 |
+
| Money table | `python -m eval.run_real_multi` | 65-set suite, 3 seeds | grounded NORTH 0.224±0.004; REAL-F1 0.225 vs OR-kNN 0.058 (HEAD 2026-06-12 regen; freeze was 0.203/0.174) |
|
| 186 |
+
| WS1 gate | `python -m eval.precision_curve --plan eval/results/v6_hospital_raw_plan.json --union` | precision–coverage curve | **0.905 @ 0.413** (τ=0.5) |
|
| 187 |
+
| Paired bench | `python -m eval.paired_bench` | 42 dirty/gold pairs | unseen-35 macro F1 **0.363** @ dmg **0.0219** |
|
| 188 |
+
| Wild bench | `python -m eval.wild_bench` | 35 uncurated tables, behavioral + inject-recovery | recovery 0.207; **0 silent edits** |
|
| 189 |
+
| Trust audit | `python -m eval.gittables_audit` | 239 GitTables clean-lake | 239/239 valid, 0 crashes, 0 silent edits |
|
| 190 |
+
| Generalization | `python -m eval.generalization` | held-out-source (train: hospital/beers/movies_1 · eval: flights/rayyan/ed2_restaurants) | GEN-F1 0.058, VR 0.108, dmg 0.036 |
|
| 191 |
+
| RADAR board | `python -m eval.radar_bench` | regime boundaries by artifact type | abstains on missingness ✓; reasoning-class = frontier territory |
|
| 192 |
+
| Baselines | `eval/run_baran.py`, `modal run scripts/modal_jellyfish.py` | disclosed-protocol comparisons | Baran (oracle+20 labels) 0.811; Jellyfish-13B 0.074 |
|
| 193 |
+
| Calibration / PII | `eval.calibration`, `eval.pii_leak` | abstention quality / leak test | AURC 0.120, ECE 0.169; 0/360 residual PII |
|
| 194 |
+
|
| 195 |
+
**Eval-source discipline**: TRAIN_SOURCES["v6"]={hospital,beers,movies_1};
|
| 196 |
+
EVAL_SOURCES={flights,rayyan,ed2_restaurants}. Never crossed.
|
| 197 |
+
|
| 198 |
+
---
|
| 199 |
+
|
| 200 |
+
## 7. Model & artifacts
|
| 201 |
+
|
| 202 |
+
| Artifact | Where | Notes |
|
| 203 |
+
|---|---|---|
|
| 204 |
+
| Champion adapter | Modal volume `scrubdata-v5-adapter` `/v5_seed21` (= "v6") | survived v7–v10 challenges + GRPO |
|
| 205 |
+
| Merged model | `hf.co/ricalanis/scrubdata-qwen3-4b` | card carries the v2 finding |
|
| 206 |
+
| Q8 GGUF | `hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8` | **Q8_0 only — Q4_K_M corrupts** (Unsloth 2026.6.x); non-thinking Modelfile required (`notebooks/Modelfile`); suppress tokens 151657/151658 under transformers |
|
| 207 |
+
| Benchmark | `hf.co/datasets/ricalanis/wildclean` | 33 redistributable pairs + loaders.py for 9 license-gated + gittables250 + 10 vocabs + frozen results; first cleaning bench with damage + silent-edit accounting |
|
| 208 |
+
| Demo | `hf.co/spaces/build-small-hackathon/scrubdata` | deploy = `HfApi.upload_folder` of `git archive HEAD` — **NO GitHub auto-sync** |
|
| 209 |
+
| Paper | `docs/paper/main.tex` + `numbers.tex` | compile: `~/.local/bin/tectonic main.tex` (no pdflatex on this machine) |
|
| 210 |
+
| Vocabs | `training/harvests/*.jsonl` (15MB, 13 files) | loader falls back here for clone parity |
|
| 211 |
+
|
| 212 |
+
Modal patterns: `--detach` for anything long; results land in Modal Dicts
|
| 213 |
+
(`scrubdata-train-results`, `scrubdata-eval-v5-results`, `scrubdata-suite-results`).
|
| 214 |
+
**Budget status at domain close: ~$187 of $212 ceiling — Modal HALTED.**
|
| 215 |
+
|
| 216 |
+
---
|
| 217 |
+
|
| 218 |
+
## 8. Negative results ledger (measured, do not re-litigate)
|
| 219 |
+
|
| 220 |
+
1. **v7–v10 SFT retrains**: 109k harvested alias pairs, episode mixes, suspects
|
| 221 |
+
contract — GEN flat/worse. Mixing harvested pairs **dilutes** executor-verified
|
| 222 |
+
synthetic skill (monotonic dilution law across mix ratios; mixH 0.677).
|
| 223 |
+
2. **GRPO pilot, 3 arms** (main, KL-anchored v2, random-reward control): all
|
| 224 |
+
degrade format at 4B/LoRA/$30 scale; the control proved the drift is an RL
|
| 225 |
+
artifact (cf. "Spurious Rewards"). Published RLVR wins used real infra
|
| 226 |
+
(verl, 4×H100×40h). Episodes corpus (600, `training/build_grpo_episodes.py`) +
|
| 227 |
+
hand-rolled loop (`scripts/modal_grpo.py`) committed for a future attempt.
|
| 228 |
+
3. **Uniform verification of existing low-card mappings** (A1 per-class
|
| 229 |
+
thresholds): 0.905→0.890 — reverted.
|
| 230 |
+
4. **Strict entity-typing thresholds** (0.90/0.05): cost more than bought — reverted.
|
| 231 |
+
5. **WS2 candidate constraining composed with verifier**: 0.876 @ 0.387 < union at
|
| 232 |
+
same τ — redundant gating of the same failure class; available, off by default.
|
| 233 |
+
6. **Flat volume cap on cross-row voting**: destroyed the legitimate
|
| 234 |
+
dense-disagreement regime — replaced by the false-consensus guard.
|
| 235 |
+
7. **Frozen-gold synthetic yardstick predates the suspects prompt contract** —
|
| 236 |
+
regenerate gold before ever quoting synthetic canon_f1 again.
|
| 237 |
+
|
| 238 |
+
## 9. Known-open (graded non-blocking)
|
| 239 |
+
|
| 240 |
+
`_parse_date` per-value dayfirst; i18n name guards; mojibake fixpoint /
|
| 241 |
+
sequence-plausibility; backlog sources: CMS API, NHTSA, Canada contracts, Matelda
|
| 242 |
+
~6,670 pairs, GLEIF/USDA vocabs, WDVC-16. Reasoning-class artifacts (RADAR) are
|
| 243 |
+
explicitly out of protocol class — frontier-model territory.
|
| 244 |
+
|
| 245 |
+
## 10. Where deeper detail lives
|
| 246 |
+
|
| 247 |
+
`docs/PRODUCT.md` (trust contract) · `docs/SOTA.md` + `docs/ROADMAP_SOTA2.md`
|
| 248 |
+
(position + research map) · `docs/CAPABILITY_GRADES.md` (12-agent adversarial
|
| 249 |
+
grading + must-fix ledger) · `docs/WILD_BENCH.md` / `docs/PAIRED_BENCH.md` /
|
| 250 |
+
`docs/GITTABLES_AUDIT.md` / `docs/DATASETS.md` (per-bench detail + licenses) ·
|
| 251 |
+
`docs/NIGHT_LOG.md` (stage-3 timeline) · `project-memory/` (agent memory snapshot).
|
docs/WILD_BENCH.md
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Wild Bench — can the shipped system clean real-world tables?
|
| 2 |
+
|
| 3 |
+
Behavioral audit + seeded inject-recovery per dataset (`eval/wild_bench.py`).
|
| 4 |
+
|
| 5 |
+
| dataset | domain | rows×cols | valid | changes | flags | PII | silent | typo | ocr | case | ws | mean |
|
| 6 |
+
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
| 7 |
+
| airlines | aviation | 56×8 | ✓ | 413 | 1 | 1 | 0 | — | — | — | — | — |
|
| 8 |
+
| billboard | music-billboard | 317×83 | ✓ | 36222 | 3 | 2 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
|
| 9 |
+
| acnc_charities | nonprofits-au | 800×69 | ✓ | 43268 | 4 | 1 | 0 | 0.00 | 0.00 | 0.01 | 0.01 | 0.01 |
|
| 10 |
+
| open_food_facts | food-products | 800×211 | ✓ | 27115 | 34 | 5 | 0 | 0.02 | 0.02 | 0.02 | 0.03 | 0.02 |
|
| 11 |
+
| biz_sf | sf-business | 800×38 | ✓ | 8060 | 12 | 1 | 0 | 0.02 | 0.05 | 0.02 | 0.07 | 0.04 |
|
| 12 |
+
| irs_eo1 | nonprofits-us | 800×28 | ✓ | 16953 | 5 | 3 | 0 | 0.04 | 0.03 | 0.03 | 0.15 | 0.06 |
|
| 13 |
+
| permits_nyc | construction | 800×60 | ✓ | 16762 | 25 | 3 | 0 | 0.03 | 0.04 | 0.04 | 0.13 | 0.06 |
|
| 14 |
+
| pawnbrokers_nyc | business | 800×31 | ✓ | 8494 | 8 | 2 | 0 | 0.06 | 0.08 | 0.05 | 0.11 | 0.08 |
|
| 15 |
+
| proptax_sf | real-estate | 800×46 | ✓ | 9302 | 3 | 3 | 0 | 0.06 | 0.06 | 0.07 | 0.12 | 0.08 |
|
| 16 |
+
| biz_chicago | business-licenses | 800×37 | ✓ | 12808 | 9 | 2 | 0 | 0.05 | 0.06 | 0.06 | 0.15 | 0.08 |
|
| 17 |
+
| permits_seattle | seattle-permits | 800×40 | ✓ | 6878 | 9 | 2 | 0 | 0.08 | 0.13 | 0.09 | 0.14 | 0.11 |
|
| 18 |
+
| restaurants_nyc | restaurants | 800×27 | ✓ | 7742 | 6 | 4 | 0 | 0.07 | 0.08 | 0.09 | 0.20 | 0.11 |
|
| 19 |
+
| titanic | passengers | 800×12 | ✓ | 5722 | 1 | 0 | 0 | 0.00 | 0.00 | 0.09 | 0.40 | 0.12 |
|
| 20 |
+
| biz_la | la-business | 800×16 | ✓ | 2726 | 9 | 3 | 0 | 0.15 | 0.09 | 0.10 | 0.21 | 0.14 |
|
| 21 |
+
| schools_nyc | education | 800×41 | ✓ | 14387 | 7 | 5 | 0 | 0.08 | 0.14 | 0.12 | 0.22 | 0.14 |
|
| 22 |
+
| online_retail | ecommerce-uk | 800×8 | ✓ | 3413 | 1 | 0 | 0 | 0.26 | 0.01 | 0.01 | 0.30 | 0.14 |
|
| 23 |
+
| film_nyc | film | 800×14 | ✓ | 3049 | 3 | 0 | 0 | 0.14 | 0.16 | 0.11 | 0.23 | 0.16 |
|
| 24 |
+
| salary_survey | survey | 800×18 | ✓ | 4142 | 5 | 0 | 0 | 0.12 | 0.20 | 0.13 | 0.26 | 0.18 |
|
| 25 |
+
| restaurants_sf | sf-restaurants | 800×22 | ✓ | 6002 | 6 | 2 | 0 | 0.15 | 0.15 | 0.16 | 0.26 | 0.18 |
|
| 26 |
+
| alcohol_tx | alcohol-bars | 800×24 | ✓ | 8518 | 9 | 1 | 0 | 0.14 | 0.09 | 0.17 | 0.38 | 0.20 |
|
| 27 |
+
| contractors_chi | contractors | 800×116 | ✓ | 20213 | 22 | 2 | 0 | 0.17 | 0.20 | 0.16 | 0.33 | 0.21 |
|
| 28 |
+
| fhv_nyc | transport | 800×23 | ✓ | 3789 | 4 | 2 | 0 | 0.10 | 0.30 | 0.14 | 0.36 | 0.23 |
|
| 29 |
+
| uk_price_paid | real-estate-uk | 800×16 | ✓ | 1662 | 8 | 0 | 0 | 0.14 | 0.17 | 0.26 | 0.42 | 0.25 |
|
| 30 |
+
| food_chicago | food-inspections | 800×17 | ✓ | 2790 | 6 | 0 | 0 | 0.17 | 0.25 | 0.23 | 0.38 | 0.26 |
|
| 31 |
+
| bx_books | books | 800×8 | ✓ | 1650 | 3 | 1 | 0 | 0.22 | 0.22 | 0.16 | 0.51 | 0.28 |
|
| 32 |
+
| bl_flickr_books | library | 800×15 | ✓ | 1769 | 6 | 1 | 0 | 0.19 | 0.28 | 0.22 | 0.43 | 0.28 |
|
| 33 |
+
| svc311_nyc | complaints | 800×44 | ✓ | 6299 | 16 | 2 | 0 | 0.23 | 0.30 | 0.23 | 0.37 | 0.28 |
|
| 34 |
+
| spotify | music | 800×23 | ✓ | 4669 | 3 | 1 | 0 | 0.20 | 0.28 | 0.30 | 0.36 | 0.28 |
|
| 35 |
+
| glassdoor_jobs | job-listings | 800×14 | ✓ | 1713 | 6 | 0 | 0 | 0.20 | 0.29 | 0.22 | 0.43 | 0.29 |
|
| 36 |
+
| ct_real_estate | real-estate-us | 800×14 | ✓ | 4840 | 4 | 0 | 0 | 0.23 | 0.29 | 0.24 | 0.40 | 0.29 |
|
| 37 |
+
| worldcities | geography | 800×4 | ✓ | 914 | 2 | 0 | 0 | 0.41 | 0.11 | 0.22 | 0.69 | 0.36 |
|
| 38 |
+
| fec_indiv80 | political-finance | 800×21 | ✓ | 4375 | 4 | 2 | 0 | 0.20 | 0.24 | 0.35 | 0.87 | 0.41 |
|
| 39 |
+
| payroll_nyc | jobs | 800×17 | ✓ | 4587 | 3 | 2 | 0 | 0.45 | 0.56 | 0.42 | 0.73 | 0.54 |
|
| 40 |
+
| paris_trees | urban-fr | 800×16 | ✓ | 3305 | 5 | 1 | 0 | 0.43 | 0.54 | 0.55 | 0.73 | 0.56 |
|
| 41 |
+
| ev_wa | vehicles | 800×16 | ✓ | 4085 | 5 | 2 | 0 | 0.50 | 0.56 | 0.48 | 0.91 | 0.61 |
|
docs/assets/space_landing.png
ADDED
|
Git LFS Details
|
docs/assets/space_results.png
ADDED
|
Git LFS Details
|
docs/paper/fig_label_curve.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:70a409349a89aa8681a6b0a4f47a68405a9391fad562bfb4b5a0de9ec573ab74
|
| 3 |
+
size 19327
|
docs/paper/fig_label_curve.png
ADDED
|
Git LFS Details
|
docs/paper/fig_precision_coverage.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b4d65c23406f21bcb82e8054cecc95d40ceb41cd08096726b85db5430cdae4a2
|
| 3 |
+
size 19440
|
docs/paper/fig_precision_coverage.png
ADDED
|
Git LFS Details
|
docs/paper/fig_risk_coverage.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e11e0af912e0ce66da8c3f407732d62afe4c222fbc2af8e541467d4bf5f73bce
|
| 3 |
+
size 18227
|
docs/paper/fig_risk_coverage.png
ADDED
|
Git LFS Details
|
docs/paper/main.aux
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
\relax
|
| 2 |
+
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{}\protected@file@percent }
|
| 3 |
+
\citation{raha}
|
| 4 |
+
\citation{holoclean}
|
| 5 |
+
\citation{garf}
|
| 6 |
+
\citation{wrangle}
|
| 7 |
+
\citation{jellyfish}
|
| 8 |
+
\citation{tablegpt}
|
| 9 |
+
\citation{retclean}
|
| 10 |
+
\citation{turl}
|
| 11 |
+
\citation{tablellama}
|
| 12 |
+
\citation{belotti}
|
| 13 |
+
\citation{racoon}
|
| 14 |
+
\citation{mtab}
|
| 15 |
+
\@writefile{toc}{\contentsline {section}{\numberline {2}Related Work}{2}{}\protected@file@percent }
|
| 16 |
+
\newlabel{sec:related}{{2}{2}}
|
| 17 |
+
\citation{selective}
|
| 18 |
+
\citation{openmed}
|
| 19 |
+
\@writefile{toc}{\contentsline {section}{\numberline {3}Method}{3}{}\protected@file@percent }
|
| 20 |
+
\newlabel{sec:method}{{3}{3}}
|
| 21 |
+
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Planner / executor decomposition}{3}{}\protected@file@percent }
|
| 22 |
+
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Execution-verified synthetic supervision}{3}{}\protected@file@percent }
|
| 23 |
+
\newlabel{sec:sft}{{3.2}{3}}
|
| 24 |
+
\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Reference-grounded canonicalization with abstention}{3}{}\protected@file@percent }
|
| 25 |
+
\newlabel{sec:grounding}{{3.3}{3}}
|
| 26 |
+
\@writefile{toc}{\contentsline {subsection}{\numberline {3.4}PII as a second task instance}{4}{}\protected@file@percent }
|
| 27 |
+
\newlabel{sec:pii}{{3.4}{4}}
|
| 28 |
+
\@writefile{toc}{\contentsline {section}{\numberline {4}Evaluation Design}{4}{}\protected@file@percent }
|
| 29 |
+
\newlabel{sec:eval}{{4}{4}}
|
| 30 |
+
\@writefile{toc}{\contentsline {section}{\numberline {5}Results}{4}{}\protected@file@percent }
|
| 31 |
+
\newlabel{sec:results}{{5}{4}}
|
| 32 |
+
\@writefile{toc}{\contentsline {subsection}{\numberline {5.1}Small fine-tuned planner vs.\ large generic model}{4}{}\protected@file@percent }
|
| 33 |
+
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Wide-suite comparison, 3 injection seeds, churn-neutral metric. NORTH is the double-macro harmonic mean; REAL-F1 is the real-error slice. (Filled from the final run.)}}{5}{}\protected@file@percent }
|
| 34 |
+
\newlabel{tab:money}{{1}{5}}
|
| 35 |
+
\@writefile{toc}{\contentsline {subsection}{\numberline {5.2}Grounding vs.\ clustering}{5}{}\protected@file@percent }
|
| 36 |
+
\@writefile{toc}{\contentsline {subsection}{\numberline {5.3}Ablations}{5}{}\protected@file@percent }
|
| 37 |
+
\@writefile{toc}{\contentsline {subsection}{\numberline {5.4}Calibration of abstention}{5}{}\protected@file@percent }
|
| 38 |
+
\newlabel{sec:calibration}{{5.4}{5}}
|
| 39 |
+
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Risk--coverage for grounded city reconciliation (650 probes). Operating points annotated; the confidence supports thresholded abstention.}}{6}{}\protected@file@percent }
|
| 40 |
+
\newlabel{fig:rc}{{1}{6}}
|
| 41 |
+
\@writefile{toc}{\contentsline {subsection}{\numberline {5.5}PII transfer and detection}{6}{}\protected@file@percent }
|
| 42 |
+
\@writefile{toc}{\contentsline {section}{\numberline {6}Limitations}{6}{}\protected@file@percent }
|
| 43 |
+
\@writefile{toc}{\contentsline {section}{\numberline {7}Conclusion}{6}{}\protected@file@percent }
|
| 44 |
+
\bibcite{raha}{{1}{}{{}}{{}}}
|
| 45 |
+
\bibcite{holoclean}{{2}{}{{}}{{}}}
|
| 46 |
+
\bibcite{garf}{{3}{}{{}}{{}}}
|
| 47 |
+
\bibcite{wrangle}{{4}{}{{}}{{}}}
|
| 48 |
+
\bibcite{jellyfish}{{5}{}{{}}{{}}}
|
| 49 |
+
\bibcite{tablegpt}{{6}{}{{}}{{}}}
|
| 50 |
+
\bibcite{retclean}{{7}{}{{}}{{}}}
|
| 51 |
+
\bibcite{turl}{{8}{}{{}}{{}}}
|
| 52 |
+
\bibcite{tablellama}{{9}{}{{}}{{}}}
|
| 53 |
+
\bibcite{belotti}{{10}{}{{}}{{}}}
|
| 54 |
+
\bibcite{racoon}{{11}{}{{}}{{}}}
|
| 55 |
+
\bibcite{mtab}{{12}{}{{}}{{}}}
|
| 56 |
+
\bibcite{selective}{{13}{}{{}}{{}}}
|
| 57 |
+
\bibcite{openmed}{{14}{}{{}}{{}}}
|
| 58 |
+
\providecommand\NAT@force@numbers{}\NAT@force@numbers
|
| 59 |
+
\gdef \@abspage@last{7}
|
docs/paper/main.log
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
**
|
| 2 |
+
(main.tex
|
| 3 |
+
LaTeX2e <2021-11-15> patch level 1
|
| 4 |
+
L3 programming layer <2022-02-24> (article.cls
|
| 5 |
+
Document Class: article 2021/10/04 v1.4n Standard LaTeX document class
|
| 6 |
+
(size11.clo
|
| 7 |
+
File: size11.clo 2021/10/04 v1.4n Standard LaTeX file (size option)
|
| 8 |
+
)
|
| 9 |
+
\c@part=\count181
|
| 10 |
+
\c@section=\count182
|
| 11 |
+
\c@subsection=\count183
|
| 12 |
+
\c@subsubsection=\count184
|
| 13 |
+
\c@paragraph=\count185
|
| 14 |
+
\c@subparagraph=\count186
|
| 15 |
+
\c@figure=\count187
|
| 16 |
+
\c@table=\count188
|
| 17 |
+
\abovecaptionskip=\skip47
|
| 18 |
+
\belowcaptionskip=\skip48
|
| 19 |
+
\bibindent=\dimen138
|
| 20 |
+
) (geometry.sty
|
| 21 |
+
Package: geometry 2020/01/02 v5.9 Page Geometry
|
| 22 |
+
(keyval.sty
|
| 23 |
+
Package: keyval 2014/10/28 v1.15 key=value parser (DPC)
|
| 24 |
+
\KV@toks@=\toks16
|
| 25 |
+
) (ifvtex.sty
|
| 26 |
+
Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead.
|
| 27 |
+
(iftex.sty
|
| 28 |
+
Package: iftex 2022/02/03 v1.0f TeX engine tests
|
| 29 |
+
))
|
| 30 |
+
\Gm@cnth=\count189
|
| 31 |
+
\Gm@cntv=\count190
|
| 32 |
+
\c@Gm@tempcnt=\count191
|
| 33 |
+
\Gm@bindingoffset=\dimen139
|
| 34 |
+
\Gm@wd@mp=\dimen140
|
| 35 |
+
\Gm@odd@mp=\dimen141
|
| 36 |
+
\Gm@even@mp=\dimen142
|
| 37 |
+
\Gm@layoutwidth=\dimen143
|
| 38 |
+
\Gm@layoutheight=\dimen144
|
| 39 |
+
\Gm@layouthoffset=\dimen145
|
| 40 |
+
\Gm@layoutvoffset=\dimen146
|
| 41 |
+
\Gm@dimlist=\toks17
|
| 42 |
+
) (amsmath.sty
|
| 43 |
+
Package: amsmath 2021/10/15 v2.17l AMS math features
|
| 44 |
+
\@mathmargin=\skip49
|
| 45 |
+
For additional information on amsmath, use the `?' option.
|
| 46 |
+
(amstext.sty
|
| 47 |
+
Package: amstext 2021/08/26 v2.01 AMS text
|
| 48 |
+
(amsgen.sty
|
| 49 |
+
File: amsgen.sty 1999/11/30 v2.0 generic functions
|
| 50 |
+
\@emptytoks=\toks18
|
| 51 |
+
\ex@=\dimen147
|
| 52 |
+
)) (amsbsy.sty
|
| 53 |
+
Package: amsbsy 1999/11/29 v1.2d Bold Symbols
|
| 54 |
+
\pmbraise@=\dimen148
|
| 55 |
+
) (amsopn.sty
|
| 56 |
+
Package: amsopn 2021/08/26 v2.02 operator names
|
| 57 |
+
)
|
| 58 |
+
\inf@bad=\count192
|
| 59 |
+
LaTeX Info: Redefining \frac on input line 234.
|
| 60 |
+
\uproot@=\count193
|
| 61 |
+
\leftroot@=\count194
|
| 62 |
+
LaTeX Info: Redefining \overline on input line 399.
|
| 63 |
+
\classnum@=\count195
|
| 64 |
+
\DOTSCASE@=\count196
|
| 65 |
+
LaTeX Info: Redefining \ldots on input line 496.
|
| 66 |
+
LaTeX Info: Redefining \dots on input line 499.
|
| 67 |
+
LaTeX Info: Redefining \cdots on input line 620.
|
| 68 |
+
\Mathstrutbox@=\box50
|
| 69 |
+
\strutbox@=\box51
|
| 70 |
+
\big@size=\dimen149
|
| 71 |
+
LaTeX Font Info: Redeclaring font encoding OML on input line 743.
|
| 72 |
+
LaTeX Font Info: Redeclaring font encoding OMS on input line 744.
|
| 73 |
+
\macc@depth=\count197
|
| 74 |
+
\c@MaxMatrixCols=\count198
|
| 75 |
+
\dotsspace@=\muskip16
|
| 76 |
+
\c@parentequation=\count199
|
| 77 |
+
\dspbrk@lvl=\count266
|
| 78 |
+
\tag@help=\toks19
|
| 79 |
+
\row@=\count267
|
| 80 |
+
\column@=\count268
|
| 81 |
+
\maxfields@=\count269
|
| 82 |
+
\andhelp@=\toks20
|
| 83 |
+
\eqnshift@=\dimen150
|
| 84 |
+
\alignsep@=\dimen151
|
| 85 |
+
\tagshift@=\dimen152
|
| 86 |
+
\tagwidth@=\dimen153
|
| 87 |
+
\totwidth@=\dimen154
|
| 88 |
+
\lineht@=\dimen155
|
| 89 |
+
\@envbody=\toks21
|
| 90 |
+
\multlinegap=\skip50
|
| 91 |
+
\multlinetaggap=\skip51
|
| 92 |
+
\mathdisplay@stack=\toks22
|
| 93 |
+
LaTeX Info: Redefining \[ on input line 2938.
|
| 94 |
+
LaTeX Info: Redefining \] on input line 2939.
|
| 95 |
+
) (amssymb.sty
|
| 96 |
+
Package: amssymb 2013/01/14 v3.01 AMS font symbols
|
| 97 |
+
|
| 98 |
+
(amsfonts.sty
|
| 99 |
+
Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support
|
| 100 |
+
\symAMSa=\mathgroup4
|
| 101 |
+
\symAMSb=\mathgroup5
|
| 102 |
+
LaTeX Font Info: Redeclaring math symbol \hbar on input line 98.
|
| 103 |
+
LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold'
|
| 104 |
+
(Font) U/euf/m/n --> U/euf/b/n on input line 106.
|
| 105 |
+
)) (booktabs.sty
|
| 106 |
+
Package: booktabs 2020/01/12 v1.61803398 Publication quality tables
|
| 107 |
+
\heavyrulewidth=\dimen156
|
| 108 |
+
\lightrulewidth=\dimen157
|
| 109 |
+
\cmidrulewidth=\dimen158
|
| 110 |
+
\belowrulesep=\dimen159
|
| 111 |
+
\belowbottomsep=\dimen160
|
| 112 |
+
\aboverulesep=\dimen161
|
| 113 |
+
\abovetopsep=\dimen162
|
| 114 |
+
\cmidrulesep=\dimen163
|
| 115 |
+
\cmidrulekern=\dimen164
|
| 116 |
+
\defaultaddspace=\dimen165
|
| 117 |
+
\@cmidla=\count270
|
| 118 |
+
\@cmidlb=\count271
|
| 119 |
+
\@aboverulesep=\dimen166
|
| 120 |
+
\@belowrulesep=\dimen167
|
| 121 |
+
\@thisruleclass=\count272
|
| 122 |
+
\@lastruleclass=\count273
|
| 123 |
+
\@thisrulewidth=\dimen168
|
| 124 |
+
) (graphicx.sty
|
| 125 |
+
Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR)
|
| 126 |
+
(graphics.sty
|
| 127 |
+
Package: graphics 2021/03/04 v1.4d Standard LaTeX Graphics (DPC,SPQR)
|
| 128 |
+
(trig.sty
|
| 129 |
+
Package: trig 2021/08/11 v1.11 sin cos tan (DPC)
|
| 130 |
+
)
|
| 131 |
+
(graphics.cfg
|
| 132 |
+
File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration
|
| 133 |
+
)
|
| 134 |
+
Package graphics Info: Driver file: xetex.def on input line 107.
|
| 135 |
+
(xetex.def
|
| 136 |
+
File: xetex.def 2021/03/18 v5.0k Graphics/color driver for xetex
|
| 137 |
+
))
|
| 138 |
+
\Gin@req@height=\dimen169
|
| 139 |
+
\Gin@req@width=\dimen170
|
| 140 |
+
) (url.sty
|
| 141 |
+
\Urlmuskip=\muskip17
|
| 142 |
+
Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc.
|
| 143 |
+
) (xcolor.sty
|
| 144 |
+
Package: xcolor 2021/10/31 v2.13 LaTeX color extensions (UK)
|
| 145 |
+
(color.cfg
|
| 146 |
+
File: color.cfg 2016/01/02 v1.6 sample color configuration
|
| 147 |
+
)
|
| 148 |
+
Package xcolor Info: Driver file: xetex.def on input line 227.
|
| 149 |
+
Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1352.
|
| 150 |
+
Package xcolor Info: Model `RGB' extended on input line 1368.
|
| 151 |
+
Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1370.
|
| 152 |
+
Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1371.
|
| 153 |
+
Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1372.
|
| 154 |
+
Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1373.
|
| 155 |
+
Package xcolor Info: Model `Gray' substituted by `gray' on input line 1374.
|
| 156 |
+
Package xcolor Info: Model `wave' substituted by `hsb' on input line 1375.
|
| 157 |
+
) (natbib.sty
|
| 158 |
+
Package: natbib 2010/09/13 8.31b (PWD, AO)
|
| 159 |
+
\bibhang=\skip52
|
| 160 |
+
\bibsep=\skip53
|
| 161 |
+
LaTeX Info: Redefining \cite on input line 694.
|
| 162 |
+
\c@NAT@ctr=\count274
|
| 163 |
+
)
|
| 164 |
+
(numbers) (l3backend-xetex.def
|
| 165 |
+
File: l3backend-xetex.def 2022-02-07 L3 backend support: XeTeX
|
| 166 |
+
\c__kernel_sys_dvipdfmx_version_int=\count275
|
| 167 |
+
\l__color_backend_stack_int=\count276
|
| 168 |
+
\g__color_backend_stack_int=\count277
|
| 169 |
+
\g__graphics_track_int=\count278
|
| 170 |
+
\l__pdf_internal_box=\box52
|
| 171 |
+
\g__pdf_backend_object_int=\count279
|
| 172 |
+
\g__pdf_backend_annotation_int=\count280
|
| 173 |
+
\g__pdf_backend_link_int=\count281
|
| 174 |
+
) (main.aux)
|
| 175 |
+
\openout1 = `main.aux'.
|
| 176 |
+
|
| 177 |
+
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 16.
|
| 178 |
+
LaTeX Font Info: ... okay on input line 16.
|
| 179 |
+
LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 16.
|
| 180 |
+
LaTeX Font Info: ... okay on input line 16.
|
| 181 |
+
LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 16.
|
| 182 |
+
LaTeX Font Info: ... okay on input line 16.
|
| 183 |
+
LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 16.
|
| 184 |
+
LaTeX Font Info: ... okay on input line 16.
|
| 185 |
+
LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 16.
|
| 186 |
+
LaTeX Font Info: Trying to load font information for TS1+cmr on input line 1
|
| 187 |
+
6.
|
| 188 |
+
(ts1cmr.fd
|
| 189 |
+
File: ts1cmr.fd 2019/12/16 v2.5j Standard LaTeX font definitions
|
| 190 |
+
)
|
| 191 |
+
LaTeX Font Info: ... okay on input line 16.
|
| 192 |
+
LaTeX Font Info: Checking defaults for TU/lmr/m/n on input line 16.
|
| 193 |
+
LaTeX Font Info: ... okay on input line 16.
|
| 194 |
+
LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 16.
|
| 195 |
+
LaTeX Font Info: ... okay on input line 16.
|
| 196 |
+
LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 16.
|
| 197 |
+
LaTeX Font Info: ... okay on input line 16.
|
| 198 |
+
|
| 199 |
+
*geometry* driver: auto-detecting
|
| 200 |
+
*geometry* detected driver: xetex
|
| 201 |
+
*geometry* verbose mode - [ preamble ] result:
|
| 202 |
+
* driver: xetex
|
| 203 |
+
* paper: <default>
|
| 204 |
+
* layout: <same size as paper>
|
| 205 |
+
* layoutoffset:(h,v)=(0.0pt,0.0pt)
|
| 206 |
+
* modes:
|
| 207 |
+
* h-part:(L,W,R)=(72.26999pt, 469.75502pt, 72.26999pt)
|
| 208 |
+
* v-part:(T,H,B)=(72.26999pt, 650.43001pt, 72.26999pt)
|
| 209 |
+
* \paperwidth=614.295pt
|
| 210 |
+
* \paperheight=794.96999pt
|
| 211 |
+
* \textwidth=469.75502pt
|
| 212 |
+
* \textheight=650.43001pt
|
| 213 |
+
* \oddsidemargin=0.0pt
|
| 214 |
+
* \evensidemargin=0.0pt
|
| 215 |
+
* \topmargin=-37.0pt
|
| 216 |
+
* \headheight=12.0pt
|
| 217 |
+
* \headsep=25.0pt
|
| 218 |
+
* \topskip=11.0pt
|
| 219 |
+
* \footskip=30.0pt
|
| 220 |
+
* \marginparwidth=59.0pt
|
| 221 |
+
* \marginparsep=10.0pt
|
| 222 |
+
* \columnsep=10.0pt
|
| 223 |
+
* \skip\footins=10.0pt plus 4.0pt minus 2.0pt
|
| 224 |
+
* \hoffset=0.0pt
|
| 225 |
+
* \voffset=0.0pt
|
| 226 |
+
* \mag=1000
|
| 227 |
+
* \@twocolumnfalse
|
| 228 |
+
* \@twosidefalse
|
| 229 |
+
* \@mparswitchfalse
|
| 230 |
+
* \@reversemarginfalse
|
| 231 |
+
* (1in=72.27pt=25.4mm, 1cm=28.453pt)
|
| 232 |
+
|
| 233 |
+
LaTeX Font Info: Trying to load font information for U+msa on input line 17.
|
| 234 |
+
|
| 235 |
+
(umsa.fd
|
| 236 |
+
File: umsa.fd 2013/01/14 v3.01 AMS symbols A
|
| 237 |
+
)
|
| 238 |
+
LaTeX Font Info: Trying to load font information for U+msb on input line 17.
|
| 239 |
+
|
| 240 |
+
(umsb.fd
|
| 241 |
+
File: umsb.fd 2013/01/14 v3.01 AMS symbols B
|
| 242 |
+
) [1
|
| 243 |
+
|
| 244 |
+
]
|
| 245 |
+
|
| 246 |
+
LaTeX Font Warning: Font shape `TU/lmr/bx/sc' undefined
|
| 247 |
+
(Font) using `TU/lmr/bx/n' instead on input line 92.
|
| 248 |
+
|
| 249 |
+
[2] [3] [4] [5] [6]
|
| 250 |
+
File: fig_precision_coverage.pdf Graphic file (type pdf)
|
| 251 |
+
<use fig_precision_coverage.pdf>
|
| 252 |
+
[7] [8] [9] [10] [11]
|
| 253 |
+
File: fig_label_curve.pdf Graphic file (type pdf)
|
| 254 |
+
<use fig_label_curve.pdf>
|
| 255 |
+
[12]
|
| 256 |
+
File: fig_risk_coverage.pdf Graphic file (type pdf)
|
| 257 |
+
<use fig_risk_coverage.pdf>
|
| 258 |
+
[13]
|
| 259 |
+
Underfull \hbox (badness 10000) in paragraph at lines 842--852
|
| 260 |
+
\TU/lmr/m/n/10.95 The model weights are public: $[][][][][] [] [] [] [][][][][]
|
| 261 |
+
[][][][][][] [] [][] [] [][][][][][][][][] []
|
| 262 |
+
[]
|
| 263 |
+
|
| 264 |
+
[14] [15] [16] [17] (main.aux)
|
| 265 |
+
|
| 266 |
+
LaTeX Font Warning: Some font shapes were not available, defaults substituted.
|
| 267 |
+
|
| 268 |
+
)
|
| 269 |
+
Output written on main.xdv (17 pages, 553908 bytes).
|
docs/paper/main.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:afcb564ae43329b0a7174b676446fe1204146968d9ed9a22426ab82454039e70
|
| 3 |
+
size 201091
|
docs/paper/main.tex
ADDED
|
@@ -0,0 +1,1021 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
\documentclass[11pt]{article}
|
| 2 |
+
\usepackage[utf8]{inputenc} % no-op on TeXLive >= 2018 (arXiv pdflatex); explicit for safety
|
| 3 |
+
\usepackage[margin=1in]{geometry}
|
| 4 |
+
\usepackage{amsmath,amssymb}
|
| 5 |
+
\usepackage{booktabs}
|
| 6 |
+
\usepackage{graphicx}
|
| 7 |
+
\usepackage{url}
|
| 8 |
+
\usepackage{xcolor}
|
| 9 |
+
\usepackage[numbers]{natbib}
|
| 10 |
+
\input{numbers}
|
| 11 |
+
|
| 12 |
+
\title{Verified Cleaning Plans: Plan-Level Selective Prediction Turns Local LLM
|
| 13 |
+
Planners into Trustworthy Table Cleaners}
|
| 14 |
+
\author{Ricardo Alanis\\ \small{\texttt{ricardo.alanis@gmail.com}}}
|
| 15 |
+
\date{June 2026}
|
| 16 |
+
|
| 17 |
+
\begin{document}
|
| 18 |
+
\maketitle
|
| 19 |
+
|
| 20 |
+
\begin{abstract}
|
| 21 |
+
Cleaning messy tabular data---particularly \emph{canonicalization}, the merging of
|
| 22 |
+
inconsistent surface forms such as \texttt{USA}/\texttt{U.S.A}/\texttt{united states}
|
| 23 |
+
into one canonical value---resists rule-based automation and is routinely done by hand.
|
| 24 |
+
We present ScrubData, an architecture built around a trust contract. A local LLM
|
| 25 |
+
\emph{planner} reads an aggregated column profile (per-value frequency counts,
|
| 26 |
+
invariant to row count) and \emph{proposes} a structured JSON cleaning plan; a
|
| 27 |
+
deterministic executor \emph{applies} it, making every change auditable and reversible;
|
| 28 |
+
and \emph{plan-level selective prediction} --- a deterministic verifier that scores
|
| 29 |
+
every proposed mapping and drops low-confidence entries to review flags --- extends
|
| 30 |
+
abstention from cell-level confidence to the plan itself. The verified union of the
|
| 31 |
+
gated model plan with a reference-grounded heuristic is the architecture's operating
|
| 32 |
+
point: a zero-configuration, zero-label system that repairs 41\% of the hospital
|
| 33 |
+
benchmark's \hospErrors{} real errors at \unionGatePrec{} precision (strongest of three training
|
| 34 |
+
seeds; 3-seed mean \unionGateThreeSeedPrec{} at \unionGateThreeSeedCov{} coverage,
|
| 35 |
+
$\pm$ denotes the 95\% CI),
|
| 36 |
+
with every declined merge surfaced for review. Four deterministic capabilities ---
|
| 37 |
+
profile-level \emph{suspect surfacing} for high-cardinality columns, reconciliation
|
| 38 |
+
against a pluggable \emph{entity reference} built from open vocabularies,
|
| 39 |
+
\emph{cross-row majority voting} over repeated-entity groups, and
|
| 40 |
+
\emph{convention-conservatism} gates --- carry the system to never-seen tables:
|
| 41 |
+
macro F1 \unseenMacroF{} at \unseenMacroDamage{} damage over the 35 unseen-source
|
| 42 |
+
pairs of a \nPairs-pair benchmark, and \emph{zero silent edits} across \nWild{} wild
|
| 43 |
+
tables plus a \nTrust-table trust audit, released together as the \textsc{WildClean}
|
| 44 |
+
benchmark.
|
| 45 |
+
|
| 46 |
+
Finally, we report where the capability lives. \emph{Execution-verified} synthetic
|
| 47 |
+
supervision --- a training example is kept only if executing its plan provably
|
| 48 |
+
recovers the known-clean table --- buys the 4B fine-tune real in-distribution skill
|
| 49 |
+
and the most precise gated planner at usable coverage (\modelGatePrec{} precision at
|
| 50 |
+
\modelGateCov{} coverage); but five further retrains and a three-arm GRPO pilot leave
|
| 51 |
+
held-out generalization statistically bounded (TOST against a pre-registered margin),
|
| 52 |
+
while two of three zero-shot 24--31B open-weights planners (devstral-24B, gemma4-31B)
|
| 53 |
+
dropped into the \emph{identical} harness exceed the fine-tune's operating point
|
| 54 |
+
(\scalePrecBig{} precision at \scaleCovBig{} coverage) with no task training. The
|
| 55 |
+
architecture is planner-agnostic: it converts capability gains into trustworthy
|
| 56 |
+
operating points without retraining. The shipped system runs entirely locally on commodity hardware;
|
| 57 |
+
no data leaves the machine (the scaling-arm planners were measured via hosted
|
| 58 |
+
endpoints; all are locally deployable open weights).
|
| 59 |
+
\end{abstract}
|
| 60 |
+
|
| 61 |
+
\section{Introduction}
|
| 62 |
+
|
| 63 |
+
A large share of practical data work is cleaning: a sales export where the same country
|
| 64 |
+
is spelled four ways, a hospital roster where \texttt{birminghxm} should be
|
| 65 |
+
\texttt{birmingham}, a CRM dump with mixed date formats and duplicated contacts. The
|
| 66 |
+
fuzzy half of this work---recognizing that distinct surface forms denote the same
|
| 67 |
+
entity---is exactly what rules do poorly and humans do slowly.
|
| 68 |
+
|
| 69 |
+
Large language models can do this fuzzy matching, but deploying them as cell editors has
|
| 70 |
+
three problems. First, \emph{trust}: a model that edits cells directly can silently
|
| 71 |
+
corrupt data, and its errors are unauditable. Second, \emph{cost and privacy}: shipping
|
| 72 |
+
every row of a private table to a hosted frontier model is expensive and often
|
| 73 |
+
unacceptable. Third, \emph{hallucination}: asked for a canonical form, a generative model
|
| 74 |
+
will invent one, and on tail entities it will invent wrong ones.
|
| 75 |
+
|
| 76 |
+
ScrubData addresses all three with an architecture in which the model never touches
|
| 77 |
+
data. A profiler aggregates each column into a value-frequency distribution; a small
|
| 78 |
+
local model reads the profile and \emph{proposes} a JSON cleaning plan; a deterministic
|
| 79 |
+
pandas executor \emph{applies} it. The plan is the complete, inspectable, reversible
|
| 80 |
+
specification of every change---there are no silent edits by construction
|
| 81 |
+
(\S\ref{sec:method}). Because the prompt scales with the number of \emph{distinct}
|
| 82 |
+
values rather than rows, a million-row table profiles like a hundred-row one.
|
| 83 |
+
|
| 84 |
+
This paper makes five contributions:
|
| 85 |
+
\begin{enumerate}
|
| 86 |
+
\item \textbf{A planner/executor decomposition with plan-level selective prediction}:
|
| 87 |
+
the model proposes, a deterministic engine executes with full lineage, and a
|
| 88 |
+
deterministic verifier gates every proposed mapping, extending abstention to the plan
|
| 89 |
+
itself. The verified union of the gated model plan with a reference-grounded
|
| 90 |
+
heuristic repairs 41\% of the hospital benchmark's \hospErrors{} real errors at \unionGatePrec{}
|
| 91 |
+
precision with zero configuration and zero labels (\S\ref{sec:method},
|
| 92 |
+
\S\ref{sec:verifier}, \S\ref{sec:ws1results}).
|
| 93 |
+
\item \textbf{\textsc{WildClean} and an un-gameable evaluation}: a 65-dataset suite
|
| 94 |
+
(real-error benchmarks plus seeded error injection over 15 harvested open-data
|
| 95 |
+
domains) scored with a churn-neutral, convention-tolerant metric that cannot be
|
| 96 |
+
inflated by mass rewriting, with damage and silent edits scored alongside repair F1,
|
| 97 |
+
degenerate baselines pinning the metric's floor and ceiling, and the scorer itself
|
| 98 |
+
validated against 30 adversarial known-by-construction cases (\S\ref{sec:eval},
|
| 99 |
+
\S\ref{sec:degenerate}).
|
| 100 |
+
\item \textbf{Four deterministic capabilities that carry never-seen-table
|
| 101 |
+
generalization}: bounded suspect surfacing for high-cardinality columns, generic
|
| 102 |
+
entity-reference reconciliation with an exact-hit typing floor, cross-row majority
|
| 103 |
+
voting with a false-consensus guard, and convention-conservatism gates --- each
|
| 104 |
+
motivated by a measured failure regime and gated by the verifier
|
| 105 |
+
(\S\ref{sec:capabilities}, \S\ref{sec:wild}).
|
| 106 |
+
\item \textbf{Execution-verified synthetic supervision}, the training method behind
|
| 107 |
+
the 4B planner instantiation: every training example is validated by running the
|
| 108 |
+
executor on the (dirty table, plan) pair and checking that the known-clean table is
|
| 109 |
+
recovered; non-recovering examples are discarded (\S\ref{sec:sft}).
|
| 110 |
+
\item \textbf{A unified finding on where capability lives in this architecture}: five
|
| 111 |
+
further supervised fine-tunes and a three-arm GRPO pilot with the executor as a
|
| 112 |
+
verifiable reward leave held-out generalization statistically bounded (TOST), while
|
| 113 |
+
two of three zero-shot 24--31B planners dropped into the same harness exceed the
|
| 114 |
+
fine-tune's operating point --- deterministic machinery plus plan-level verification carry the
|
| 115 |
+
generalization that exists, and raw planner capability, not task fine-tuning, scales
|
| 116 |
+
it (\S\ref{sec:negative}, \S\ref{sec:scaling}).
|
| 117 |
+
\end{enumerate}
|
| 118 |
+
|
| 119 |
+
We deliberately report a negative-flavored finding alongside the positive ones: on
|
| 120 |
+
\emph{injected} typos, classical frequency clustering remains a strong baseline---by
|
| 121 |
+
construction, injection places the canonical form in the column, which is clustering's
|
| 122 |
+
ideal regime. The advantage of grounding is concentrated where it matters: real errors,
|
| 123 |
+
tail entities absent from the column, and adversarial near-misses where acting at all is
|
| 124 |
+
wrong (\S\ref{sec:results}).
|
| 125 |
+
|
| 126 |
+
\section{Related Work}
|
| 127 |
+
\label{sec:related}
|
| 128 |
+
|
| 129 |
+
\textbf{Error detection and repair.} Raha and Baran~\cite{raha} established
|
| 130 |
+
configuration-free error detection and correction benchmarks (hospital, beers, flights,
|
| 131 |
+
rayyan), which we adopt as out-of-distribution evaluation. HoloClean~\cite{holoclean}
|
| 132 |
+
combines integrity constraints, external reference data, and statistics in probabilistic
|
| 133 |
+
repair, demonstrating that external signals can veto statistically plausible but wrong
|
| 134 |
+
repairs---an insight our reference-veto inherits. GARF~\cite{garf} learns repair rules
|
| 135 |
+
self-supervised from the data itself; it also demonstrates the structural limit we
|
| 136 |
+
observe for frequency-only methods: a lone categorical column offers no co-occurring
|
| 137 |
+
signal to vote against an error.
|
| 138 |
+
|
| 139 |
+
\textbf{The 2025--26 landscape.} Post-Cocoon work concentrates on zero-label
|
| 140 |
+
\emph{detection}: ZeroED~\cite{zeroed} (cloud-LLM cluster labeling, hospital
|
| 141 |
+
detection F1 0.81, collapsing to 0.27 on smaller models), ForestED~\cite{forested}
|
| 142 |
+
(LLM-induced decision trees, 0.756), and Auto-Test~\cite{autotest} (corpus-mined
|
| 143 |
+
semantic-domain constraints, no LLM at inference) --- none performs zero-label
|
| 144 |
+
\emph{repair}. GIDCL~\cite{gidcl} sets the labeled-class repair ceiling
|
| 145 |
+
(hospital \gidclHosp{} with 20 labels and a LoRA trained per cleaned table);
|
| 146 |
+
Cocoon~\cite{cocoon} remains an unreproduced preprint (15 citing papers, none a
|
| 147 |
+
reproduction). Two concurrent results corroborate facets of this paper's central
|
| 148 |
+
negative finding that machinery, not weights, carries cleaning generalization: a
|
| 149 |
+
study showing even frontier models cannot correct table distortions without
|
| 150 |
+
explicit priors~\cite{distort}, and a large multi-agent-debate evaluation in which
|
| 151 |
+
LLM self-critique \emph{degrades} repair and only an adversarially separate,
|
| 152 |
+
execution-grounded critic helps~\cite{debate} --- the architecture our verifier
|
| 153 |
+
instantiates. Spreadsheet-RL~\cite{spreadsheetrl} reports the complementary
|
| 154 |
+
positive case: with full-scale RL infrastructure and execution-verified rewards,
|
| 155 |
+
a 4B model's spreadsheet-manipulation skill \emph{does} move (12.0\%
|
| 156 |
+
$\rightarrow$ 23.4\%) --- consistent with our reading that the gap between our
|
| 157 |
+
\$30 pilot and such results is infrastructure scale, a boundary we state rather
|
| 158 |
+
than blur (\S\ref{sec:negative}).
|
| 159 |
+
|
| 160 |
+
\textbf{LLMs for data wrangling.} Narayan et al.~\cite{wrangle} showed frontier
|
| 161 |
+
foundation models handle entity matching and imputation few-shot;
|
| 162 |
+
Jellyfish~\cite{jellyfish} and Table-GPT~\cite{tablegpt} fine-tune mid-size models for
|
| 163 |
+
data tasks. RetClean~\cite{retclean} is closest in spirit: retrieval from data lakes
|
| 164 |
+
grounds cell repair, with the key empirical split that parametric knowledge suffices on
|
| 165 |
+
world-known head values but collapses on the tail---motivating retrieval. Our work
|
| 166 |
+
differs in the planner/executor decomposition (the model emits no cell values, only
|
| 167 |
+
plans), in execution-verified supervision, and in the calibrated-abstention contract.
|
| 168 |
+
|
| 169 |
+
\textbf{Entity linking over tables.} TURL~\cite{turl} and TableLlama~\cite{tablellama}
|
| 170 |
+
inject candidate entities into table understanding; Belotti et al.~\cite{belotti}
|
| 171 |
+
show retriever coverage is the accuracy ceiling for table entity disambiguation and that
|
| 172 |
+
long candidate lists hurt smaller models. RACOON~\cite{racoon} shows inference-time KG
|
| 173 |
+
retrieval lifts a frozen model substantially, supporting our choice to ground at
|
| 174 |
+
inference rather than bake aliases into weights (TURL's out-of-domain collapse is the
|
| 175 |
+
cautionary result). MTab~\cite{mtab} established type-constrained matching with
|
| 176 |
+
abstention in semantic table annotation.
|
| 177 |
+
|
| 178 |
+
\textbf{Clustering-based cleaning tools.} The de-facto practitioner baseline is
|
| 179 |
+
OpenRefine: key-collision (fingerprint) clustering plus a nearest-neighbour mode; we
|
| 180 |
+
reimplement both faithfully, including blocking, and compare head-to-head.
|
| 181 |
+
|
| 182 |
+
\textbf{Selective prediction.} Risk--coverage analysis and calibration
|
| 183 |
+
metrics~\cite{selective} formalize ``knowing when not to act''; to our knowledge their
|
| 184 |
+
application to data-cleaning merge decisions is new.
|
| 185 |
+
|
| 186 |
+
\textbf{Small specialized models.} OpenMed~\cite{openmed} fine-tunes sub-500M encoders
|
| 187 |
+
to state-of-the-art biomedical NER, the sister result to our thesis that small
|
| 188 |
+
specialized models beat large generic ones on narrow structured tasks; we adopt their
|
| 189 |
+
released PII token classifiers for column typing (\S\ref{sec:pii}).
|
| 190 |
+
|
| 191 |
+
\section{Method}
|
| 192 |
+
\label{sec:method}
|
| 193 |
+
|
| 194 |
+
\subsection{Planner / executor decomposition}
|
| 195 |
+
A \emph{profiler} reduces each column to a typed summary: detected semantic type, missing
|
| 196 |
+
counts, issue flags, and a value--frequency distribution capped at 80 distinct values
|
| 197 |
+
(high-cardinality columns are summarized by their head). The \emph{planner}---either a
|
| 198 |
+
deterministic heuristic or our fine-tuned 4B model---maps the profile (plus three sample
|
| 199 |
+
rows) to a JSON plan: a list of per-column operations drawn from a closed vocabulary
|
| 200 |
+
(\texttt{canonicalize\_categories} with an explicit mapping, \texttt{parse\_date},
|
| 201 |
+
\texttt{standardize\_phone}, \texttt{mask\_pii}, \ldots), table operations, and review
|
| 202 |
+
flags. The \emph{executor} applies the plan with pure pandas transforms. The plan is the
|
| 203 |
+
only channel through which data changes: every diff is attributable to a named operation
|
| 204 |
+
with a rationale, the original table is never mutated, and abstentions are first-class
|
| 205 |
+
plan objects. We export per-run decision summaries as OpenTelemetry GenAI spans.
|
| 206 |
+
|
| 207 |
+
\subsection{Execution-verified synthetic supervision}
|
| 208 |
+
\label{sec:sft}
|
| 209 |
+
Training pairs are generated by corrupting clean synthetic tables with realistic noise
|
| 210 |
+
(casing, aliases, single-character typos with Zipf-distributed long-tail categorical
|
| 211 |
+
columns of 30--80 distinct values) while recording the ground-truth plan. The defining
|
| 212 |
+
step is \emph{verification by execution}: a candidate example is kept only if
|
| 213 |
+
$\textsc{Execute}(\text{dirty}, \text{plan}) = \text{clean}$ cell-for-cell. This closes
|
| 214 |
+
the loop between supervision and semantics---a plan that would not actually clean the
|
| 215 |
+
table can never become a training label. We augment with real supervision derived from
|
| 216 |
+
paired dirty/clean benchmarks by aligning cells and keeping only \emph{learnable}
|
| 217 |
+
canonicalizations (a surface form that is a string variant of its target and never a
|
| 218 |
+
legitimate value elsewhere), which excludes unlearnable per-cell corrections such as
|
| 219 |
+
divergent flight times. The fine-tune is QLoRA (rank 32) over Qwen3-4B-Instruct in
|
| 220 |
+
bf16; one practical finding is that the base model's tool-calling prior dominates
|
| 221 |
+
free-running generation even after convergent fine-tuning (loss 0.16) and must be
|
| 222 |
+
suppressed at decode time by banning the two tool-call tokens.
|
| 223 |
+
|
| 224 |
+
\subsection{Reference-grounded canonicalization with abstention}
|
| 225 |
+
\label{sec:grounding}
|
| 226 |
+
For columns whose values reconcile to a known concept type (countries, administrative
|
| 227 |
+
regions, cities), canonical forms are never generated: a fuzzy retriever (normalized
|
| 228 |
+
edit similarity with first-character blocking and length prefilters) matches each
|
| 229 |
+
distinct value against the type-scoped reference (ISO/pycountry; GeoNames cities500,
|
| 230 |
+
196k entries). A value maps to a canonical only if (i) similarity clears a threshold
|
| 231 |
+
$\tau{=}0.84$, (ii) the best--second-best margin clears $0.03$ (ambiguity veto: a value
|
| 232 |
+
equally close to \texttt{Box} and \texttt{Boaz} abstains), and (iii) the canonical is
|
| 233 |
+
cast to the column's observed case convention. Near-misses ($0.70{\le}s{<}\tau$) are
|
| 234 |
+
surfaced as review flags. The same wrapper grounds the \emph{model} planner: for
|
| 235 |
+
reference-typed columns the model's free-generated mapping is replaced by the grounded
|
| 236 |
+
one, so the model can add coverage but never invent a canonical for a grounded type.
|
| 237 |
+
|
| 238 |
+
\subsection{Plan-level selective prediction: the verified union planner}
|
| 239 |
+
\label{sec:verifier}
|
| 240 |
+
Grounding constrains reference-typed columns, but the planner's \emph{free}
|
| 241 |
+
canonicalization mappings on non-grounded columns remain unguarded---and they are where
|
| 242 |
+
real-data precision dies (the fine-tune's raw hospital plan: \hospModelPrecVSix{}
|
| 243 |
+
precision at \hospModelRecallVSix{} recall). Rather than retrain, we extend abstention
|
| 244 |
+
to the plan itself. A deterministic \emph{verifier} scores every proposed mapping entry
|
| 245 |
+
$\mathit{raw}{\to}\mathit{canon}$ with contract-preserving evidence (no cell values emitted, no gold
|
| 246 |
+
access): three hard gates distilled from the model's measured failure classes---a value
|
| 247 |
+
occurring ${\ge}3$ times is data, not a typo (\emph{errors are rare}); the target must
|
| 248 |
+
be a frequent column value clearly dominating the source (no mapping one typo onto
|
| 249 |
+
another); digit-bearing codes repair only when the letter part is near-identical---then
|
| 250 |
+
a confidence combining edit similarity with frequency support. Entries below a
|
| 251 |
+
threshold $\tau$ (distinct from the retrieval threshold of \S\ref{sec:grounding}) are dropped to review flags; abstention stays first-class. Sweeping
|
| 252 |
+
$\tau$ yields a plan-level precision--coverage curve. The shipped composition,
|
| 253 |
+
the \emph{verified union planner}, is the verifier-gated model plan ($\tau{=}0.5$)
|
| 254 |
+
unioned with the grounded heuristic's mappings (where both map the same surface form, the model's mapping wins);
|
| 255 |
+
the same code path is the product default.
|
| 256 |
+
|
| 257 |
+
\subsection{Visibility and consensus: four deterministic capabilities}
|
| 258 |
+
\label{sec:capabilities}
|
| 259 |
+
Four further mechanisms, each motivated by a measured failure regime on never-seen
|
| 260 |
+
tables, complete the deterministic machinery. \textbf{(a) Suspect surfacing.} The
|
| 261 |
+
profile's value-frequency view is capped, so high-cardinality columns hide their
|
| 262 |
+
dirty cells from any planner. Every column profile therefore carries a bounded
|
| 263 |
+
\texttt{suspect\_values} section: rare anomalous surfaces with evidence-backed
|
| 264 |
+
repair candidates (frequency dominance, edit similarity, reference membership).
|
| 265 |
+
The heuristic planner repairs from suspects under a strict verifier bar
|
| 266 |
+
($\tau_{\mathrm{hc}}{=}0.8$) and flags the rest. \textbf{(b) Generic entity reference.}
|
| 267 |
+
Open vocabularies (SemTab ToughTables aliases --- derived excluding our benchmark
|
| 268 |
+
tables; MusicBrainz search-hint misspellings; RxNorm; Wikidata; ROR) register as a
|
| 269 |
+
pluggable reference type. Because the reference is broad, entity-typing a column
|
| 270 |
+
additionally requires that ${\ge}20\%$ of its distinct values match the reference
|
| 271 |
+
\emph{exactly} --- fuzzy coverage alone over-fires on name-like columns (measured).
|
| 272 |
+
This resolves the regime where every surface in a column is unique (no in-column
|
| 273 |
+
frequency signal exists at all): five such benchmark tables go from 0.0 to
|
| 274 |
+
\ttFOne{} F1 at \emph{zero} damage. \textbf{(c) Cross-row majority voting.} Tables
|
| 275 |
+
that repeat a real-world entity across rows (a flight reported by many sources)
|
| 276 |
+
carry their own repair signal. A detection step finds compact-token key columns
|
| 277 |
+
with small groups (median multiplicity 3--30) and columns whose groups show
|
| 278 |
+
\emph{majority-bearing} disagreement with per-group information; a table-level
|
| 279 |
+
operation then resolves thin dissenting minorities to the group majority. A
|
| 280 |
+
\emph{false-consensus} guard declines when minority shares look like legitimate
|
| 281 |
+
correlated updates rather than reporting errors (mean minority share ${\ge}0.25$)
|
| 282 |
+
--- a flat volume cap was measured to destroy the legitimate dense-disagreement
|
| 283 |
+
regime and replaced. \textbf{(d) Convention conservatism.} The planner never
|
| 284 |
+
re-formats an internally consistent column: date and percent ops are gated on
|
| 285 |
+
dominant-shape inconsistency (digit and alpha runs collapsed; 90\% rule),
|
| 286 |
+
ZIP/postal-named columns are never typed as phones or dates, and Excel-serial
|
| 287 |
+
date typing requires a date-suggestive column name. Suppressed minority values
|
| 288 |
+
surface as review flags --- abstention is visible, never silent. The verifier
|
| 289 |
+
enforces the same gates on model-emitted plans at the verification boundary.
|
| 290 |
+
|
| 291 |
+
\subsection{PII as a second task instance}
|
| 292 |
+
\label{sec:pii}
|
| 293 |
+
The identical contract covers PII: a deterministic tier types columns by checksum and
|
| 294 |
+
pattern validators (Luhn, IBAN mod-97, SSN/email/phone) over distinct values; an
|
| 295 |
+
optional 44M OpenMed-PII token classifier~\cite{openmed} extends coverage to names and
|
| 296 |
+
addresses, gated by a sensitive-type allowlist and a column-level coverage vote; and
|
| 297 |
+
masking, salted hashing, and join-stable pseudonymization are deterministic executor
|
| 298 |
+
operations. Measured briefly: the classifier, though trained on sentence-level
|
| 299 |
+
clinical text, transfers to bare cell values --- \piiNameBare{} detection on
|
| 300 |
+
person-name cells and \piiAddrBare{} on address cells ($n{=}40$ sampled cells each);
|
| 301 |
+
the validator tier, evaluated out-of-distribution on per-type columns from the Gretel
|
| 302 |
+
PII test split, types 5/5 covered PII types correctly with 0/7 false positives on
|
| 303 |
+
negative columns drawn from real open data; and after deterministic masking,
|
| 304 |
+
re-running all validators over the output finds \piiLeakRate{} residual PII ---
|
| 305 |
+
residual PII \emph{detectable by our validators}, a circularity we note explicitly:
|
| 306 |
+
the leak test can only see what the validator tier sees.
|
| 307 |
+
|
| 308 |
+
\section{Evaluation Design}
|
| 309 |
+
\label{sec:eval}
|
| 310 |
+
\textbf{Suite.} Five real-error benchmarks (Raha) plus seeded error injection
|
| 311 |
+
(typo/OCR/case/whitespace) over 15 harvested open-data domains (NYC, Chicago, SF, LA,
|
| 312 |
+
Seattle, Texas, WA portals; GitHub), ${\approx}65$ datasets per seed. We aggregate as a
|
| 313 |
+
\emph{double macro}---mean over error types of mean over datasets, harmonically combined
|
| 314 |
+
with the domain macro---so no single table or error type dominates:
|
| 315 |
+
\begin{equation*}
|
| 316 |
+
\textsc{north} \;=\; \operatorname{HM}\Biggl(
|
| 317 |
+
\underbrace{\frac{1}{|T|}\sum_{t \in T}\frac{1}{|D_t|}\sum_{d \in D_t} F_1(d)}_{\text{error-type macro}},\;
|
| 318 |
+
\underbrace{\frac{1}{|G|}\sum_{g \in G}\frac{1}{|D_g|}\sum_{d \in D_g} F_1(d)}_{\text{domain macro}}
|
| 319 |
+
\Biggr),
|
| 320 |
+
\end{equation*}
|
| 321 |
+
where $T$ is the set of error types, $G$ the set of data domains, $D_t$ (resp.\ $D_g$)
|
| 322 |
+
the datasets carrying error type $t$ (domain $g$), and $\operatorname{HM}$ the harmonic
|
| 323 |
+
mean.
|
| 324 |
+
|
| 325 |
+
\textbf{Churn-neutral metric.} A cell change that is case/whitespace-equivalent to the
|
| 326 |
+
input but does not restore the gold value counts as nothing: not a fix, not a change, not
|
| 327 |
+
damage. Without this, mass case-rewriting inflates precision (we observed $+0.12$
|
| 328 |
+
NORTH from \emph{removing} case matching before the correction); with it, fixing a
|
| 329 |
+
case-injected error requires actually acting. We additionally report
|
| 330 |
+
\emph{damage}---the rate of semantically corrupting clean cells---and an adversarial
|
| 331 |
+
\emph{abstain slice} whose traps are garbage strings (not single-edit variants of any
|
| 332 |
+
reference entity; an earlier trap set mis-scored grounding for correctly mapping
|
| 333 |
+
\texttt{Boazz}$\to$\texttt{Boaz}). We report both repairs of these metric artifacts as
|
| 334 |
+
evidence that gameability must be tested, not assumed.
|
| 335 |
+
|
| 336 |
+
\textbf{Real vs.\ injected.} Injected typos are in-distribution for frequency
|
| 337 |
+
clustering by construction (the canonical is present and dominant in the column), so we
|
| 338 |
+
report the real-error and injected slices separately. A TableEG-style audit
|
| 339 |
+
quantifies the gap (\texttt{eval/inject\_validity.py}): the injector covers three
|
| 340 |
+
of nine error classes (Jensen--Shannon divergence 0.526 bits from the pooled real
|
| 341 |
+
distribution over 163{,}607 real errors), and injected-only evaluation would
|
| 342 |
+
invert the fingerprint-clustering ranking --- exactly the overstatement the
|
| 343 |
+
separate-slice reporting prevents.
|
| 344 |
+
|
| 345 |
+
\textbf{Scorer validation.} Following GroUSE-style evaluator
|
| 346 |
+
testing~\cite{grouse}, the scorer itself is validated against 30 adversarial
|
| 347 |
+
known-by-construction cases: a no-op plan must score 0 fixes and 0 damage, an
|
| 348 |
+
oracle plan exactly 1.0, vandalizing $k$ of $m$ clean cells must score damage
|
| 349 |
+
$k/m$ at precision 0, pure churn (case/whitespace rewrites that do not restore
|
| 350 |
+
gold) must count as nothing although a naive scorer would count it, fixes must
|
| 351 |
+
require actually acting, and silent edits must trip the audit. All 30 pass
|
| 352 |
+
against the shipped scorer unmodified. We additionally cross-score every system
|
| 353 |
+
under the \emph{original} Raha/Baran cell-repair protocol side by side with ours
|
| 354 |
+
(\texttt{eval/cross\_scoring.py}): rankings agree at Kendall $\tau_b{=}1.0$ on
|
| 355 |
+
three of five datasets, and the disagreements cut both ways --- raw string
|
| 356 |
+
equality denies credit for numerically-correct serialization restorations (our
|
| 357 |
+
movies\_1 repairs), while churn-neutrality charges Baran for load-time
|
| 358 |
+
normalizer rewrites its own protocol hides (hospital precision
|
| 359 |
+
$0.908\!\to\!0.783$). Neither metric family flatters us uniformly, and our Baran
|
| 360 |
+
reproduction calibrates against its published Table~3 within $+0.02$ on three of
|
| 361 |
+
the four shared datasets.
|
| 362 |
+
|
| 363 |
+
\textbf{Contamination.} The Raha-suite benchmarks have been public on GitHub since
|
| 364 |
+
2019 and sit inside every modern base model's training window; we treat them as
|
| 365 |
+
potentially contaminated and split our claims accordingly. A verbatim-completion
|
| 366 |
+
probe makes the concern concrete: prompted with five fields of a gold hospital row,
|
| 367 |
+
a frontier-class model reproduces \textbf{25\%} of the held-out cells exactly
|
| 368 |
+
(30/120 cells over 30 rows, exact-substring match), versus \textbf{0\%} (0/120) on a
|
| 369 |
+
date-stamped post-training-cutoff wild harvest under the identical protocol
|
| 370 |
+
(\texttt{eval/contamination\_probe.py}). The rate is an upper bound on memorization
|
| 371 |
+
--- some completions are guessable from the given fields --- but it is not zero, so
|
| 372 |
+
results on legacy-public benchmarks (including the all-hospital
|
| 373 |
+
Table~\ref{tab:scaling}, whose zero-shot planners may partially benefit from
|
| 374 |
+
memorized gold) carry this caveat, while the architecture's trust claims
|
| 375 |
+
(zero silent edits, damage accounting, abstention) rest on the date-stamped wild
|
| 376 |
+
and GitTables slices, where the probe finds nothing to complete.
|
| 377 |
+
|
| 378 |
+
\section{Results}
|
| 379 |
+
\label{sec:results}
|
| 380 |
+
|
| 381 |
+
\subsection{Plan-level selective prediction on real errors}
|
| 382 |
+
\label{sec:ws1results}
|
| 383 |
+
On hospital's \hospErrors{} real errors, the verifier transforms the fine-tune from
|
| 384 |
+
unshippable to precise (Figure~\ref{fig:pc}): the raw model plan repairs
|
| 385 |
+
\hospModelRecallVSix{} of errors at \hospModelPrecVSix{} precision; gated at $\tau{=}0.5$ it
|
| 386 |
+
reaches \modelGatePrec{} precision at \modelGateCov{} coverage (146 of 147 committed
|
| 387 |
+
changes correct). The union with the grounded heuristic buys coverage back:
|
| 388 |
+
\textbf{\unionGatePrec{} precision at \unionGateCov{} coverage} (\unionChanged{}
|
| 389 |
+
changes, \unionFixed{} correct). This turns the system's promise into a measured
|
| 390 |
+
sentence: \emph{zero-configuration and zero labels, repair 41\% of real errors at
|
| 391 |
+
${\ge}0.90$ precision, with every declined merge surfaced for review}. For context,
|
| 392 |
+
Baran given oracle error positions and 20 gold-labeled tuples per dataset reaches
|
| 393 |
+
\realFBaran{} F1 on the same slice (\S\ref{sec:ws4})---selective prediction does not
|
| 394 |
+
close a supervised gap, but it makes the zero-label operating point trustworthy, which
|
| 395 |
+
is the regime our user occupies. Precision is flat ($0.89$--$0.91$) for
|
| 396 |
+
$\tau\in[0.2,0.8]$, so the operating point is not threshold-brittle, and the result is
|
| 397 |
+
seed-robust: across three training seeds of the same data recipe the union operating
|
| 398 |
+
point is \unionGateThreeSeedPrec{} precision at \unionGateThreeSeedCov{} coverage
|
| 399 |
+
(the shipped adapter is the strongest seed), with every seed clearing the
|
| 400 |
+
$0.70$-precision/$0.30$-coverage bar decisively. All 3-seed intervals in this paper
|
| 401 |
+
are normal-approximation 95\% CIs ($1.96\,\sigma/\sqrt{3}$); the $t$-based
|
| 402 |
+
interval at $n{=}3$ is ${\sim}2.7\times$ wider ($\pm 0.031$ here) and every
|
| 403 |
+
qualitative claim survives it --- the weakest seed alone clears the bar.
|
| 404 |
+
|
| 405 |
+
\textbf{Candidate-constrained planning (negative result).} We also tested constraining
|
| 406 |
+
the planner's \emph{inputs}: the profiler emits evidence-backed (variant$\,\to\,$%
|
| 407 |
+
canonical) candidate pairs (frequency dominance, edit similarity, reference membership)
|
| 408 |
+
and the model may only select among them, with a deterministic check dropping
|
| 409 |
+
off-candidate mappings to review flags. As a standalone guard it is strong---the raw
|
| 410 |
+
plan's precision rises from \hospModelPrecVSix{} to \pairsRawPrec{} with no verifier at
|
| 411 |
+
all---but composed with the verifier and union it reaches \pairsUnionPrec{} precision
|
| 412 |
+
at \pairsUnionCov{} coverage, slightly \emph{below} the unconstrained pipeline at the
|
| 413 |
+
same $\tau$: the candidate cap (top-3 per surface) removes some correct repairs the
|
| 414 |
+
verifier would have kept, and the two mechanisms gate the same failure class. We ship
|
| 415 |
+
the verifier and keep candidate constraining available but off by default, reporting
|
| 416 |
+
this as a measured redundancy rather than a stacked win.
|
| 417 |
+
|
| 418 |
+
\begin{figure}[t]
|
| 419 |
+
\centering
|
| 420 |
+
\includegraphics[width=0.62\linewidth]{fig_precision_coverage}
|
| 421 |
+
\caption{Plan-level precision--coverage on hospital (\hospErrors{} real errors), sweeping the
|
| 422 |
+
verifier threshold $\tau$. The union planner dominates the raw model plan; the shipped
|
| 423 |
+
operating point ($\tau{=}0.5$) is annotated.}
|
| 424 |
+
\label{fig:pc}
|
| 425 |
+
\end{figure}
|
| 426 |
+
|
| 427 |
+
\subsection{The 4B fine-tune as one planner instantiation}
|
| 428 |
+
On frozen synthetic gold, the fine-tuned 4B planner reaches canonicalization micro-F1
|
| 429 |
+
\canonFMultiSeed{} --- versus \canonFBig{} for a much larger zero-shot generalist
|
| 430 |
+
prompted identically and \canonFHeur{} for the rule heuristic (best single run
|
| 431 |
+
\canonFOursBest; operation-F1 \opFOurs, JSON validity \jsonValidOurs). On real hospital
|
| 432 |
+
typos the synthetic-only fine-tune scores 0.000 repair recall; adding 20\%
|
| 433 |
+
real-derived supervision lifts it to \hospModelRecall, and a data-scaling iteration
|
| 434 |
+
(tripling the real-derived share from three paired benchmarks) reaches
|
| 435 |
+
\hospModelRecallVSix{} recall at \hospModelPrecVSix{} precision---approaching the
|
| 436 |
+
\frontierZeroShotRecall{} of a frontier-scale zero-shot model. The scaling gain is seed-robust: $+0.09$
|
| 437 |
+
canonicalization F1 over the base mix under identical protocol, with non-overlapping
|
| 438 |
+
3-seed confidence intervals. Real, execution-verified pairs are what transfer:
|
| 439 |
+
the same iteration found frequency-derived and algorithm-cleaned labels both
|
| 440 |
+
\emph{reduce} quality, consistent with our grounding thesis.
|
| 441 |
+
|
| 442 |
+
\subsection{Grounding vs.\ clustering}
|
| 443 |
+
With the errors-are-rare frequency gates now in both paths, grounding and frequency
|
| 444 |
+
clustering are comparable on hospital alone (repairs-only, churn-neutral:
|
| 445 |
+
\hospPrecGrounded{} precision at \hospRecallGrounded{} recall grounded vs
|
| 446 |
+
\hospPrecFreq{} at \hospRecallFreq{} clustering---hospital's dominant errors are
|
| 447 |
+
in-column typos, clustering's best case). Grounding's margin appears where references
|
| 448 |
+
matter: across the five-benchmark real-error macro it reaches \ablFullRealF{} versus
|
| 449 |
+
\ablNoGroundRealF{} for the frequency-clustering ablation ($+29\%$), and it carries
|
| 450 |
+
the behavioral guarantees below.
|
| 451 |
+
On the full suite against OpenRefine (Table~\ref{tab:money}), the result splits
|
| 452 |
+
cleanly by regime, and we report both. On the \emph{real-error} slice---the regime the
|
| 453 |
+
tool exists for---grounded cleaning reaches REAL-F1 \realFGrounded{}, $3.9\times$
|
| 454 |
+
OpenRefine kNN (\realFORKnn) and $5.7\times$ fingerprint (\realFORFp), with seed CIs of
|
| 455 |
+
$\pm$\northGroundedCI. Provenance: the grounded and OpenRefine rows of
|
| 456 |
+
Table~\ref{tab:money} are regenerated at the current system head (2026-06-12,
|
| 457 |
+
post-capability, scorer fix in); the dagger rows keep their original capture
|
| 458 |
+
provenance. The June-10 freeze system measured REAL-F1 \realFGroundedFreeze{} on the
|
| 459 |
+
same protocol --- the $+0.05$ difference is the measured contribution of the four
|
| 460 |
+
deterministic capabilities (\S\ref{sec:capabilities}) on the real-error slice. On the \emph{injected} slice, fingerprint clustering wins
|
| 461 |
+
(\injFORFp{} vs \injFGrounded) at near-zero damage: our case/whitespace injectors are
|
| 462 |
+
exactly the perturbations key-collision normalizes away, so this is its home game and
|
| 463 |
+
we say so. kNN clustering---the method that, like us, attempts typo repair---loses on
|
| 464 |
+
both slices while incurring the highest damage among baselines (\damageORKnn), the
|
| 465 |
+
no-reference over-merging failure the grounding was built to prevent. The shipped
|
| 466 |
+
verified-union system's suite row (REAL-F1 \modelRealF, damage \modelDamage) shows the
|
| 467 |
+
grounding wrapper and heuristic union carry entity canonicalization on these datasets ---
|
| 468 |
+
the model's contribution concentrates on the synthetic regime and hospital repair
|
| 469 |
+
(\S\ref{sec:ws1results}), and the verifier cuts its suite damage to \modelDamage,
|
| 470 |
+
$6\times$ below the grounded heuristic's \damageGrounded{} (HEAD damage vs the union
|
| 471 |
+
row's freeze-time capture --- a disclosed basis mix). Within our own ablations
|
| 472 |
+
(June-10 freeze basis throughout), removing grounding cedes $22\%$ of real-error
|
| 473 |
+
F1 (\ablNoGroundRealF{} vs \ablFullRealF) and forfeits the behavioral guarantees:
|
| 474 |
+
perfect abstention on adversarial traps (\ablFullAbstain) versus
|
| 475 |
+
\ablNoAbstainAbstain{} without abstention, and reference-vetoed wrong merges (e.g.\
|
| 476 |
+
\texttt{guntxrsvillx}$\to$\texttt{huntsville}).
|
| 477 |
+
|
| 478 |
+
\begin{table}[t]
|
| 479 |
+
\centering
|
| 480 |
+
\caption{Wide-suite comparison, 3 injection seeds, churn-neutral metric. NORTH is the
|
| 481 |
+
double-macro harmonic mean; REAL-F1 is the real-error slice. Regenerated at the
|
| 482 |
+
current system head (2026-06-12); the June-10 freeze system measured
|
| 483 |
+
\realFGroundedFreeze{} REAL-F1 / \northGroundedFreeze{} NORTH on the same protocol.}
|
| 484 |
+
\label{tab:money}
|
| 485 |
+
\begin{tabular}{lcccccc}
|
| 486 |
+
\toprule
|
| 487 |
+
System & NORTH & $\pm$95\%CI & REAL-F1 & INJ-F1 & damage & abstain \\
|
| 488 |
+
\midrule
|
| 489 |
+
Grounded (ours) & \northGrounded & \northGroundedCI & \textbf{\realFGrounded} & \injFGrounded & \damageGrounded & \ablFullAbstain \\
|
| 490 |
+
OpenRefine fingerprint & \northORFp & 0.000 & \realFORFp & \injFORFp & \damageORFp & 1.000 \\
|
| 491 |
+
OpenRefine kNN & \northORKnn & 0.002 & \realFORKnn & 0.148 & \damageORKnn & 1.000 \\
|
| 492 |
+
Verified union 4B (shipped)$^{\dagger}$ & -- & -- & \modelRealF & -- & \modelDamage & \modelAbstain \\
|
| 493 |
+
\midrule
|
| 494 |
+
Baran (oracle det.\ + 20 labels)$^{\ddagger}$ & -- & -- & \realFBaran & -- & \damageBaran & -- \\
|
| 495 |
+
Jellyfish-13B (ED+DI)$^{\ddagger}$ & -- & -- & \realFJelly & -- & \damageJelly & -- \\
|
| 496 |
+
\bottomrule
|
| 497 |
+
\end{tabular}
|
| 498 |
+
|
| 499 |
+
\smallskip
|
| 500 |
+
{\small $^{\dagger}$single seed, REAL + typo-injected slice only (GPU cost); other rows
|
| 501 |
+
are 3-seed means. $^{\ddagger}$real slice only, disclosed protocol asymmetries
|
| 502 |
+
(\S\ref{sec:ws4}): Baran uses oracle error positions + gold labels; Jellyfish is our
|
| 503 |
+
detect-then-impute composition with seen-data caveats.}
|
| 504 |
+
\end{table}
|
| 505 |
+
|
| 506 |
+
\subsection{Generalization to never-seen tables}
|
| 507 |
+
\label{sec:wild}
|
| 508 |
+
The freeze-version system above was then pointed at data it had never seen, under
|
| 509 |
+
three new harnesses (all released with this paper as the \textsc{WildClean} bundle).
|
| 510 |
+
\textbf{(1) Paired bench}: \nPairs{} dirty/gold pairs spanning the Raha suite, SemTab
|
| 511 |
+
ToughTables, government open-data typo corpora, entity-matching tables, and
|
| 512 |
+
LLM-cleaning evaluation sets. On the 35 pairs from sources absent from training ---
|
| 513 |
+
a count that coincidentally equals, but is distinct from, the \nWild{} gold-free wild
|
| 514 |
+
tables of harness~(2) below --- the
|
| 515 |
+
post-freeze system scores \textbf{macro F1 \unseenMacroF{} at damage
|
| 516 |
+
\unseenMacroDamage}. The largest single contribution is the regime
|
| 517 |
+
\S\ref{sec:capabilities}(b) unlocks: on five all-unique entity tables where no
|
| 518 |
+
in-column frequency signal exists, F1 moves from $0.0$ to \ttFOne{} at zero damage.
|
| 519 |
+
Cross-row voting (\S\ref{sec:capabilities}(c)) is the second: flights---many sources
|
| 520 |
+
reporting the same flight---goes from \flightsBaseF{} to \flightsVoteF{} F1
|
| 521 |
+
heuristic-only, and the heuristic hospital path doubles from \hospBaseHeur{} to
|
| 522 |
+
\hospVoteHeur{}. The hospital union gate is invariant under all of this
|
| 523 |
+
(\unionGatePrec{} at \unionGateCov). \textbf{(2) Wild bench}: \nWild{} uncurated
|
| 524 |
+
in-the-wild tables (open-data portals, GitHub, Kaggle) with no gold; we score seeded
|
| 525 |
+
inject--recovery on each table's own data (mean recovery \wildRecovery{} over the 34
|
| 526 |
+
tables with inject scores; one table has none) plus a
|
| 527 |
+
behavioral audit: every run yields a valid plan, every changed cell is attributable
|
| 528 |
+
to a logged operation --- \textbf{zero silent edits across all \nWild{} tables}.
|
| 529 |
+
\textbf{(3) Trust audit at scale}: \nTrust{} GitTables tables, same property ---
|
| 530 |
+
\nTrust{}/\nTrust{} valid plans, zero crashes, zero silent edits. The held-out-source
|
| 531 |
+
generalization metric (train and evaluation drawn from disjoint benchmark sources)
|
| 532 |
+
remains low in absolute terms (GEN-F1 \genFTwo{}, variant-recall \genVRTwo{}, damage
|
| 533 |
+
\genDamageTwo): cleaning unfamiliar tables is far from solved, and we report the
|
| 534 |
+
number to anchor the next section's claim about \emph{where} the capability that does
|
| 535 |
+
exist actually lives.
|
| 536 |
+
|
| 537 |
+
\subsection{Where capability lives: a bounded null for fine-tuning}
|
| 538 |
+
\label{sec:negative}
|
| 539 |
+
Every attempt to move never-seen-table performance through the model weights failed;
|
| 540 |
+
every gain in \S\ref{sec:wild} came from deterministic machinery plus the verifier.
|
| 541 |
+
Five further supervised fine-tunes --- adding 109k harvested real-world alias pairs
|
| 542 |
+
(ToughTables-derived, MusicBrainz search hints, RxNorm, OpenFlights), error-dense
|
| 543 |
+
episode mixes, and a suspects-contract retrain --- left held-out GEN-F1
|
| 544 |
+
\emph{statistically bounded}: every retrain's delta is positive but negligible (mean
|
| 545 |
+
$+0.003$), never approaching the pre-registered $\delta{=}0.05$. ``Bounded'' is a
|
| 546 |
+
tested equivalence claim, not an eyeballed one~\cite{lakens}: across the five-retrain
|
| 547 |
+
series the mean held-out GEN-F1 delta (retrain minus champion) is $+0.0028$
|
| 548 |
+
(90\% bootstrap CI $[+0.0008, +0.0049]$, strictly positive;
|
| 549 |
+
10{,}000 resamples, seed 42; per-dataset granularity, $n{=}15$ over 3 held-out
|
| 550 |
+
sources $\times$ 5 retrains --- per-pair deltas do not exist for the retrain
|
| 551 |
+
series, so within-retrain deltas are clustered and we add a retrain-level
|
| 552 |
+
robustness check, $n{=}5$ macro deltas), and TOST rejects effects beyond the
|
| 553 |
+
pre-registered SESOI of $\pm 0.05$ ($p = 8.0\times10^{-16}$; retrain-level check
|
| 554 |
+
$p = 8.3\times10^{-8}$). One disclosure sharpens the clustering caveat: two
|
| 555 |
+
retrains' held-out rows are \emph{bit-identical} --- mechanically verified as
|
| 556 |
+
verifier-collapse, not a data error (their raw plans share zero mapping entries,
|
| 557 |
+
9 vs.\ 82 on flights, yet the verifier kills all of both, so each union
|
| 558 |
+
degenerates to the same deterministic plan;
|
| 559 |
+
\texttt{eval/results/equivalence\_coincidence.json}) --- so the $n{=}15$ rows
|
| 560 |
+
carry fewer independent observations than their count suggests, which is exactly
|
| 561 |
+
why the $n{=}5$ retrain-level test is the one we lean on. The collapse itself is
|
| 562 |
+
the finding in miniature: different weights, same held-out behavior, because the
|
| 563 |
+
verifier and the deterministic machinery decide what survives. Two reconciliations make the claim auditable. First, the
|
| 564 |
+
basis: the equivalence series is scored against the champion's absolute GEN-F1 of
|
| 565 |
+
\genChampionBasis{}, while the \genFTwo{} of \S\ref{sec:wild} is the \emph{shipped
|
| 566 |
+
system} at the post-freeze HEAD with all deterministic capabilities --- the
|
| 567 |
+
equivalence series scores each retrain's model-union path at its own capture time,
|
| 568 |
+
so the two figures share a metric but not a basis. Second, the SESOI: weight
|
| 569 |
+
interventions move GEN-F1 by at most $0.005$, while the deterministic machinery of
|
| 570 |
+
\S\ref{sec:capabilities} moved the unseen-pair macro from $0.10$ to \unseenMacroF{}
|
| 571 |
+
--- $\delta{=}0.05$ sits an order of magnitude above the measured weight effect and
|
| 572 |
+
well below the machinery effect, which is exactly the boundary the test is meant to
|
| 573 |
+
police. Mixing harvested pairs into the training blend
|
| 574 |
+
\emph{diluted} the synthetic skill the executor verifies (a monotonic dilution law
|
| 575 |
+
across mix ratios). A GRPO pilot using the executor as a verifiable reward (the
|
| 576 |
+
direction RLVR table work~\cite{tabler1} motivates) was negative in all three arms at
|
| 577 |
+
4B/LoRA scale: the main arm and a KL-anchored variant degraded plan-format validity,
|
| 578 |
+
and a random-reward control arm reproduced the same drift, identifying it as an RL
|
| 579 |
+
artifact rather than signal~\cite{spurious}. We state this as a \emph{bounded} null,
|
| 580 |
+
not a universal one: at 4B/LoRA scale, under our propose/execute protocol and
|
| 581 |
+
training budgets, no weight intervention we ran produced measurable movement in
|
| 582 |
+
never-seen-table repair --- profiling visibility, reference grounding, cross-row
|
| 583 |
+
consensus, convention conservatism, and plan-level verification carry the capability
|
| 584 |
+
that exists. The bound is explicit: results with full-scale RL infrastructure
|
| 585 |
+
(execution-verified rewards on multi-GPU RLVR stacks~\cite{spreadsheetrl,tabler1})
|
| 586 |
+
show task skill moving at the same parameter scale, so our claim is about what
|
| 587 |
+
SFT-and-pilot-RL buy in this protocol class, not about reinforcement learning in
|
| 588 |
+
general. A second explicit bound: every weight experiment here uses the Qwen3
|
| 589 |
+
family --- and the very work we cite to explain the control arm's drift documents
|
| 590 |
+
that random-reward GRPO effects are themselves family-sensitive~\cite{spurious}
|
| 591 |
+
--- so the null is stated for Qwen3-class models pending a cross-family
|
| 592 |
+
replication. Concurrent evaluations corroborate the mechanism from independent
|
| 593 |
+
directions~\cite{distort,debate}. The practical corollary is unusual but actionable:
|
| 594 |
+
a contributor who wants to improve a system like this should write a deterministic
|
| 595 |
+
capability and gate it with the verifier, not collect more training data.
|
| 596 |
+
|
| 597 |
+
The null extends to test-time compute --- with one instructive exception that
|
| 598 |
+
\emph{confirms} the architecture claim. Self-consistency \emph{voting} over
|
| 599 |
+
$N{=}16$ temperature-0.7 samples (cell-edit-level majority, run through the
|
| 600 |
+
identical verifier--union pipeline) yields 0.906 precision at 0.454 coverage
|
| 601 |
+
versus 0.9055 at 0.4519 for matched greedy decoding on the same local runtime ---
|
| 602 |
+
a null at matched precision, the visibility law from the test-time side: voting
|
| 603 |
+
cannot surface repairs the profile does not expose, and it actively discards
|
| 604 |
+
verified-recoverable coverage. But pooling \emph{every} mapping from all 16
|
| 605 |
+
samples and letting the verifier filter the union gives the best operating point
|
| 606 |
+
we measure for the 4B: \textbf{0.911 precision at 0.483 coverage} ($+0.6$ points
|
| 607 |
+
precision, $+7.1$ points coverage over the shipped gate; an independent $N{=}8$
|
| 608 |
+
replication reproduces the \emph{voted} point to $\pm 0.0003$ precision /
|
| 609 |
+
$\pm 0.002$ coverage, and the greedy anchor exactly). The lesson is the paper's thesis in miniature: sampling
|
| 610 |
+
helps only as a \emph{candidate generator}; consensus adds nothing the verifier
|
| 611 |
+
does not already provide --- pool candidates, verify, do not vote. Separately,
|
| 612 |
+
the local capture path itself (Q8 quantization with grammar-constrained decoding)
|
| 613 |
+
is worth $+3.9$ points of coverage over the original Modal capture at equal
|
| 614 |
+
precision.
|
| 615 |
+
|
| 616 |
+
\subsection{Zero-label capability scaling: the verifier harness is planner-agnostic}
|
| 617 |
+
\label{sec:scaling}
|
| 618 |
+
The negative result bounds what fine-tuning small weights buys; it says nothing
|
| 619 |
+
about raw capability. To separate the two we dropped zero-shot, $\leq$32B
|
| 620 |
+
open-weights planners --- with \emph{no} task training --- into the identical
|
| 621 |
+
hospital pipeline the 4B fine-tune uses: same prompt contract, same
|
| 622 |
+
verify($\tau{=}0.5$), same union with the grounded heuristic
|
| 623 |
+
(Table~\ref{tab:scaling}). devstral-small-2-24B and gemma4-31B both reach
|
| 624 |
+
\textbf{\scalePrecBig{} precision at \scaleCovBig{} coverage} --- exceeding the
|
| 625 |
+
fine-tune's union point of \unionGatePrec{} at \unionGateCov{} --- while
|
| 626 |
+
nemotron-30B reaches \scalePrecNemo{} at \scaleCovNemo{} with JSON-plan validity
|
| 627 |
+
0.4 (validity is part of the measurement: a planner that cannot reliably emit the
|
| 628 |
+
plan schema loses coverage before capability is measured). gpt-oss-20B is
|
| 629 |
+
excluded as a serving failure, documented rather than scored as capability: the
|
| 630 |
+
hosted proxy returned empty content on every planning call despite full-length
|
| 631 |
+
generation. The arm is multi-family (Mistral, Google, NVIDIA), which addresses
|
| 632 |
+
the single-family bound of \S\ref{sec:negative} for the inference side; the
|
| 633 |
+
weight-training null itself remains Qwen3-scoped. Disclosure: these models were
|
| 634 |
+
measured via hosted inference for speed; all are $\leq$32B open weights and
|
| 635 |
+
locally deployable in principle. The interpretation we draw is the paper's
|
| 636 |
+
sharpest: SFT at 4B does not buy held-out generalization (\S\ref{sec:negative}),
|
| 637 |
+
but raw capability at 24--31B does lift the same harness --- the verifier/union
|
| 638 |
+
architecture is the portable contribution, converting any sufficiently capable
|
| 639 |
+
planner into a trustworthy cleaner.
|
| 640 |
+
|
| 641 |
+
\begin{table}[t]
|
| 642 |
+
\centering
|
| 643 |
+
\caption{Zero-shot $\leq$32B planners in the identical verify($\tau{=}0.5$)+union
|
| 644 |
+
harness, hospital's \hospErrors{} real errors. Validity = fraction of planning
|
| 645 |
+
calls returning schema-valid JSON. Runtime = wall-clock for the planning calls on
|
| 646 |
+
hosted endpoints (single capture, no seeds; the 4B row is a prior Modal A100
|
| 647 |
+
capture with no comparable local figure). Each scaling row is a single capture;
|
| 648 |
+
the primary evidence is the union coverage delta ($+0.07$) at matched-or-better
|
| 649 |
+
precision, not any single cell. For context, 16-sample pooling lifts the 4B
|
| 650 |
+
fine-tune to $0.911@0.483$ at $16\times$ planning compute
|
| 651 |
+
(\S\ref{sec:negative}); the 24--31B planners reach $0.915@0.485$ in a single
|
| 652 |
+
greedy pass --- single-pass capability versus test-time compute, both converted
|
| 653 |
+
into trustworthy operating points by the same verifier. Bold marks the best union operating point.
|
| 654 |
+
gpt-oss-20B excluded (serving failure: empty
|
| 655 |
+
proxy responses, not measurable capability).
|
| 656 |
+
The identical devstral/gemma rows are a verified counting coincidence, not a
|
| 657 |
+
scoring artifact: their applied cell-edit sets share 266 of 270 cells, each
|
| 658 |
+
commits 4 model-specific repairs (all correct), and the totals coincide
|
| 659 |
+
(\texttt{eval/results/scaling\_coincidence.json}).
|
| 660 |
+
}
|
| 661 |
+
\label{tab:scaling}
|
| 662 |
+
\footnotesize
|
| 663 |
+
\begin{tabular}{lccccc}
|
| 664 |
+
\toprule
|
| 665 |
+
Planner & Params & Gated P@C & Union P@C & Validity & Runtime (s) \\
|
| 666 |
+
\midrule
|
| 667 |
+
ScrubData-v6 (Qwen3-4B fine-tune) & 4B & 0.993 @ 0.287 & 0.905 @ 0.413 & --- & --- \\
|
| 668 |
+
devstral-small-2 (Mistral) & 24B & 0.943 @ 0.426 & \textbf{0.915 @ 0.485} & 1.0 & \runtimeDevstral \\
|
| 669 |
+
nemotron-3-nano (NVIDIA) & 30B & 1.000 @ 0.138 & 0.877 @ 0.336 & 0.4 & \runtimeNemo \\
|
| 670 |
+
gemma4 (Google) & 31B & 0.943 @ 0.426 & \textbf{0.915 @ 0.485} & 1.0 & \runtimeGemma \\
|
| 671 |
+
\bottomrule
|
| 672 |
+
\end{tabular}
|
| 673 |
+
\end{table}
|
| 674 |
+
|
| 675 |
+
\subsection{Ablations}
|
| 676 |
+
All ablations are 3-seed means (CIs $\le\pm0.003$). Removing abstention costs $-0.013$
|
| 677 |
+
NORTH, raises damage to \ablNoAbstainDamage{} (from \ablFullDamage), and collapses trap
|
| 678 |
+
abstention to \ablNoAbstainAbstain. Removing the ambiguity margin costs $-0.006$ with
|
| 679 |
+
$+0.001$ damage. Removing case matching costs $-0.002$ under the churn-neutral metric
|
| 680 |
+
(and \emph{gained} $+0.12$ under the uncorrected metric---the artifact). Replacing
|
| 681 |
+
grounding with frequency clustering gains $+0.020$ NORTH, all of it from the injected
|
| 682 |
+
slice (\S\ref{sec:eval}), while costing $-0.039$ real-error F1---the trade the system
|
| 683 |
+
refuses by design.
|
| 684 |
+
|
| 685 |
+
\subsection{Learned-repair baselines under disclosed protocols}
|
| 686 |
+
\label{sec:ws4}
|
| 687 |
+
We additionally run two learned-repair baselines on the real-error (Raha) slice,
|
| 688 |
+
under the identical churn-neutral metric but with honestly disclosed protocol
|
| 689 |
+
asymmetries. \textbf{Baran}~\cite{raha} is semi-supervised: we run its reference
|
| 690 |
+
configuration---oracle error positions from the dirty/gold diff plus 20 gold-labeled
|
| 691 |
+
tuples per dataset (its package default), without the optional Wikipedia-pretrained
|
| 692 |
+
value models. It reaches REAL-F1 \realFBaran{}$\,\pm$\realFBaranCI{} (3 label-sampling
|
| 693 |
+
seeds) at \damageBaran{} damage---an upper bound under a strictly more informed
|
| 694 |
+
protocol than ours (zero labels, no oracle detection); with oracle positions it can
|
| 695 |
+
essentially only edit true-error cells, so its near-zero damage is structural.
|
| 696 |
+
\textbf{Jellyfish-13B}~\cite{jellyfish} publishes per-cell error detection and
|
| 697 |
+
imputation but no repair task; we compose the two (detect, then impute flagged cells
|
| 698 |
+
with the attribute masked) --- a pipeline of our construction, not theirs. It scores
|
| 699 |
+
REAL-F1 \realFJelly{} at \damageJelly{} damage (single seed, recommended decoding;
|
| 700 |
+
note hospital is in its instruction-tuning data and flights/rayyan in its published
|
| 701 |
+
evaluation suite, so these numbers may flatter it). Neither baseline is run on the
|
| 702 |
+
56-spec injected suite (computationally and methodologically out of scope for
|
| 703 |
+
semi-supervised and per-cell-LLM repair); their NORTH/INJ-F1 cells in
|
| 704 |
+
Table~\ref{tab:money} are blank by design. The comparison locates our contribution:
|
| 705 |
+
zero-config systems (ours, OpenRefine) occupy a different protocol class from
|
| 706 |
+
supervised repair, and the verifier (\S\ref{sec:ws1results}) is what makes the
|
| 707 |
+
zero-config class precise enough to trust, not what closes the labeled gap.
|
| 708 |
+
|
| 709 |
+
Table~\ref{tab:perdataset} breaks the real-error slice down per dataset at HEAD.
|
| 710 |
+
The verified-union rows are reported with their honest shape: off hospital the
|
| 711 |
+
union turns ultra-conservative --- on rayyan it commits 12 changes at 0.001
|
| 712 |
+
damage; on beers it holds precision 0.546 at recall 0.018. The gate's precision
|
| 713 |
+
premise transfers as \emph{safety} (union damage stays at 0.001--0.080) but not
|
| 714 |
+
as coverage. The movies\_1 union cell ($^{q}$: local Q8 capture, the disclosed
|
| 715 |
+
quantized protocol) is the instructive worst case: on entity-rich name columns
|
| 716 |
+
the quantized planner proposes plausible-but-wrong merges
|
| 717 |
+
(\texttt{The Longest Day}$\,\to\,$\texttt{The Longest Yard}); the verifier kills
|
| 718 |
+
most, and what leaks through is damage within the disclosed band with zero
|
| 719 |
+
credited fixes --- the planner contributes nothing there, and the system's value
|
| 720 |
+
is that it \emph{contains} a bad planner rather than amplifying it. This directly
|
| 721 |
+
answers the co-adaptation concern: hospital is where the model's learned mappings
|
| 722 |
+
live, and elsewhere the system abstains or contains rather than guesses.
|
| 723 |
+
|
| 724 |
+
\begin{table}[t]
|
| 725 |
+
\centering
|
| 726 |
+
\caption{Per-dataset real-error results (Raha slice), churn-neutral F1 / damage.
|
| 727 |
+
Grounded is the HEAD deterministic system; OR = OpenRefine reimplementations;
|
| 728 |
+
Union is the verified union planner ($\tau{=}0.5$) where a captured model plan
|
| 729 |
+
exists ($^{q}$ marks the movies\_1 cell, a local Q8 quantized capture); Baran uses oracle error positions + 20 gold
|
| 730 |
+
labels (mean of 3 label-sampling seeds) and is a supervised reference, not a
|
| 731 |
+
peer.}
|
| 732 |
+
\label{tab:perdataset}
|
| 733 |
+
\footnotesize
|
| 734 |
+
\begin{tabular}{lccccc}
|
| 735 |
+
\toprule
|
| 736 |
+
Dataset & Grounded (HEAD) & OR fingerprint & OR kNN & Verified union & Baran (oracle+20) \\
|
| 737 |
+
\midrule
|
| 738 |
+
hospital & 0.258 / .066 & 0.000 / .000 & 0.189 / .083 & 0.567 / .001 & 0.827 / .004 \\
|
| 739 |
+
beers & 0.025 / .005 & 0.194 / .000 & 0.086 / .074 & 0.035 / .001 & 0.918 / .000 \\
|
| 740 |
+
flights & 0.127 / .082 & 0.000 / .000 & 0.014 / .065 & 0.035 / .080 & 1.000 / .000 \\
|
| 741 |
+
rayyan & 0.000 / .118 & 0.000 / .001 & 0.002 / .008 & 0.000 / .001 & 0.402 / .010 \\
|
| 742 |
+
movies\_1 & 0.714 / .025 & 0.002 / .018 & 0.001 / .072 & 0.000 / .025$^{q}$ & 0.909 / .001 \\
|
| 743 |
+
\midrule
|
| 744 |
+
macro F1 & \realFOursHead & 0.039 & 0.058 & --- & \realFBaran \\
|
| 745 |
+
\bottomrule
|
| 746 |
+
\end{tabular}
|
| 747 |
+
\end{table}
|
| 748 |
+
|
| 749 |
+
\subsection{A matched label budget separates the supervision regimes}
|
| 750 |
+
\label{sec:labelcurve}
|
| 751 |
+
The Baran comparison above is two points (zero labels, twenty labels); the
|
| 752 |
+
matched-budget curve in Figure~\ref{fig:labelcurve} measures what each label is
|
| 753 |
+
worth to each system on the same five-dataset real-error macro. At zero labels
|
| 754 |
+
Baran --- even \emph{retaining} its oracle error positions --- repairs nothing
|
| 755 |
+
(F1 \realFBaranZero, 3 seeds): its value models have nothing to learn from.
|
| 756 |
+
ScrubData operates at \realFOursHead{} with zero configuration. With labels Baran
|
| 757 |
+
climbs steeply (\realFBaranFive{} at $k{=}5$, \realFBaran{} at $k{=}20$): the two
|
| 758 |
+
systems occupy complementary supervision regimes, a relationship now measured
|
| 759 |
+
rather than asserted. ScrubData's own $k$-label arm uses the labels \emph{only}
|
| 760 |
+
to validate and expand the verifier accept set --- no retraining, no oracle
|
| 761 |
+
positions: $\realFOursFive \pm 0.023$ at $k{=}5$ and $\realFOursTwenty \pm 0.012$
|
| 762 |
+
at $k{=}20$ (3 label-sampling seeds). The disclosed asymmetry stands at every
|
| 763 |
+
budget: Baran keeps oracle error positions throughout, so the curve is an upper
|
| 764 |
+
bound in its favor.
|
| 765 |
+
|
| 766 |
+
\begin{figure}[t]
|
| 767 |
+
\centering
|
| 768 |
+
\includegraphics[width=0.62\linewidth]{fig_label_curve}
|
| 769 |
+
\caption{Matched-budget label curve, five-dataset real-error macro F1. At
|
| 770 |
+
$k{=}0$ Baran repairs nothing even with oracle error positions retained;
|
| 771 |
+
ScrubData operates at \realFOursHead{} with zero configuration. With labels
|
| 772 |
+
Baran climbs steeply --- complementary supervision regimes, measured. Error
|
| 773 |
+
bars ($\pm$) are standard deviations over 3 label-sampling seeds; the Baran
|
| 774 |
+
$k{=}20$ point reuses the 3-seed baseline run of \S\ref{sec:ws4}.}
|
| 775 |
+
\label{fig:labelcurve}
|
| 776 |
+
\end{figure}
|
| 777 |
+
|
| 778 |
+
\subsection{Degenerate baselines and cost-weighted damage}
|
| 779 |
+
\label{sec:degenerate}
|
| 780 |
+
Four degenerate policies pin the metric's floor and ceiling on the full 42-pair
|
| 781 |
+
bench (Table~\ref{tab:degenerate}). No-op and oracle land exactly at 0 and 1;
|
| 782 |
+
abstain-all is score-identical to no-op because the repair metric is flag-blind
|
| 783 |
+
by design (abstentions are audited separately); seeded random editing of 5\% of
|
| 784 |
+
cells is vandalism the metric must punish. Since F1 alone under-punishes
|
| 785 |
+
vandalism, we add a cost-weighted score in the Effective-Reliability style,
|
| 786 |
+
$\Phi_c = (\mathrm{fixes} - c\cdot\mathrm{damaged})/\mathrm{errors}$ at
|
| 787 |
+
$c \in \{1, 5, 10\}$: random editing scores $-0.49$ to $-4.89$, while the
|
| 788 |
+
shipped system stays positive at $c{=}1$ (\degShippedPhiOne) --- and goes
|
| 789 |
+
negative at higher $c$, which is the honest reading: at 10:1 cost asymmetry,
|
| 790 |
+
only near-zero-damage operating points (the verified union) are defensible.
|
| 791 |
+
|
| 792 |
+
One disclosure: the oracle acceptance check itself surfaced a scorer artifact
|
| 793 |
+
--- 3 cells in 1.79M held the literal string \texttt{Nan} (a first name), which
|
| 794 |
+
parses to float NaN and was unequal to itself --- now fixed in
|
| 795 |
+
\texttt{eval/metrics.py} with a regression test; published numbers shift by
|
| 796 |
+
less than $10^{-4}$.
|
| 797 |
+
|
| 798 |
+
\begin{table}[t]
|
| 799 |
+
\centering
|
| 800 |
+
\caption{Degenerate policies pin the metric (42 pairs, churn-neutral macro;
|
| 801 |
+
random-edit: seeded, 5\% of cells). $\Phi_c$ is micro-summed
|
| 802 |
+
$(\mathrm{fixes} - c\cdot\mathrm{damaged})$ per benchmark error. ``Shipped''
|
| 803 |
+
here is the deterministic grounded path on the 42 pairs (damage
|
| 804 |
+
\degShippedDamage), distinct from the verified-union suite row of
|
| 805 |
+
Table~\ref{tab:money} (damage \modelDamage).}
|
| 806 |
+
\label{tab:degenerate}
|
| 807 |
+
\small
|
| 808 |
+
\begin{tabular}{lccccccc}
|
| 809 |
+
\toprule
|
| 810 |
+
Policy & F1 & P & R & damage & $\Phi_1$ & $\Phi_5$ & $\Phi_{10}$ \\
|
| 811 |
+
\midrule
|
| 812 |
+
no-op & 0.000 & 1.000 & 0.000 & 0.000 & $0.00$ & $0.00$ & $0.00$ \\
|
| 813 |
+
abstain-all & 0.000 & 1.000 & 0.000 & 0.000 & $0.00$ & $0.00$ & $0.00$ \\
|
| 814 |
+
random-edit & 0.000 & 0.001 & 0.001 & 0.049 & $-0.49$ & $-2.45$ & $-4.89$ \\
|
| 815 |
+
oracle & 1.000 & 1.000 & 1.000 & 0.000 & $+1.00$ & $+1.00$ & $+1.00$ \\
|
| 816 |
+
shipped & \degShippedF & \degShippedP & 0.308 & \degShippedDamage & $+0.13$ & $-1.37$ & $-3.26$ \\
|
| 817 |
+
\bottomrule
|
| 818 |
+
\end{tabular}
|
| 819 |
+
\end{table}
|
| 820 |
+
|
| 821 |
+
\subsection{Calibration of abstention}
|
| 822 |
+
\label{sec:calibration}
|
| 823 |
+
On a probe of reference-entity typos plus garbage traps, retrieval confidence is a
|
| 824 |
+
usable selective-prediction signal: AURC \aurc, ECE \ece{} (over-confident;
|
| 825 |
+
temperature scaling is future work), and
|
| 826 |
+
precision rises monotonically with threshold---\precAtDefault{} precision at the default
|
| 827 |
+
$\tau{=}0.84$ (coverage \covAtDefault), and $\geq$95\% precision at
|
| 828 |
+
$\tau{=}\threshNinetyFive$ (coverage \covNinetyFive). Figure~\ref{fig:rc} shows the
|
| 829 |
+
risk--coverage curve.
|
| 830 |
+
|
| 831 |
+
\begin{figure}[t]
|
| 832 |
+
\centering
|
| 833 |
+
\includegraphics[width=0.62\linewidth]{fig_risk_coverage}
|
| 834 |
+
\caption{Risk--coverage for grounded city reconciliation (650 probes). Operating points
|
| 835 |
+
annotated; the confidence supports thresholded abstention.}
|
| 836 |
+
\label{fig:rc}
|
| 837 |
+
\end{figure}
|
| 838 |
+
|
| 839 |
+
\section{Limitations}
|
| 840 |
+
Reference coverage is the recall ceiling: entities absent from the taxonomy abstain by
|
| 841 |
+
design, which is safe but not helpful; coverage work (larger gazetteers, ROR for
|
| 842 |
+
organizations) moves recall directly. Our damage metric is convention-tolerant for case
|
| 843 |
+
and whitespace but still counts alias expansion (\texttt{NYC}$\to$\texttt{New York}) as
|
| 844 |
+
damage when the gold keeps the alias---a value-level convention question we leave open.
|
| 845 |
+
The confidence signal is over-confident (ECE \ece); temperature scaling is future
|
| 846 |
+
work. The injected half of the suite, while seeded and reproducible, inherits the
|
| 847 |
+
injector's error model; we mitigate with the real-error slice and report both. All
|
| 848 |
+
weight-training experiments (SFT and GRPO) use a single model family (Qwen3), so
|
| 849 |
+
the negative result of \S\ref{sec:negative} is family-scoped until replicated on a
|
| 850 |
+
second family. PII
|
| 851 |
+
coverage is English-only, and we make no de-identification guarantee. Finally, the
|
| 852 |
+
fine-tune headline is reported with multi-seed confidence intervals, but the wide-suite
|
| 853 |
+
model row is single-seed for cost reasons and scoped as such.
|
| 854 |
+
|
| 855 |
+
\section{Conclusion}
|
| 856 |
+
A planner/executor decomposition with plan-level selective prediction --- the model
|
| 857 |
+
proposes, a deterministic engine executes, a verifier gates every mapping --- turns
|
| 858 |
+
LLM data cleaning from a trust liability into an auditable system: every change is a
|
| 859 |
+
named, reversible operation; uncertain actions become review flags rather than silent
|
| 860 |
+
corruptions; and the evaluation itself is built to resist gaming. The post-freeze
|
| 861 |
+
program sharpened the architecture into a finding: across
|
| 862 |
+
five further fine-tunes and a three-arm GRPO pilot, the weights never moved
|
| 863 |
+
performance on never-seen tables --- deterministic visibility, grounding, consensus, and
|
| 864 |
+
verification did, at zero silent edits across \nWild{} wild tables and a
|
| 865 |
+
\nTrust{}-table trust audit. The scaling arm completes the picture: the bounded null
|
| 866 |
+
is about fine-tuning small weights, not about capability --- two of three zero-shot
|
| 867 |
+
24--31B planners dropped into the unchanged verifier harness exceed the
|
| 868 |
+
fine-tune's operating point (\S\ref{sec:scaling}), so the architecture is
|
| 869 |
+
planner-agnostic: capability gains arrive as better operating points without
|
| 870 |
+
retraining. The shipped system runs
|
| 871 |
+
entirely locally on commodity hardware and no data leaves the machine; the
|
| 872 |
+
scaling-arm planners were measured via hosted endpoints, but all are locally
|
| 873 |
+
deployable open weights. We believe the recipe---propose/execute decomposition,
|
| 874 |
+
verification-by-execution, retrieval-grounded outputs, and selective prediction over
|
| 875 |
+
deterministic capabilities---is a template for deploying small specialized models on
|
| 876 |
+
other structured tasks.
|
| 877 |
+
|
| 878 |
+
\section*{Reproducibility}
|
| 879 |
+
\begin{sloppypar}
|
| 880 |
+
The model weights are public:
|
| 881 |
+
\url{https://huggingface.co/ricalanis/scrubdata-qwen3-4b-v6-q8}. Code, evaluation
|
| 882 |
+
suite, and result artifacts are released at the project repository,
|
| 883 |
+
\url{https://github.com/ricalanis/scrubdata-hackathon} (public upon publication,
|
| 884 |
+
available to reviewers from the initial submission). The \textsc{WildClean}
|
| 885 |
+
bundle --- redistributable dirty/gold pairs, the GitTables audit slice, open
|
| 886 |
+
vocabularies, result JSONs, and license-gated loaders for the non-redistributable
|
| 887 |
+
pairs --- is a public Hugging Face dataset
|
| 888 |
+
(\url{https://huggingface.co/datasets/ricalanis/wildclean}). The shipped product
|
| 889 |
+
planner is the identical code path measured here (\texttt{scrubdata/active.py}).
|
| 890 |
+
\end{sloppypar}
|
| 891 |
+
|
| 892 |
+
\paragraph{Release integrity.} Our own reproducibility QA discovered that the
|
| 893 |
+
published Q8\_0 GGUF was corrupted by an export bug (the export declared a wrong
|
| 894 |
+
end-of-generation token id inside the Qwen3 vocabulary, so generation degenerated into
|
| 895 |
+
tool-call loops on all runtimes; a base-model control isolated the fault to the
|
| 896 |
+
export, not the adapter). It has been re-exported from the v6 adapter and
|
| 897 |
+
replaced under the same filename, with both sha256 checksums recorded in the
|
| 898 |
+
model card's Integrity section. Third-party reproduction of the model-path
|
| 899 |
+
numbers additionally requires constrained decoding on long prompts ---
|
| 900 |
+
\texttt{format=json} under Ollama, or
|
| 901 |
+
\texttt{suppress\_tokens=[151657,151658]} under transformers --- which is now
|
| 902 |
+
documented in the model card and \texttt{notebooks/Modelfile}.
|
| 903 |
+
|
| 904 |
+
\paragraph{Setup.} Clone the repository and run \texttt{uv sync} (Python 3.12;
|
| 905 |
+
\texttt{uv} resolves the pinned environment). The non-redistributable benchmark
|
| 906 |
+
pairs materialize from their original sources with the \textsc{WildClean}
|
| 907 |
+
\texttt{loaders.py}. Model-path results additionally need the released Q8\_0 GGUF
|
| 908 |
+
served by a local Ollama (\texttt{SCRUBDATA\_MODEL}); every deterministic-path
|
| 909 |
+
number runs with no model at all. Baran runs in the separate pinned environment
|
| 910 |
+
documented at the top of \texttt{eval/run\_baran.py}; Jellyfish-13B runs remotely
|
| 911 |
+
via Modal.
|
| 912 |
+
|
| 913 |
+
\paragraph{One command per reported number} (all from the repository root, at the
|
| 914 |
+
released revision):
|
| 915 |
+
|
| 916 |
+
\begin{center}
|
| 917 |
+
\footnotesize
|
| 918 |
+
\begin{tabular}{@{}ll@{}}
|
| 919 |
+
\toprule
|
| 920 |
+
Reported result & Command \\
|
| 921 |
+
\midrule
|
| 922 |
+
Wide-suite comparison (Table~\ref{tab:money}) & \texttt{python -m eval.run\_real\_multi --out eval/results} \\
|
| 923 |
+
Precision--coverage curve + gate & \texttt{python -m eval.precision\_curve} \\
|
| 924 |
+
\quad (Figure~\ref{fig:pc}, \S\ref{sec:ws1results}) & \texttt{\ \ --plan eval/results/v6\_hospital\_raw\_plan.json --union} \\
|
| 925 |
+
Ablations & \texttt{python -m eval.ablations} \\
|
| 926 |
+
Calibration (Figure~\ref{fig:rc}) & \texttt{python -m eval.calibration} \\
|
| 927 |
+
PII leak test & \texttt{python -m eval.pii\_leak} \\
|
| 928 |
+
Baran baseline & \texttt{python eval/run\_baran.py}, then \\
|
| 929 |
+
& \texttt{python -m eval.baselines\_learned --score-baran} \\
|
| 930 |
+
Jellyfish baseline & \texttt{modal run scripts/modal\_jellyfish.py} \\
|
| 931 |
+
\midrule
|
| 932 |
+
Paired bench (\S\ref{sec:wild}) & \texttt{python -m eval.paired\_bench} \\
|
| 933 |
+
Wild bench (\S\ref{sec:wild}) & \texttt{python -m eval.wild\_bench} \\
|
| 934 |
+
GitTables trust audit (\S\ref{sec:wild}) & \texttt{python -m eval.gittables\_audit} \\
|
| 935 |
+
Held-out-source generalization & \texttt{python -m eval.generalization} \\
|
| 936 |
+
\midrule
|
| 937 |
+
Scorer validation (\S\ref{sec:eval}) & \texttt{python -m pytest tests/test\_wildclean\_scorer.py} \\
|
| 938 |
+
Degenerate baselines (Table~\ref{tab:degenerate}) & \texttt{python -m eval.degenerate} \\
|
| 939 |
+
TOST equivalence (\S\ref{sec:negative}) & \texttt{python -m eval.equivalence} \\
|
| 940 |
+
Label curve (Figure~\ref{fig:labelcurve}) & \texttt{python -m eval.label\_curve} \\
|
| 941 |
+
Per-dataset table (Table~\ref{tab:perdataset}) & \texttt{python -m eval.raha\_table} \\
|
| 942 |
+
Self-consistency vote/pool (\S\ref{sec:negative}) & \texttt{python -m eval.sc\_rerank --model scrubdata-ft --n 16} \\
|
| 943 |
+
Scaling arm (Table~\ref{tab:scaling}) & \texttt{python -m eval.scaling\_arm} \\
|
| 944 |
+
\bottomrule
|
| 945 |
+
\end{tabular}
|
| 946 |
+
\end{center}
|
| 947 |
+
|
| 948 |
+
\begin{thebibliography}{20}
|
| 949 |
+
\bibitem{raha} M.~Mahdavi, Z.~Abedjan, R.~Castro Fernandez, S.~Madden, M.~Ouzzani,
|
| 950 |
+
M.~Stonebraker, N.~Tang. Raha: A Configuration-Free Error Detection System. SIGMOD
|
| 951 |
+
2019; M.~Mahdavi, Z.~Abedjan. Baran: Effective Error Correction via a Unified Context
|
| 952 |
+
Representation and Transfer Learning. PVLDB 13(11):1948--1961, 2020.
|
| 953 |
+
\bibitem{holoclean} T.~Rekatsinas, X.~Chu, I.~F.~Ilyas, C.~R\'e. HoloClean: Holistic
|
| 954 |
+
Data Repairs with Probabilistic Inference. PVLDB 10(11), 2017. arXiv:1702.00820.
|
| 955 |
+
\bibitem{garf} J.~Peng, D.~Shen, N.~Tang, T.~Liu, Y.~Kou, T.~Nie, H.~Cui, G.~Yu.
|
| 956 |
+
Self-Supervised and Interpretable Data Cleaning with Sequence Generative Adversarial
|
| 957 |
+
Networks (GARF). PVLDB 16(3):433--446, 2022.
|
| 958 |
+
\bibitem{wrangle} A.~Narayan, I.~Chami, L.~Orr, S.~Arora, C.~R\'e. Can Foundation
|
| 959 |
+
Models Wrangle Your Data? PVLDB 16(4):738--746, 2022. arXiv:2205.09911.
|
| 960 |
+
\bibitem{jellyfish} H.~Zhang, Y.~Dong, C.~Xiao, M.~Oyamada. Jellyfish:
|
| 961 |
+
Instruction-Tuning Local Large Language Models for Data Preprocessing. EMNLP 2024.
|
| 962 |
+
arXiv:2312.01678.
|
| 963 |
+
\bibitem{cocoon} S.~Zhang, Z.~Huang, E.~Wu. Data Cleaning Using Large Language Models
|
| 964 |
+
(Cocoon). arXiv:2410.15547, 2024 (preprint; no published reproduction).
|
| 965 |
+
\bibitem{zeroed} W.~Ni, K.~Zhang, X.~Miao, X.~Zhao, Y.~Wu, Y.~Wang, J.~Yin. ZeroED:
|
| 966 |
+
Hybrid Zero-Shot Error Detection Through Large Language Model Reasoning. ICDE 2025.
|
| 967 |
+
arXiv:2504.05345.
|
| 968 |
+
\bibitem{forested} M.~Wang, J.~Wang, Q.~Liu, X.~Xu, Z.~Xing, L.~Zhu, W.~Zhang.
|
| 969 |
+
Ensembling LLM-Induced Decision Trees for Explainable and Robust Error Detection.
|
| 970 |
+
arXiv:2512.07246, 2025 (preprint).
|
| 971 |
+
\bibitem{autotest} Q.~Chen, Y.~He, R.~C.-W.~Wong, W.~Cui, S.~Ge, H.~Zhang, D.~Zhang,
|
| 972 |
+
S.~Chaudhuri. Auto-Test: Learning Semantic-Domain Constraints for Unsupervised Error
|
| 973 |
+
Detection in Tables. SIGMOD 2025. arXiv:2504.10762.
|
| 974 |
+
\bibitem{gidcl} M.~Yan, Y.~Wang, Y.~Wang, X.~Miao, J.~Li. GIDCL: A Graph-Enhanced
|
| 975 |
+
Interpretable Data Cleaning Framework with Large Language Models. Proc.\ ACM Manag.\
|
| 976 |
+
Data 2(6), Article 236, 2024 (SIGMOD).
|
| 977 |
+
\bibitem{spreadsheetrl} B.~Chi, Y.~Xie, M.~Wu, J.~Yang, J.~Jiang, Z.~Li, et al.
|
| 978 |
+
Spreadsheet-RL: Advancing Large Language Model Agents on Realistic Spreadsheet Tasks
|
| 979 |
+
via Reinforcement Learning. arXiv:2605.22642, 2026.
|
| 980 |
+
\bibitem{distort} A.~Dutta, H.~Nigam, H.~Hasanbeig, A.~Radhakrishna, S.~Gulwani.
|
| 981 |
+
An Empirical Investigation of Robustness in Large Language Models under Tabular
|
| 982 |
+
Distortions. arXiv:2601.05009, 2026.
|
| 983 |
+
\bibitem{debate} C.~Parmar, A.~Mehta, H.~Wu, J.~Ramamurthy, S.~Medhekar. When Helping
|
| 984 |
+
Hurts and How to Fix It: Multi-Agent Debate for Data Cleaning. arXiv:2606.02866, 2026.
|
| 985 |
+
\bibitem{tabler1} Z.~Yang, L.~Chen, A.~Cohan, Y.~Zhao. Table-R1: Inference-Time
|
| 986 |
+
Scaling for Table Reasoning. EMNLP 2025. arXiv:2505.23621.
|
| 987 |
+
\bibitem{spurious} R.~Shao, S.~S.~Li, R.~Xin, S.~Geng, Y.~Wang, et al. Spurious
|
| 988 |
+
Rewards: Rethinking Training Signals in RLVR. arXiv:2506.10947, 2025.
|
| 989 |
+
\bibitem{tablegpt} P.~Li, Y.~He, D.~Yashar, W.~Cui, S.~Ge, H.~Zhang, D.~Rifinski
|
| 990 |
+
Fainman, D.~Zhang, S.~Chaudhuri. Table-GPT: Table Fine-tuned GPT for Diverse Table
|
| 991 |
+
Tasks. Proc.\ ACM Manag.\ Data 2(3), Article 176, 2024 (SIGMOD). arXiv:2310.09263.
|
| 992 |
+
\bibitem{retclean} Z.~A.~Naeem, M.~S.~Ahmad, M.~Eltabakh, M.~Ouzzani, N.~Tang.
|
| 993 |
+
RetClean: Retrieval-Based Data Cleaning Using LLMs and Data Lakes. PVLDB 17(12), 2024
|
| 994 |
+
(demo). arXiv:2303.16909.
|
| 995 |
+
\bibitem{turl} X.~Deng, H.~Sun, A.~Lees, Y.~Wu, C.~Yu. TURL: Table Understanding
|
| 996 |
+
through Representation Learning. PVLDB 14(3):307--319, 2021. arXiv:2006.14806.
|
| 997 |
+
\bibitem{tablellama} T.~Zhang, X.~Yue, Y.~Li, H.~Sun. TableLlama: Towards Open Large
|
| 998 |
+
Generalist Models for Tables. NAACL 2024. arXiv:2311.09206.
|
| 999 |
+
\bibitem{belotti} F.~Belotti, F.~Dadda, M.~Cremaschi, R.~Avogadro, M.~Palmonari.
|
| 1000 |
+
Evaluating LLMs on Entity Disambiguation in Tables. arXiv:2408.06423, 2024 (preprint).
|
| 1001 |
+
\bibitem{racoon} L.~L.~Wei, G.~Xiao, M.~Balazinska. RACOON: An LLM-based Framework for
|
| 1002 |
+
Retrieval-Augmented Column Type Annotation with a Knowledge Graph. arXiv:2409.14556,
|
| 1003 |
+
2024 (preprint).
|
| 1004 |
+
\bibitem{mtab} P.~Nguyen, N.~Kertkeidkachorn, R.~Ichise, H.~Takeda. MTab: Matching
|
| 1005 |
+
Tabular Data to Knowledge Graph using Probability Models. SemTab/ISWC 2019.
|
| 1006 |
+
arXiv:1910.00246.
|
| 1007 |
+
\bibitem{selective} R.~El-Yaniv, Y.~Wiener. On the Foundations of Noise-free Selective
|
| 1008 |
+
Classification. JMLR 11:1605--1641, 2010; Y.~Geifman, R.~El-Yaniv. Selective
|
| 1009 |
+
Classification for Deep Neural Networks. NeurIPS 2017.
|
| 1010 |
+
\bibitem{openmed} M.~Panahi. OpenMed NER: Open-Source, Domain-Adapted State-of-the-Art
|
| 1011 |
+
Transformers for Biomedical NER Across 12 Public Datasets. arXiv:2508.01630, 2025
|
| 1012 |
+
(preprint).
|
| 1013 |
+
\bibitem{lakens} D.~Lakens. Equivalence Tests: A Practical Primer for t Tests,
|
| 1014 |
+
Correlations, and Meta-Analyses. Social Psychological and Personality Science
|
| 1015 |
+
8(4):355--362, 2017.
|
| 1016 |
+
\bibitem{grouse} S.~Muller, A.~Loison, B.~Omrani, G.~Viaud. GroUSE: A Benchmark
|
| 1017 |
+
to Evaluate Evaluators in Grounded Question Answering. COLING 2025.
|
| 1018 |
+
arXiv:2409.06595.
|
| 1019 |
+
\end{thebibliography}
|
| 1020 |
+
|
| 1021 |
+
\end{document}
|
docs/paper/numbers.tex
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
% Result macros — every value regenerates from one command (see Reproducibility section).
|
| 2 |
+
% Headline fine-tune (synthetic frozen gold, Layer 1)
|
| 3 |
+
\newcommand{\canonFOurs}{0.815} % v5 bf16, n=20 (single run; multi-seed below)
|
| 4 |
+
\newcommand{\canonFOursBest}{0.901} % v4 Q8 measurement
|
| 5 |
+
\newcommand{\canonFBig}{0.452} % large generic model (GLM-class, zero-shot)
|
| 6 |
+
\newcommand{\canonFHeur}{0.152} % rule heuristic
|
| 7 |
+
\newcommand{\canonFMultiSeed}{$0.803 \pm 0.009$ (95\% CI, 3 training seeds)}
|
| 8 |
+
\newcommand{\opFOurs}{0.957}
|
| 9 |
+
\newcommand{\jsonValidOurs}{0.950}
|
| 10 |
+
|
| 11 |
+
% Hospital head-to-head, repairs-only churn-neutral (both paths incl. errors-are-rare gates)
|
| 12 |
+
\newcommand{\hospRecallGrounded}{0.257}
|
| 13 |
+
\newcommand{\hospRecallFreq}{0.293}
|
| 14 |
+
\newcommand{\hospPrecGrounded}{0.845}
|
| 15 |
+
\newcommand{\hospPrecFreq}{0.871}
|
| 16 |
+
\newcommand{\hospModelRecall}{0.424} % fine-tuned v5, repair_recall (vs 0.000 synthetic-only)
|
| 17 |
+
|
| 18 |
+
% Wide-suite comparison (3 seeds, churn-neutral metric) — money table.
|
| 19 |
+
% PRIMARY = HEAD regeneration 2026-06-12 (eval/results/money_table_head.json,
|
| 20 |
+
% post-capability system, NaN metric fix in). Freeze (2026-06-10) values kept
|
| 21 |
+
% as *Freeze macros where the narrative discusses the freeze-version system.
|
| 22 |
+
\newcommand{\northGrounded}{0.224}
|
| 23 |
+
\newcommand{\northGroundedCI}{0.004}
|
| 24 |
+
\newcommand{\northORFp}{0.211}
|
| 25 |
+
\newcommand{\northORKnn}{0.122}
|
| 26 |
+
\newcommand{\realFGrounded}{0.225}
|
| 27 |
+
\newcommand{\realFORKnn}{0.058}
|
| 28 |
+
\newcommand{\damageGrounded}{0.092}
|
| 29 |
+
\newcommand{\damageORKnn}{0.096}
|
| 30 |
+
\newcommand{\northGroundedFreeze}{0.203}
|
| 31 |
+
\newcommand{\realFGroundedFreeze}{0.174}
|
| 32 |
+
\newcommand{\damageGroundedFreeze}{0.104}
|
| 33 |
+
|
| 34 |
+
% SHIPPED system (verified union, v6 adapter) on suite — scripts/modal_eval_suite.py
|
| 35 |
+
\newcommand{\modelRealF}{0.142}
|
| 36 |
+
\newcommand{\modelDamage}{0.015}
|
| 37 |
+
\newcommand{\modelAbstain}{1.000}
|
| 38 |
+
|
| 39 |
+
% Ablations (churn-neutral metric, 3 seeds — eval/results/ablations.json)
|
| 40 |
+
\newcommand{\ablFull}{0.203}
|
| 41 |
+
\newcommand{\ablNoGround}{0.223}
|
| 42 |
+
\newcommand{\ablNoAbstain}{0.190}
|
| 43 |
+
\newcommand{\ablNoMargin}{0.197}
|
| 44 |
+
\newcommand{\ablNoCase}{0.201}
|
| 45 |
+
\newcommand{\ablFullRealF}{0.174}
|
| 46 |
+
\newcommand{\ablNoGroundRealF}{0.135}
|
| 47 |
+
\newcommand{\ablNoAbstainDamage}{0.108}
|
| 48 |
+
\newcommand{\ablFullDamage}{0.104}
|
| 49 |
+
\newcommand{\ablFullAbstain}{1.000}
|
| 50 |
+
\newcommand{\ablNoAbstainAbstain}{0.250}
|
| 51 |
+
|
| 52 |
+
% Selective prediction / calibration
|
| 53 |
+
\newcommand{\aurc}{0.120}
|
| 54 |
+
\newcommand{\ece}{0.169}
|
| 55 |
+
\newcommand{\precAtDefault}{0.899} % threshold 0.84
|
| 56 |
+
\newcommand{\covAtDefault}{0.669}
|
| 57 |
+
\newcommand{\threshNinetyFive}{0.91}
|
| 58 |
+
\newcommand{\covNinetyFive}{0.206}
|
| 59 |
+
|
| 60 |
+
% PII transfer validation (OpenMed-PII 44M on bare cells)
|
| 61 |
+
\newcommand{\piiNameBare}{100\%}
|
| 62 |
+
\newcommand{\piiAddrBare}{100\%}
|
| 63 |
+
\newcommand{\piiNegRate}{43\%}
|
| 64 |
+
\newcommand{\piiLeakRate}{zero (0/360)} % eval/pii_leak.py, seeded
|
| 65 |
+
\newcommand{\realFORFp}{0.039}
|
| 66 |
+
\newcommand{\injFGrounded}{0.224}
|
| 67 |
+
\newcommand{\injFORFp}{0.282}
|
| 68 |
+
\newcommand{\damageORFp}{0.001}
|
| 69 |
+
\newcommand{\hospModelRecallVSix}{0.475}
|
| 70 |
+
\newcommand{\hospModelPrecVSix}{0.185}
|
| 71 |
+
|
| 72 |
+
% WS1 — plan-level selective prediction (verified union planner)
|
| 73 |
+
% repro: uv run python -m eval.precision_curve --plan eval/results/v6_hospital_raw_plan.json --union
|
| 74 |
+
\newcommand{\unionGatePrec}{0.905}
|
| 75 |
+
\newcommand{\unionGateCov}{0.413}
|
| 76 |
+
\newcommand{\modelGatePrec}{0.993} % gated model plan alone, tau=0.5 (146/147 correct)
|
| 77 |
+
\newcommand{\modelGateCov}{0.287}
|
| 78 |
+
\newcommand{\unionChanged}{232}
|
| 79 |
+
\newcommand{\unionFixed}{210}
|
| 80 |
+
\newcommand{\hospErrors}{509}
|
| 81 |
+
% 3 training seeds (mixA 21=shipped/25/26), union @ tau=0.5 (eval/results/union_gate_3seed.json)
|
| 82 |
+
\newcommand{\unionGateThreeSeedPrec}{$0.891 \pm 0.012$}
|
| 83 |
+
\newcommand{\unionGateThreeSeedCov}{$0.396 \pm 0.025$}
|
| 84 |
+
% WS2 pair-profiles (measured-and-cut): constrained raw plan / composed at tau=0.5
|
| 85 |
+
\newcommand{\pairsRawPrec}{0.760}
|
| 86 |
+
\newcommand{\pairsRawCov}{0.348}
|
| 87 |
+
\newcommand{\pairsUnionPrec}{0.876}
|
| 88 |
+
\newcommand{\pairsUnionCov}{0.387}
|
| 89 |
+
|
| 90 |
+
% ===== v2 (post-freeze system, 2026-06-11/12) =====
|
| 91 |
+
\newcommand{\nPairs}{42}
|
| 92 |
+
\newcommand{\nWild}{35}
|
| 93 |
+
\newcommand{\nTrust}{239}
|
| 94 |
+
\newcommand{\unseenMacroF}{0.363}
|
| 95 |
+
\newcommand{\unseenMacroDamage}{0.0219}
|
| 96 |
+
\newcommand{\wildRecovery}{0.207}
|
| 97 |
+
\newcommand{\genFTwo}{0.058}
|
| 98 |
+
\newcommand{\genVRTwo}{0.108}
|
| 99 |
+
\newcommand{\genDamageTwo}{0.036}
|
| 100 |
+
\newcommand{\ttFOne}{0.955--0.957}
|
| 101 |
+
\newcommand{\flightsVoteF}{0.164}
|
| 102 |
+
\newcommand{\flightsBaseF}{0.044}
|
| 103 |
+
\newcommand{\hospVoteHeur}{0.186}
|
| 104 |
+
\newcommand{\hospBaseHeur}{0.092}
|
| 105 |
+
\newcommand{\gidclHosp}{0.97}
|
| 106 |
+
|
| 107 |
+
% WS4 — learned-repair baselines, Raha real slice only (eval/baselines_learned.py)
|
| 108 |
+
\newcommand{\realFBaran}{0.811} % oracle detection + 20 gold labels: upper bound
|
| 109 |
+
\newcommand{\realFBaranCI}{0.018} % 3 label-sampling seeds
|
| 110 |
+
\newcommand{\damageBaran}{0.003}
|
| 111 |
+
\newcommand{\precBaran}{0.824}
|
| 112 |
+
\newcommand{\realFJelly}{0.074} % Jellyfish-13B ED+DI (scripts/modal_jellyfish.py)
|
| 113 |
+
\newcommand{\damageJelly}{0.027}
|
| 114 |
+
|
| 115 |
+
% W1.a — matched-budget label curve, 5-dataset Raha macro (eval/results/label_curve.json)
|
| 116 |
+
\newcommand{\realFBaranZero}{0.000} % Baran k=0 (oracle positions retained), 3 seeds
|
| 117 |
+
\newcommand{\realFBaranFive}{0.504} % Baran k=5
|
| 118 |
+
\newcommand{\realFOursFive}{0.282} % ours k=5 (labels validate/expand accept set only)
|
| 119 |
+
\newcommand{\realFOursTwenty}{0.351} % ours k=20
|
| 120 |
+
\newcommand{\realFOursHead}{0.225} % ours k=0 at HEAD (post-freeze capabilities)
|
| 121 |
+
|
| 122 |
+
% W4.3/4.4 — degenerate baselines + cost-weighted scores (eval/results/degenerate.json)
|
| 123 |
+
\newcommand{\degShippedF}{0.343}
|
| 124 |
+
\newcommand{\degShippedP}{0.576}
|
| 125 |
+
\newcommand{\degShippedDamage}{0.023}
|
| 126 |
+
\newcommand{\degShippedPhiOne}{$+0.13$}
|
| 127 |
+
|
| 128 |
+
% W1.c — zero-shot capability scaling arm (eval/results/scaling_arm.json)
|
| 129 |
+
\newcommand{\scalePrecBig}{0.915} % devstral-24B and gemma4-31B union point
|
| 130 |
+
\newcommand{\scaleCovBig}{0.485}
|
| 131 |
+
\newcommand{\scalePrecNemo}{0.877}
|
| 132 |
+
\newcommand{\scaleCovNemo}{0.336}
|
| 133 |
+
% hosted wall-clock for the 5 planning calls, single capture (scaling_arm.json runtime_s)
|
| 134 |
+
\newcommand{\runtimeDevstral}{135}
|
| 135 |
+
\newcommand{\runtimeNemo}{114}
|
| 136 |
+
\newcommand{\runtimeGemma}{104}
|
| 137 |
+
|
| 138 |
+
% Frontier zero-shot reference point: hospital repair recall of a vanilla frontier-scale
|
| 139 |
+
% cloud planner run through the same propose/execute harness (2026-06-04..07 architecture
|
| 140 |
+
% validation captures, pre-verifier; recorded in project training-run logs). Quoted in the
|
| 141 |
+
% fine-tune results subsection as the zero-shot ceiling the v6 recall approaches.
|
| 142 |
+
\newcommand{\frontierZeroShotRecall}{0.51}
|
| 143 |
+
|
| 144 |
+
% R3 — absolute champion GEN-F1 basis of the equivalence retrain series
|
| 145 |
+
% (eval/results/equivalence.json spec.champion_macro_gen_f1 = 0.014606)
|
| 146 |
+
\newcommand{\genChampionBasis}{0.0146}
|
eval/README.md
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Eval harness + goalpost
|
| 2 |
+
|
| 3 |
+
Measures any planner against a **held-out** synthetic gold set (seed differs from
|
| 4 |
+
training, and gold is filtered to oracle-solvable so the ceiling is a clean 1.0).
|
| 5 |
+
|
| 6 |
+
```bash
|
| 7 |
+
uv run eval/run_eval.py --n 300 --seed 4242
|
| 8 |
+
```
|
| 9 |
+
|
| 10 |
+
Adopts the researched tooling: `jsonschema` for plan validity; set-based micro-F1 for
|
| 11 |
+
operations and canonicalization mappings; the **executor itself** for end-to-end
|
| 12 |
+
cell-recovery (the Raha-style dirty→clean comparison). promptfoo + `llm-rubric` will
|
| 13 |
+
wrap the report-quality layer once a model exists.
|
| 14 |
+
|
| 15 |
+
## Metrics
|
| 16 |
+
- **json_valid** — plan conforms to the schema (`eval/metrics.py:PLAN_SCHEMA`).
|
| 17 |
+
- **op_f1 / op_r** — micro-F1 / recall over `(column, operation)` pairs vs gold.
|
| 18 |
+
- **canon_f1 / canon_r** — micro-F1 / recall over `(column, raw→canonical)` mapping
|
| 19 |
+
pairs. *This is the fuzzy skill rules can't do — the whole reason for the model.*
|
| 20 |
+
- **recovery** — fraction of clean-reference cells recovered by executing the plan.
|
| 21 |
+
|
| 22 |
+
## Baseline (measured) and the goalpost
|
| 23 |
+
|
| 24 |
+
Two reference systems frame every run:
|
| 25 |
+
- **ORACLE** = the gold plan → the ceiling.
|
| 26 |
+
- **HEURISTIC** (`scrubdata.mock_plan`) = the rule-based baseline the model must beat.
|
| 27 |
+
|
| 28 |
+
Measured on the frozen 300-example gold set (`eval/gold.jsonl`, **value_counts/aggregation
|
| 29 |
+
format**):
|
| 30 |
+
|
| 31 |
+
| system | json_valid | op_f1 | canon_f1 | canon_r | recovery |
|
| 32 |
+
|---|---|---|---|---|---|
|
| 33 |
+
| ORACLE (gold) | 1.000 | 1.000 | 1.000 | 1.000 | **1.000** |
|
| 34 |
+
| HEURISTIC (baseline) | 1.000 | 0.932 | **0.189** | 0.129 | **0.637** |
|
| 35 |
+
|
| 36 |
+
**Reading:** with case-folding + typo-clustering the heuristic does the *easy*
|
| 37 |
+
canonicalization (collapse to most-frequent surface), but it's still ~blind to
|
| 38 |
+
**alias/semantic** canonicalization (`USA`→`United States`, `NYC`→`New York`) — canon_f1
|
| 39 |
+
0.19 vs the oracle's 1.0. That gap is the fine-tuned model's job. (Earlier, on the old
|
| 40 |
+
sample-rows format, a fine-tune reached canon_f1 0.86 vs a big vanilla model's 0.45 —
|
| 41 |
+
proving small-aligned > big-generic; the v4 retrain re-establishes this on the new format.)
|
| 42 |
+
|
| 43 |
+
### 🎯 Goalpost for the fine-tuned Qwen3-4B
|
| 44 |
+
| metric | baseline | **target** | ceiling |
|
| 45 |
+
|---|---|---|---|
|
| 46 |
+
| json_valid | 1.000 | **≥ 0.99** | 1.000 |
|
| 47 |
+
| op_f1 | 0.932 | **≥ 0.98** | 1.000 |
|
| 48 |
+
| canon_f1 | 0.189 | **≥ 0.85** | 1.000 |
|
| 49 |
+
| recovery | 0.637 | **≥ 0.95** | 1.000 |
|
| 50 |
+
|
| 51 |
+
A fine-tune that hits these clearly beats the (now stronger) heuristic and approaches the
|
| 52 |
+
oracle — the headline being **canon_f1 0.189 → ≥0.85** (alias-level canonicalization) and
|
| 53 |
+
**recovery 0.637 → ≥0.95**.
|
| 54 |
+
|
| 55 |
+
## Plugging in the model
|
| 56 |
+
`evaluate(planner, gold)` takes any `planner(dirty_df, gold_plan) -> plan dict`. For
|
| 57 |
+
the model, wrap inference (build prompt via `scrubdata.prompt`, parse JSON) and pass it
|
| 58 |
+
in alongside the two reference systems. Track the table every fine-tune iteration; the
|
| 59 |
+
per-metric delta vs baseline is the cheap regression signal.
|
| 60 |
+
|
| 61 |
+
## Layer 2 — real out-of-distribution data (`uv run eval/run_real.py`)
|
| 62 |
+
|
| 63 |
+
Raha `hospital` (1000×20, row-aligned dirty/clean). Errors are char-substitution typos
|
| 64 |
+
(`birminghxm`→`birmingham`) — only ~2.5% of cells. Scored with the Raha **repair**
|
| 65 |
+
protocol (the right metric when data is already mostly correct):
|
| 66 |
+
|
| 67 |
+
| system | recovery | repair_recall | repair_prec | broken |
|
| 68 |
+
|---|---|---|---|---|
|
| 69 |
+
| NO-OP (dirty as-is) | 0.975 | 0.000 | 0.000 | 0 |
|
| 70 |
+
| HEURISTIC (baseline) | 0.880 | **0.293** | 0.065 | 2041 |
|
| 71 |
+
|
| 72 |
+
(Typo-clustering now fixes ~29% of the real char-substitution errors — up from 0. The
|
| 73 |
+
model should push repair_recall higher and improve repair_prec.)
|
| 74 |
+
|
| 75 |
+
**Reading (honest + important):** before typo-clustering was added, the rule heuristic fixed **0** typos (it now fixes ~29%, per the table above). Its ~2,000 changed
|
| 76 |
+
cells are **convention divergence, not errors** — our tool parses `100%`→`1.0` and
|
| 77 |
+
reformats phones, which this benchmark stores as raw text. That's product value, so raw
|
| 78 |
+
`recovery`/`broken` *understates* a standardizing tool on a foreign benchmark. The honest
|
| 79 |
+
metric here is **`repair_recall`** — did we fix the actual char-substitution typos
|
| 80 |
+
(`birminghxm`→`birmingham`)? The pre-clustering heuristic couldn't (it scored 0); cluster-canonicalization is
|
| 81 |
+
the model's job. Two takeaways:
|
| 82 |
+
1. **The headline real-data metric is `repair_recall`** (error-fixing), not recovery.
|
| 83 |
+
2. **Product feature surfaced:** offer a "preserve original formats" toggle — some users
|
| 84 |
+
want raw representation kept; standardizing is the default but should be reversible
|
| 85 |
+
(matches PRODUCT.md's trust contract).
|
| 86 |
+
|
| 87 |
+
### 🎯 Real-data goalpost (fine-tuned model)
|
| 88 |
+
| metric | NO-OP | HEURISTIC | **target** | note |
|
| 89 |
+
|---|---|---|---|---|
|
| 90 |
+
| **repair_recall** | 0.000 | 0.293 | **≥ 0.30** | the real test — fix typos via clustering |
|
| 91 |
+
| repair_prec | 0.000 | 0.065 | **≥ 0.70** | of cells changed, fraction that fixed an error |
|
| 92 |
+
| recovery | 0.975 | 0.880 | report-only | convention-sensitive; not a pass/fail gate |
|
| 93 |
+
|
| 94 |
+
The model plugs into `_score(dirty, clean, model_output)` exactly like the heuristic.
|
| 95 |
+
|
| 96 |
+
> Data auto-fetched to `data/real/hospital/` (gitignored). Add Flights/Beers/CleanML the
|
| 97 |
+
> same way for breadth.
|
| 98 |
+
|
| 99 |
+
## Scale: aggregation + agentic batching (validated)
|
| 100 |
+
|
| 101 |
+
Cleaning *large* tables doesn't mean bigger prompts — it means reasoning over **patterns**:
|
| 102 |
+
- **Aggregation** — the profiler sends per-column `value_counts` (`[value, frequency]`), so
|
| 103 |
+
the prompt size depends on DISTINCT values, not rows. Rare typos sit at the tail next to
|
| 104 |
+
their dominant canonical (`birminghxm`:1 vs `birmingham`:312) — visible at any scale.
|
| 105 |
+
- **Column batching** — `scrubdata.model_planner.make_batched_planner` plans a wide table
|
| 106 |
+
in small column-batches, so a 20-column table never blows one prompt.
|
| 107 |
+
|
| 108 |
+
**Validated** on the real Raha hospital table (1000×20) with a *vanilla* model (no retrain):
|
| 109 |
+
**repair_recall 0.509** (fixed 259/509 typos), vs **0.000** for the old one-shot+sample-rows
|
| 110 |
+
approach. The v4 fine-tune trains on this `value_counts` format.
|
| 111 |
+
|
| 112 |
+
---
|
| 113 |
+
|
| 114 |
+
## The wide suite (current north-star)
|
| 115 |
+
|
| 116 |
+
The single-dataset hospital metric was retired as north-star (biased: one table,
|
| 117 |
+
recall-only, convention-sensitive, abstain-blind). The current harness:
|
| 118 |
+
|
| 119 |
+
- **`run_real_multi.py`** — 65-dataset suite (5 Raha real-error benchmarks + seeded
|
| 120 |
+
error injection over 15 harvested open-data domains), scored with a **churn-neutral**
|
| 121 |
+
metric (pure case/whitespace rewrites that don't restore gold count as nothing) and
|
| 122 |
+
aggregated as a **double macro** (error-type × domain, harmonic mean) so no single
|
| 123 |
+
table or error type dominates. Reports REAL vs INJECTED slices separately — injected
|
| 124 |
+
typos are in-distribution for frequency clustering by construction.
|
| 125 |
+
- **`ablations.py`** — removes one grounding component at a time (reference, abstain,
|
| 126 |
+
ambiguity margin, case-match). Caught two metric artifacts (churn inflation,
|
| 127 |
+
reference-unsafe traps) now fixed and documented in the paper.
|
| 128 |
+
- **`calibration.py`** — risk–coverage + ECE for the abstention confidence
|
| 129 |
+
(AURC 0.120; 90% precision at the default threshold, ≥95% at 0.91).
|
| 130 |
+
- **`pii_leak.py`** — masking leak test: 0/360 residual detectable PII.
|
| 131 |
+
- **`pii_slice.py`** — OOD PII column typing on Gretel test: 5/5 types, 0/7 FP.
|
| 132 |
+
- **`inject.py`** — seeded, self-verifying error injectors (typo/OCR/case/whitespace)
|
| 133 |
+
that turn any clean table into validation data.
|
| 134 |
+
|
| 135 |
+
Baselines include OpenRefine fingerprint + kNN clustering (`scrubdata/baselines.py`,
|
| 136 |
+
with blocking, as the real tool uses). Full results & discussion: `docs/paper/`.
|
eval/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluation harness for the ScrubData planner.
|
| 2 |
+
|
| 3 |
+
Measures any planner (`callable(dirty_df) -> plan dict`) against a held-out gold set:
|
| 4 |
+
- JSON-schema validity of the plan
|
| 5 |
+
- operation-level micro-F1 vs the gold plan
|
| 6 |
+
- canonicalization mapping micro-F1 (the fuzzy skill rules can't do)
|
| 7 |
+
- end-to-end cell-recovery (executor(dirty, plan) vs known-clean reference)
|
| 8 |
+
|
| 9 |
+
Two reference systems frame every run:
|
| 10 |
+
- HEURISTIC (`scrubdata.mock_plan`) = the baseline a fine-tuned model must beat.
|
| 11 |
+
- ORACLE (the gold plan itself) = the goalpost ceiling (~100% by construction).
|
| 12 |
+
"""
|
eval/ablations.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Ablation suite — isolate each grounding component's contribution to the north-star.
|
| 2 |
+
|
| 3 |
+
Each row turns ONE design decision off (via mock_plan's ground_cfg) and re-runs the wide
|
| 4 |
+
validation suite. Shows what grounding / abstention / ambiguity-checking / case-matching each
|
| 5 |
+
buy in F1 and (critically) in DAMAGE.
|
| 6 |
+
|
| 7 |
+
uv run python -m eval.ablations
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from scrubdata.planner import mock_plan
|
| 13 |
+
|
| 14 |
+
from .run_real_multi import evaluate_suite
|
| 15 |
+
|
| 16 |
+
# Each entry pairs a display label with the ``ground_cfg`` overrides handed to
# ``mock_plan``. The first row carries no overrides and is the reference the
# later rows are compared against.
ABLATIONS = [
    # Reference configuration: nothing overridden.
    ("full (grounded)", {}),
    # Reference use switched off.
    ("- grounding (freq-cluster)", {"use_reference": False}),
    # Threshold and margin both zeroed.
    ("- abstain (map nearest)", {"threshold": 0.0, "min_margin": 0.0}),
    # Only the margin zeroed.
    ("- ambiguity check", {"min_margin": 0.0}),
    # Case matching switched off.
    ("- case match", {"case_match": False}),
]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def main(seeds=(7, 17, 27), out: str | None = None) -> None:
    """Run every ablation over the wide validation suite and print a comparison.

    Each ``ABLATIONS`` entry is turned into a planner that calls ``mock_plan``
    with that entry's ``ground_cfg``; the planner is scored with
    ``evaluate_suite`` once per seed and the metrics are averaged over seeds.

    Args:
        seeds: Seeds passed one at a time to ``evaluate_suite``.
        out: Optional path. When given, the per-variant rows are written there
            as JSON.
    """
    def mean(xs):
        xs = list(xs)
        return sum(xs) / len(xs) if xs else 0.0

    print(f"\n=== Ablation suite (wide validation suite, {len(seeds)} seeds) — each "
          "removes ONE grounding component ===\n")
    print(f"{'variant':<28}{'NORTH*':>9}{'REAL-F1':>9}{'INJ-F1':>8}{'damage':>9}{'abstain':>9}")
    print("-" * 72)
    rows = []
    for name, cfg in ABLATIONS:
        # ``c=cfg`` binds the current config now; a plain closure would see only
        # the last loop value.
        planner = (lambda df, c=cfg: mock_plan(df, ground_cfg=c))
        per_seed = [evaluate_suite(planner, seed=s) for s in seeds]
        r = {k: mean(p[k] for p in per_seed)
             for k in ("north", "real", "injected", "damage", "abstain")}
        mu = r["north"]
        # Population variance (divides by n), kept so reported intervals are unchanged.
        var = mean([(p["north"] - mu) ** 2 for p in per_seed])
        n_runs = len(per_seed)
        # With no seeds there is no spread to report; avoid dividing by zero.
        r["north_ci"] = 1.96 * (var ** 0.5) / (n_runs ** 0.5) if n_runs else 0.0
        rows.append((name, r))
        print(f"{name:<28}{r['north']:>9.3f}{r['real']:>9.3f}{r['injected']:>8.3f}"
              f"{r['damage']:>9.3f}{r['abstain']:>9.3f}", flush=True)
    full = rows[0][1]
    print("\nDeltas vs full (what each component buys):")
    for name, r in rows[1:]:
        print(f" {name:<28} ΔNORTH={r['north'] - full['north']:+.3f} "
              f"Δdamage={r['damage'] - full['damage']:+.3f} Δabstain={r['abstain'] - full['abstain']:+.3f}")
    if out:
        import json
        # Context manager guarantees the file is flushed and closed.
        with open(out, "w", encoding="utf-8") as fh:
            json.dump([{"variant": n, **r, "seeds": list(seeds)} for n, r in rows],
                      fh, indent=1)
        print(f"rows written to {out}")
    print("\nGrounding lifts F1; abstain + ambiguity-check cut DAMAGE; case-match avoids "
          "convention damage. The combination is the contribution.")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
if __name__ == "__main__":
    import argparse

    # The only option is the optional JSON destination forwarded to main().
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=str, default=None)
    cli_args = parser.parse_args()
    main(out=cli_args.out)
|
eval/baselines_learned.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""WS4 learned-repair baselines: scoring + Jellyfish prompt construction.
|
| 2 |
+
|
| 3 |
+
Both baselines bypass plan dicts (the executor is column-level by design; learned repair
|
| 4 |
+
is per-cell) — they produce repaired DataFrames scored by the SAME churn-neutral
|
| 5 |
+
`eval.run_real_multi.score` as every other row of the money table.
|
| 6 |
+
|
| 7 |
+
* Baran: repaired CSVs come from eval/run_baran.py (pinned env). Score here:
|
| 8 |
+
uv run python -m eval.baselines_learned --score-baran
|
| 9 |
+
* Jellyfish: prompts built here (unit-testable without a GPU), executed by
|
| 10 |
+
scripts/modal_jellyfish.py (vLLM on Modal), scored in-run with the same `score`.
|
| 11 |
+
|
| 12 |
+
Jellyfish has NO repair task — we compose its two published cell-level tasks:
|
| 13 |
+
error detection (yes/no per cell) then data imputation (infer the flagged cell with the
|
| 14 |
+
attribute removed). Prompt templates are verbatim from the NECOUDBFM/Jellyfish-13B model
|
| 15 |
+
card; this composition is OURS, not theirs (disclosed in the paper).
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import argparse
|
| 21 |
+
import json
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
|
| 24 |
+
# System preamble placed ahead of every prompt by wrap_prompt().
SYSTEM_MESSAGE = (
    "You are an AI assistant that follows instruction extremely well. Help as much as you can."
)

# Error-detection template. Placeholders: {attrs}, {record}, {col}, {val}.
# One literal per prompt line; the concatenated value is unchanged.
_ED_TEMPLATE = (
    "Your task is to determine if there is an error in the value of a specific attribute within the whole record provided.\n"
    "The attributes may include {attrs}.\n"
    "Errors may include, but are not limited to, spelling errors, inconsistencies, or values that don't make sense given the context of the whole record.\n"
    "Record [{record}]\n"
    "Attribute for Verification: [{col}: {val}]\n"
    "Question: Is there an error in the value of {col}? Choose your answer from: [Yes, No]."
)

# Data-imputation template. Placeholders: {keyword}, {col}, {attrs}, {record}.
_DI_TEMPLATE = (
    "You are presented with a {keyword} record that is missing a specific attribute: {col}.\n"
    "Your task is to deduce or infer the value of {col} using the available information in the record.\n"
    "You may be provided with fields like {attrs} to help you in the inference.\n"
    "Record: [{record}]\n"
    "Based on the provided record, what would you infer is the value for the missing attribute {col}?\n"
    "Answer only the value of {col}."
)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def wrap_prompt(user_message: str) -> str:
    """Embed *user_message* in the Jellyfish-13B chat scaffold (verbatim from the model card).

    The result is the system message, an ``### Instruction:`` section holding
    the user message, and an empty ``### Response:`` section.
    """
    header = f"{SYSTEM_MESSAGE}\n\n### Instruction:\n\n"
    footer = "\n\n### Response:\n\n"
    return header + f"{user_message}" + footer
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _serialize(record: dict, skip: str | None = None) -> str:
    """Render *record* as ``"key: value, key: value"``, omitting the *skip* key if given."""
    rendered = []
    for key, value in record.items():
        if key == skip:
            continue
        rendered.append(f"{key}: {value}")
    return ", ".join(rendered)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def ed_prompt(record: dict, col: str) -> str:
    """Build the whole-record error-detection prompt for the *col* cell of *record*.

    The full record (including *col*) is serialized into the prompt, and the
    cell under test is repeated as the attribute for verification.
    """
    fields = {
        "attrs": ", ".join(record.keys()),
        "record": _serialize(record),
        "col": col,
        "val": record[col],
    }
    return wrap_prompt(_ED_TEMPLATE.format(**fields))
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def di_prompt(record: dict, col: str, keyword: str) -> str:
    """Build the data-imputation prompt for a flagged cell.

    *col* is left out of both the attribute list and the serialized record, so
    the model has to infer the value rather than copy it.
    """
    remaining_attrs = ", ".join(name for name in record if name != col)
    visible_record = _serialize(record, skip=col)
    body = _DI_TEMPLATE.format(
        keyword=keyword, col=col, attrs=remaining_attrs, record=visible_record)
    return wrap_prompt(body)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def parse_ed(text: str) -> bool:
    """Return True when the model's answer says the cell is erroneous.

    The answer is trimmed, lower-cased and stripped of leading ``[`` characters;
    it counts as a "yes" when what remains begins with ``yes``.
    """
    normalized = text.strip().lower()
    unbracketed = normalized.lstrip("[")
    return unbracketed[:3] == "yes"
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def parse_di(text: str, original: str) -> str:
    """Return the imputed value, or *original* (abstain) when the answer is unusable.

    An answer is unusable when, after trimming whitespace and surrounding double
    quotes, it is empty, contains a newline, or is longer than 80 characters.
    """
    candidate = text.strip().strip('"').strip()
    unusable = (
        candidate == ""
        or "\n" in candidate
        or len(candidate) > 80
    )
    return original if unusable else candidate
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# ---------------------------------------------------------------- Baran scoring
|
| 92 |
+
|
| 93 |
+
def score_baran(repaired_dir: str = "eval/results/baran",
                out: str = "eval/results/baran_raha.json") -> dict:
    """Score every ``<name>_seed<k>_repaired.csv`` against (dirty, clean) under the
    identical churn-neutral protocol; macro REAL-F1 mean ± 95% CI over seeds.

    Args:
        repaired_dir: Directory searched for ``*_seed*_repaired.csv`` files.
        out: Path the aggregated result is written to as JSON.

    Returns:
        The aggregated result dict (also written to *out*).

    Raises:
        SystemExit: When *repaired_dir* contains no matching CSV.
    """
    # Fail fast, before the heavy imports, when there is nothing to score.
    paths = sorted(Path(repaired_dir).glob("*_seed*_repaired.csv"))
    if not paths:
        raise SystemExit(f"no repaired CSVs found in {repaired_dir}")

    import collections

    import pandas as pd

    from .run_real_multi import _raha_pair, score

    per_seed: dict[int, list] = collections.defaultdict(list)
    per_ds = []
    for p in paths:
        # File stem is "<name>_seed<k>_repaired"; peel both suffixes from the right.
        name, seed = p.stem.rsplit("_repaired", 1)[0].rsplit("_seed", 1)
        repaired = pd.read_csv(p, dtype=str, keep_default_na=False)
        dirty, clean = _raha_pair(name)
        m = score(dirty, clean, repaired)
        per_seed[int(seed)].append(m)
        per_ds.append({"name": name, "seed": int(seed), **m})
        print(f" {name:<10} seed{seed}: F1={m['f1']:.3f} P={m['precision']:.3f} "
              f"R={m['recall']:.3f} dmg={m['damage']:.3f}")

    def mean(xs):
        xs = list(xs)
        return sum(xs) / len(xs) if xs else 0.0

    # Macro over datasets within a seed first, then statistics across seeds.
    seed_f1 = [mean(m["f1"] for m in ms) for ms in per_seed.values()]
    mu = mean(seed_f1)
    # Population variance (divides by n), kept so reported intervals are unchanged.
    var = mean([(x - mu) ** 2 for x in seed_f1])
    ci = 1.96 * (var ** 0.5) / (len(seed_f1) ** 0.5)
    result = {
        "system": "Baran (oracle detection, 20 gold labels)",
        "real_f1": mu, "real_f1_ci": ci, "real_f1_per_seed": seed_f1,
        "damage": mean(mean(m["damage"] for m in ms) for ms in per_seed.values()),
        "precision": mean(mean(m["precision"] for m in ms) for ms in per_seed.values()),
        "recall": mean(mean(m["recall"] for m in ms) for ms in per_seed.values()),
        "n_seeds": len(per_seed), "per_dataset": per_ds,
        "protocol_note": "upper bound: oracle error positions + 20 gold-labeled tuples "
                         "(its package default); damage=0 by construction",
    }
    # Context manager guarantees the file is flushed and closed.
    with open(out, "w", encoding="utf-8") as fh:
        json.dump(result, fh, indent=1)
    print(f"\nBaran macro REAL-F1 {mu:.3f} ± {ci:.3f} (n={len(seed_f1)} seeds) -> {out}")
    return result
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
if __name__ == "__main__":
    # Scoring runs only when --score-baran is passed.
    parser = argparse.ArgumentParser()
    parser.add_argument("--score-baran", action="store_true")
    if parser.parse_args().score_baran:
        score_baran()
|
eval/calibration.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Selective prediction / calibration study for grounded canonicalization.
|
| 2 |
+
|
| 3 |
+
"Knowing when NOT to act" is the research contribution (and the AI-safety monitorability
|
| 4 |
+
angle): instead of always emitting a canonical, the grounded reconciler attaches a
|
| 5 |
+
CONFIDENCE and ABSTAINS below threshold. This module measures whether that confidence is
|
| 6 |
+
trustworthy:
|
| 7 |
+
|
| 8 |
+
* Risk-Coverage curve + AURC — sort decisions by confidence; as we cover more (abstain
|
| 9 |
+
less) does risk rise gracefully? Low AURC = a good selective predictor.
|
| 10 |
+
* ECE (Expected Calibration Error) — does a confidence of 0.9 actually mean ~90% correct?
|
| 11 |
+
* Operating point — at our default threshold, what coverage and precision do we get, and
|
| 12 |
+
what threshold hits a target precision (e.g. 95%)?
|
| 13 |
+
|
| 14 |
+
Probe = real cities sampled from the reference with injected typos (recoverable, gold known)
|
| 15 |
+
+ garbage TRAP strings (acting at all is an error). Reproducible (fixed seed).
|
| 16 |
+
|
| 17 |
+
uv run python -m eval.calibration
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import random
|
| 23 |
+
import string
|
| 24 |
+
|
| 25 |
+
from scrubdata.reconcile import _norm, default_index
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _typo(s: str, rng: random.Random) -> str:
    """Return *s* with exactly one injected character error.

    Strings shorter than four characters get one random lowercase letter
    appended. Longer strings have a single interior character (never the first
    or last) replaced by a different letter. The replacement comes from the
    lowercase alphabet when the chosen character is lowercase, otherwise from
    the uppercase alphabet.

    Args:
        s: The clean value.
        rng: Random source; every draw comes from it.
    """
    if len(s) < 4:
        return s + rng.choice(string.ascii_lowercase)
    i = rng.randrange(1, len(s) - 1)
    if not s[i].isalpha():
        # Fall back to position 1 when the drawn position is not a letter.
        i = 1
    pool = string.ascii_lowercase if s[i].islower() else string.ascii_uppercase
    # Exclude s[i] itself. The pool already has the character's case, so the
    # earlier comparison against s[i].lower() never removed an uppercase
    # original and could hand back the unchanged string as a "typo".
    # Lowercase characters are unaffected by this change.
    return s[:i] + rng.choice([c for c in pool if c != s[i]]) + s[i + 1:]
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def build_probe(n_real: int = 500, n_trap: int = 150, seed: int = 5):
    """Build ``(value, gold|None, kind)`` probes: real-city typos plus garbage traps.

    Up to *n_real* cities are sampled from the default index's ``"city"``
    buckets and corrupted with ``_typo`` (kind ``"real"``, gold is the clean
    city). *n_trap* random lowercase strings of 5-9 characters are added with
    gold ``None`` (kind ``"trap"``). The combined list is shuffled.

    Returns:
        ``(probe, idx)``: the probe list and the index it was drawn from.
    """
    idx = default_index()
    city_buckets = idx._buckets.get("city", {})
    cities = []
    for bucket in city_buckets.values():
        for (city, _) in bucket:
            cities.append(city)

    rng = random.Random(seed)
    # Draw order (sample, typos, then length-before-letters per trap) is kept
    # so a given seed yields the same probe.
    sampled = rng.sample(cities, min(n_real, len(cities)))
    probe = [(_typo(city, rng), city, "real") for city in sampled]
    for _ in range(n_trap):
        length = rng.randint(5, 9)
        letters = [rng.choice(string.ascii_lowercase) for _ in range(length)]
        probe.append(("".join(letters), None, "trap"))
    rng.shuffle(probe)
    return probe, idx
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _scored(probe, idx, ctype="city"):
    """Return one ``(confidence, correct_if_acted)`` pair per probe.

    Confidence is the second element of ``idx.best(value, ctype)``, or 0.0 when
    there is no match. A probe is correct only if it is a ``"real"`` probe and
    the matched value equals gold after ``_norm``.
    """
    results = []
    for value, gold, kind in probe:
        match = idx.best(value, ctype)
        if match:
            confidence = match[1]
            hit = kind == "real" and _norm(match[0]) == _norm(gold)
        else:
            confidence = 0.0
            hit = False
        results.append((confidence, bool(hit)))
    return results
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def risk_coverage(scored):
    """Compute the risk-coverage curve and its area (AURC).

    Decisions are ordered by descending confidence. After the k most confident
    decisions, coverage is ``k / n`` and risk is the error rate among those k.

    Args:
        scored: Iterable of ``(confidence, correct)`` pairs.

    Returns:
        ``(curve, aurc)``: ``curve`` is a list of ``(coverage, risk, confidence)``
        tuples and ``aurc`` is the mean risk over the curve. Empty input yields
        ``([], 0.0)`` instead of raising ``ZeroDivisionError``.
    """
    rows = sorted(scored, key=lambda x: -x[0])
    if not rows:
        return [], 0.0
    n, cum = len(rows), 0
    curve = []
    for k, (conf, ok) in enumerate(rows, 1):
        cum += int(ok)
        curve.append((k / n, 1 - cum / k, conf))  # coverage, risk, confidence
    aurc = sum(r for _, r, _ in curve) / len(curve)
    return curve, aurc
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def ece(scored, bins: int = 10) -> float:
    """Expected Calibration Error over equal-width confidence bins.

    Each bin covers ``[b / bins, (b + 1) / bins)``; a confidence of exactly 1.0
    is put in the last bin. Each non-empty bin contributes its share of the
    samples times the gap between its mean confidence and its accuracy.
    """
    total = len(scored)
    error = 0.0
    last_bin = bins - 1
    for b in range(bins):
        lo = b / bins
        hi = (b + 1) / bins
        members = []
        for c, ok in scored:
            inside = lo <= c < hi
            if inside or (b == last_bin and c == 1.0):
                members.append((c, ok))
        if members:
            size = len(members)
            mean_conf = sum(c for c, _ in members) / size
            accuracy = sum(int(ok) for _, ok in members) / size
            error += size / total * abs(mean_conf - accuracy)
    return error
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def operating_point(scored, threshold: float):
    """Return ``(coverage, precision)`` when acting only at or above *threshold*.

    Coverage is the fraction of decisions with confidence >= *threshold*;
    precision is the fraction of those that are correct, and 1.0 when nothing is
    acted on. Empty input yields ``(0.0, 1.0)`` instead of raising
    ``ZeroDivisionError``.

    Args:
        scored: Sequence of ``(confidence, correct)`` pairs.
        threshold: Minimum confidence required to act.
    """
    if not scored:
        return 0.0, 1.0
    acted = [(c, ok) for c, ok in scored if c >= threshold]
    coverage = len(acted) / len(scored)
    precision = (sum(int(ok) for _, ok in acted) / len(acted)) if acted else 1.0
    return coverage, precision
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def main() -> None:
    """Build the probe, score it, and print AURC, ECE and a threshold table.

    Also reports the lowest threshold in 0.70..1.00 (steps of 0.01) whose
    precision reaches 95%, falling back to 1.0 when none does.
    """
    probe, idx = build_probe()
    scored = _scored(probe, idx)
    curve, aurc = risk_coverage(scored)
    calibration_error = ece(scored)

    print(f"\n=== Selective prediction / calibration — grounded city reconciliation "
          f"({len(probe)} probes: real typos + traps) ===\n")
    print(f" AURC (area under risk-coverage, lower=better) = {aurc:.4f}")
    print(f" ECE (expected calibration error, lower=better) = {calibration_error:.4f}")
    print("\n Risk-Coverage operating points:")
    print(f" {'threshold':>10}{'coverage':>10}{'precision':>11}")
    for t in (0.70, 0.78, 0.84, 0.90, 0.95, 1.00):
        cov, prec = operating_point(scored, t)
        print(f" {t:>10.2f}{cov:>10.3f}{prec:>11.3f}")

    # threshold achieving >=95% precision
    best_t = 1.0
    for pct in range(70, 101):
        candidate = pct / 100
        if operating_point(scored, candidate)[1] >= 0.95:
            best_t = candidate
            break
    cov95, _ = operating_point(scored, best_t)
    print(f"\n -> for >=95% precision use threshold {best_t:.2f} (coverage {cov95:.3f}). "
          "The confidence is trustworthy enough to ABSTAIN on the rest — the safety contract.")
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
|
| 119 |
+
main()
|
eval/capture_plan_local.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Capture a raw v6 model plan LOCALLY (Ollama Q8_0 GGUF) for a Raha dataset.
|
| 2 |
+
|
| 3 |
+
Mirrors the Modal capture composition (scripts/modal_eval_v5.py --capture):
|
| 4 |
+
make_batched_planner(base, batch_size=4), greedy, no grounded wrapper, no union —
|
| 5 |
+
verification/union happen downstream (eval/raha_table.py, eval/precision_curve.py).
|
| 6 |
+
DISCLOSED deltas vs the Modal captures: (1) Q8_0 GGUF on local Ollama instead of the
|
| 7 |
+
bf16 merged adapter on A100 — quantization may shift individual mappings; (2) Ollama
|
| 8 |
+
format=json instead of generate(suppress_tokens=[151657,151658]) — both exist solely
|
| 9 |
+
to block the degenerate <tool_call> first token (without either, generation loops).
|
| 10 |
+
|
| 11 |
+
Prereq: ollama pull hf.co/ricalanis/scrubdata-qwen3-4b-v6-q8:Q8_0
|
| 12 |
+
ollama create scrubdata-ft -f notebooks/Modelfile
|
| 13 |
+
|
| 14 |
+
uv run python -m eval.capture_plan_local --dataset beers
|
| 15 |
+
Writes eval/results/v6_<dataset>_raw_plan_localq8.json.
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import argparse
|
| 21 |
+
import json
|
| 22 |
+
import time
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
|
| 25 |
+
from scrubdata.model_planner import _extract_json, make_batched_planner
|
| 26 |
+
|
| 27 |
+
from .run_real_multi import _raha_pair
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def make_json_constrained_planner(model: str, host: str = "http://localhost:11434",
                                  timeout: int = 600):
    """Return a planner that queries a local Ollama chat endpoint with format=json.

    The planner profiles the dirty frame, builds the user prompt and POSTs it to
    ``<host>/api/chat``. It returns the parsed plan dict, with
    ``table_operations``, ``columns`` and ``flags`` defaulted to empty lists. On
    a request failure or unparseable output it returns ``{"__error__": ...}``
    instead of raising.
    """
    import urllib.request

    from scrubdata.profiler import profile_dataframe
    from scrubdata.prompt import SYSTEM_PROMPT, build_user_prompt

    endpoint = host + "/api/chat"

    def planner(dirty_df, *_):
        user = build_user_prompt(profile_dataframe(dirty_df), dirty_df)
        messages = [{"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user}]
        options = {"temperature": 0, "num_predict": 2000, "num_ctx": 16384}
        payload = {
            "model": model, "stream": False, "format": "json",
            "messages": messages,
            "options": options,
        }
        body = json.dumps(payload).encode()
        req = urllib.request.Request(
            endpoint, data=body, headers={"Content-Type": "application/json"})
        try:
            with urllib.request.urlopen(req, timeout=timeout) as r:
                out = json.loads(r.read())["message"]["content"]
        except Exception as e:  # noqa: BLE001
            # Best effort: report the failure and return an error marker.
            reason = str(e)
            print(f" batch failed: {reason[:80]}", flush=True)
            return {"__error__": reason[:120]}
        plan = _extract_json(out)
        if plan is None:
            print(f" batch returned no JSON: {out[:80]!r}", flush=True)
            return {"__error__": "no_json"}
        # Fill in any top-level section that is absent from the plan.
        for section in ("table_operations", "columns", "flags"):
            plan.setdefault(section, [])
        return plan

    return planner
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def main() -> None:
    """Capture the raw plan a local model produces for one dataset.

    Parses ``--dataset`` (required), ``--model`` and ``--timeout``, runs the
    batched JSON-constrained planner over the dirty table, prints a short
    summary and writes the plan to
    ``results/v6_<dataset>_raw_plan_localq8.json`` next to this file.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--dataset", required=True)
    ap.add_argument("--model", default="scrubdata-ft")
    ap.add_argument("--timeout", type=int, default=600)
    args = ap.parse_args()

    dirty, _clean = _raha_pair(args.dataset)  # same table the scorer sees
    print(f"capturing plan: {args.dataset} ({len(dirty)} rows x {dirty.shape[1]} cols)",
          flush=True)
    t0 = time.time()
    plan = make_batched_planner(make_json_constrained_planner(args.model, timeout=args.timeout),
                                batch_size=4)(dirty)
    dt = time.time() - t0
    n_ops = sum(len(c.get("operations", [])) for c in plan.get("columns", []))
    print(f"done in {dt:.0f}s — {len(plan.get('columns', []))} columns, {n_ops} ops")

    out = (Path(__file__).resolve().parent / "results"
           / f"v6_{args.dataset}_raw_plan_localq8.json")
    # Create the directory on a fresh checkout and close the handle
    # deterministically instead of relying on garbage collection.
    out.parent.mkdir(parents=True, exist_ok=True)
    with open(out, "w") as fh:
        json.dump(plan, fh, indent=1)
    print(f"written to {out}")
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
if __name__ == "__main__":
|
| 90 |
+
main()
|
eval/contamination_probe.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Memorization probe (W4.6): can a web-trained model complete benchmark rows verbatim?
|
| 2 |
+
|
| 3 |
+
Legacy-public benchmarks (hospital et al., GitHub since 2019) sit inside every base
|
| 4 |
+
model's training window; a HIGH verbatim-completion rate red-flags memorized gold.
|
| 5 |
+
A low rate does not prove absence — the contamination statement stays assumption-based.
|
| 6 |
+
Control: a date-stamped post-cutoff wild harvest (expected ~0).
|
| 7 |
+
|
| 8 |
+
uv run python -m eval.contamination_probe
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import random
|
| 14 |
+
import subprocess
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
import pandas as pd
|
| 18 |
+
|
| 19 |
+
ROOT = Path(__file__).resolve().parent.parent
|
| 20 |
+
N_ROWS, N_GIVEN, MODEL = 30, 5, "glm-5.1"
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def probe(df: pd.DataFrame, name: str) -> dict:
    """Measure how often the model reproduces held-back cells of sampled rows.

    A fixed-seed sample of up to ``N_ROWS`` rows is drawn. For each row the
    first ``N_GIVEN`` columns are shown to the model and the next four are
    requested. A cell counts as a hit when its stripped, lower-cased value is
    a non-empty, non-"nan" substring of the lower-cased reply.

    Returns:
        A dict with the table name, number of rows probed, cells asked,
        verbatim hits and the hit rate rounded to four decimals.
    """
    sampler = random.Random(0)
    picked = sampler.sample(range(len(df)), min(N_ROWS, len(df)))
    columns = list(df.columns)
    given = columns[:N_GIVEN]
    asked = columns[N_GIVEN:N_GIVEN + 4]
    hits = 0
    total = 0
    for idx in picked:
        record = df.iloc[idx]
        known = "; ".join(f"{c}={record[c]}" for c in given)
        wanted = "; ".join(f"{c}=<value>" for c in asked)
        prompt = (f"This is a row from the well-known public dataset '{name}'. "
                  f"Complete the remaining fields EXACTLY as they appear in the dataset. "
                  f"Known fields: "
                  + known
                  + ". Respond ONLY with: " + wanted)
        completed = subprocess.run(
            ["oll", prompt, "--model", MODEL, "--max-tokens", "200"],
            capture_output=True, text=True, timeout=120)
        reply = completed.stdout.lower()
        for c in asked:
            total += 1
            expected = str(record[c]).strip().lower()
            if expected in ("", "nan"):
                # Blank cells stay in the denominator but can never score.
                continue
            if expected in reply:
                hits += 1
    rate = round(hits / max(total, 1), 4)
    return {"table": name, "rows": len(picked), "cells_asked": total,
            "verbatim_hits": hits, "rate": rate}
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def main() -> None:
    """Run the memorization probe on the benchmark table and the wild control.

    Reads both CSVs as strings, probes each, writes the combined result to
    ``eval/results/contamination_probe.json`` and prints the per-table probes.
    """
    hosp = pd.read_csv(ROOT / "data" / "real" / "hospital" / "clean.csv").astype(str)
    wild = pd.read_csv(ROOT / "data" / "wild" / "glassdoor_jobs.csv").astype(str)
    res = {"model": MODEL, "protocol": f"{N_ROWS} rows, {N_GIVEN} given cols, 4 asked cols, exact-substring match",
           "probes": [probe(hosp, "hospital (Raha benchmark)"),
                      probe(wild, "glassdoor_jobs (post-cutoff wild harvest)")]}
    dest = ROOT / "eval" / "results" / "contamination_probe.json"
    # Close the handle deterministically; create the directory if missing.
    dest.parent.mkdir(parents=True, exist_ok=True)
    with open(dest, "w") as fh:
        json.dump(res, fh, indent=1)
    print(json.dumps(res["probes"], indent=1))
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
if __name__ == "__main__":
|
| 57 |
+
main()
|
eval/cross_scoring.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""B1 (W4.2) dual-metric cross-scoring on the 5 Raha real-error datasets.
|
| 2 |
+
|
| 3 |
+
Scores every system under BOTH metric families, side by side:
|
| 4 |
+
* original — the Raha/Baran cell-level repair protocol (Mahdavi & Abedjan,
|
| 5 |
+
PVLDB 13(12), p1948, Sec 6.1 + raha/dataset.py get_data_cleaning_evaluation):
|
| 6 |
+
values minimally normalized (html-unescape, whitespace collapse — their
|
| 7 |
+
value_normalizer), then RAW string equality; precision = exact-gold repairs /
|
| 8 |
+
cells changed; recall = exact-gold repairs / (dirty->clean diff); no
|
| 9 |
+
churn-neutrality, no case folding, no semantic tolerance, no damage metric.
|
| 10 |
+
* churn_neutral — our eval.run_real_multi.score (the scoring contract):
|
| 11 |
+
convention-normalized, churn ignored, damage reported.
|
| 12 |
+
|
| 13 |
+
Systems: grounded (HEAD mock_plan), verified union (v6, tau=0.5 — identical plan
|
| 14 |
+
files to eval.raha_table), OpenRefine fingerprint/kNN, and Baran at labeling
|
| 15 |
+
budgets 0/5/20 (oracle detection; repaired CSVs from eval/run_baran.py, 3 seeds,
|
| 16 |
+
seed-mean). Baran-from-CSV caveat: corrections equal to the dirty value vanish
|
| 17 |
+
from the repaired-vs-dirty diff, so reconstructed |changed| is a lower bound on
|
| 18 |
+
Baran's own output_size (precision an upper bound; recall exact).
|
| 19 |
+
|
| 20 |
+
Also computes Kendall tau-b between the SYSTEM RANKINGS induced by the two F1s
|
| 21 |
+
(per dataset + macro), and a calibration block: our Baran oracle+20 repro vs the
|
| 22 |
+
published Table 3 "Baran" row (verified from the PVLDB PDF; see PUBLISHED below).
|
| 23 |
+
|
| 24 |
+
Acceptance: the churn-neutral rows must reproduce eval/results/raha_per_dataset.json
|
| 25 |
+
exactly (checked, hard-fails otherwise).
|
| 26 |
+
|
| 27 |
+
uv run python -m eval.cross_scoring
|
| 28 |
+
Writes eval/results/cross_scoring.json and prints LaTeX rows.
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
from __future__ import annotations
|
| 32 |
+
|
| 33 |
+
import html
|
| 34 |
+
import json
|
| 35 |
+
import re
|
| 36 |
+
from pathlib import Path
|
| 37 |
+
|
| 38 |
+
import pandas as pd
|
| 39 |
+
|
| 40 |
+
from scrubdata.baselines import openrefine_fingerprint_plan, openrefine_knn_plan
|
| 41 |
+
from scrubdata.executor import apply_plan
|
| 42 |
+
from scrubdata.planner import mock_plan
|
| 43 |
+
from scrubdata.verifier import union_plans, verify_plan
|
| 44 |
+
|
| 45 |
+
from .precision_curve import _repairs_only
|
| 46 |
+
from .raha_table import TAU, UNION_PLANS, _gen_plan
|
| 47 |
+
from .run_real_multi import RAHA, _cell_only, _raha_pair, score
|
| 48 |
+
|
| 49 |
+
RESULTS = Path(__file__).resolve().parent / "results"
|
| 50 |
+
BARAN_DIRS = {0: RESULTS / "baran_n0", 5: RESULTS / "baran_n5", 20: RESULTS / "baran"}
|
| 51 |
+
|
| 52 |
+
# Baran PVLDB'20 Table 3, row "Baran" (no TL): complete set of data errors given as
|
| 53 |
+
# input (= oracle detection), labeling budget 20, mean of 10 runs. Verified by reading
|
| 54 |
+
# vldb.org/pvldb/vol13/p1948-mahdavi.pdf p1957 (2026-06-12). movies_1 is not evaluated
|
| 55 |
+
# in the paper (its real-error sets are hospital/flights/address/beers/rayyan/it/tax).
|
| 56 |
+
PUBLISHED = {"hospital": {"precision": 0.88, "recall": 0.86, "f1": 0.87},
|
| 57 |
+
"flights": {"precision": 1.00, "recall": 1.00, "f1": 1.00},
|
| 58 |
+
"beers": {"precision": 0.91, "recall": 0.89, "f1": 0.90},
|
| 59 |
+
"rayyan": {"precision": 0.76, "recall": 0.40, "f1": 0.52}}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _norm(v: str) -> str:
|
| 63 |
+
"""raha.dataset.Dataset.value_normalizer, verbatim semantics."""
|
| 64 |
+
v = html.unescape(str(v))
|
| 65 |
+
v = re.sub("[\t\n ]+", " ", v, re.UNICODE)
|
| 66 |
+
return v.strip("\t\n ")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def baran_score(dirty: pd.DataFrame, clean: pd.DataFrame, out: pd.DataFrame) -> dict:
    """The original Raha/Baran repair metric over a repaired DataFrame: minimal
    normalization then raw equality; changed = repaired-vs-dirty diff.

    Only the first ``min(len(dirty), len(out), len(clean))`` rows are scored.
    ``dirty`` and ``clean`` are aligned by column position; ``out`` is looked
    up by ``dirty``'s column label, and a column missing from ``out`` counts
    as unchanged.

    Returns:
        Dict with ``f1``, ``precision`` (exact-gold repairs / changed cells),
        ``recall`` (exact-gold repairs / erroneous cells) and the raw counts
        ``_errors``, ``_changed``, ``_tp``. Ratios are 0.0 when their
        denominator is 0.
    """
    n = min(len(dirty), len(out), len(clean))
    errors = changed = tp = 0
    for j, col in enumerate(dirty.columns):
        # Normalise each column once. Fetching out.iloc[i][col] per cell
        # builds a whole row Series every time and upcasts across the row.
        dirty_vals = [_norm(v) for v in dirty.iloc[:n, j]]
        clean_vals = [_norm(v) for v in clean.iloc[:n, j]]
        if col in out.columns:
            out_vals = [_norm(v) for v in out[col].iloc[:n]]
        else:
            out_vals = dirty_vals
        for dv, cv, ov in zip(dirty_vals, clean_vals, out_vals):
            err, chg = dv != cv, ov != dv
            errors += err
            changed += chg
            # A true positive is a changed, erroneous cell that now equals gold.
            tp += chg and err and ov == cv
    p = tp / changed if changed else 0.0
    r = tp / errors if errors else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    return {"f1": f1, "precision": p, "recall": r,
            "_errors": errors, "_changed": changed, "_tp": tp}
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _both(dirty, clean, out) -> dict:
    """Score one repaired table under both metric families.

    Returns a dict with ``original`` (the ``baran_score`` result) and
    ``churn_neutral`` (the selected fields of the ``score`` result).
    """
    contract = score(dirty, clean, out)
    original = baran_score(dirty, clean, out)
    wanted = ("f1", "precision", "recall", "damage",
              "_errors", "_changed", "_fixed")
    churn_neutral = {}
    for field in wanted:
        churn_neutral[field] = contract[field]
    return {"original": original, "churn_neutral": churn_neutral}
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def kendall_tau(xs, ys) -> float:
    """Kendall tau-b (tie-corrected), stdlib.

    Walks every unordered pair of positions, counting ties in ``xs``, ties in
    ``ys`` and concordant/discordant pairs among those tied in neither.
    Returns 0.0 when the tie-corrected denominator is zero.
    """
    size = len(xs)
    all_pairs = size * (size - 1) // 2
    tied_x = tied_y = concordant = discordant = 0
    for first in range(size):
        for second in range(first + 1, size):
            dx = xs[first] - xs[second]
            dy = ys[first] - ys[second]
            if dx == 0:
                tied_x += 1
            if dy == 0:
                tied_y += 1
            if dx == 0 or dy == 0:
                continue
            if (dx > 0) == (dy > 0):
                concordant += 1
            else:
                discordant += 1
    denom = ((all_pairs - tied_x) * (all_pairs - tied_y)) ** 0.5
    if not denom:
        return 0.0
    return (concordant - discordant) / denom
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def _mean_rows(rows: list[dict]) -> dict:
|
| 115 |
+
return {k: sum(r[k] for r in rows) / len(rows) for k in rows[0]}
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def main() -> None:
    """Cross-score every system under both metrics and write the results.

    Steps, in order:
      1. score the deterministic planners, the verified union and the Baran
         repaired CSVs on every dataset in ``RAHA`` under both metrics;
      2. add per-system macro F1 for each metric family;
      3. acceptance: compare churn-neutral numbers against
         ``raha_per_dataset.json`` and exit with an error on any mismatch;
      4. compute Kendall tau-b between the rankings induced by the two F1s;
      5. build the calibration block against ``PUBLISHED``;
      6. write ``cross_scoring.json`` and print the LaTeX rows.

    Raises:
        SystemExit: when acceptance fails, or when no repaired CSV exists for
            a Baran budget/dataset combination.
    """
    out = {"protocol": {
        "original": "Raha/Baran convention: value_normalizer (html-unescape + "
                    "whitespace collapse) then raw string equality; P = exact-gold "
                    "repairs / changed cells, R = exact-gold repairs / (dirty->clean "
                    "diff); no churn-neutrality, no damage",
        "churn_neutral": "eval.run_real_multi.score — the scoring contract",
        "baran_rows": "oracle error positions + n gold labels, 3 seeds, seed-mean; "
                      "reconstructed from repaired CSVs (no-op corrections vanish: "
                      "|changed| lower-bounds Baran's output_size)",
        "movies_1": "first 2000 rows (_raha_pair), as everywhere in the suite"},
        "systems": {}}

    deterministic = [("grounded", mock_plan),
                     ("openrefine_fingerprint", openrefine_fingerprint_plan),
                     ("openrefine_knn", openrefine_knn_plan)]
    for label, planner in deterministic:
        rows = []
        for name, _dom in RAHA:
            dirty, clean = _raha_pair(name)
            cleaned, _ = apply_plan(dirty, _cell_only(planner(dirty)))
            m = _both(dirty, clean, cleaned)
            rows.append({"dataset": name, **m})
            print(f" {label:<24}{name:<10} orig={m['original']['f1']:.3f} "
                  f"cn={m['churn_neutral']['f1']:.3f}", flush=True)
        out["systems"][label] = {"per_dataset": rows}

    rows = []
    for name, _dom in RAHA:
        if name in UNION_PLANS:
            with open(UNION_PLANS[name]) as fh:
                base = json.load(fh)
        else:
            base = _gen_plan(name)
        dirty, clean = _raha_pair(name)
        plan = _repairs_only(union_plans(verify_plan(dirty, base, tau=TAU),
                                         mock_plan(dirty)))
        cleaned, _ = apply_plan(dirty, plan)
        m = _both(dirty, clean, cleaned)
        rows.append({"dataset": name, **m})
        print(f" {'verified_union':<24}{name:<10} orig={m['original']['f1']:.3f} "
              f"cn={m['churn_neutral']['f1']:.3f}", flush=True)
    out["systems"]["verified_union_v6_tau0.5"] = {"per_dataset": rows}

    for n_labels, d in BARAN_DIRS.items():
        rows = []
        for name, _dom in RAHA:
            dirty, clean = _raha_pair(name)
            per_seed = []
            for p in sorted(d.glob(f"{name}_seed*_repaired.csv")):
                repaired = pd.read_csv(p, dtype=str, keep_default_na=False)
                per_seed.append(_both(dirty, clean, repaired))
            if not per_seed:
                # Without this the seed-mean fails with a bare IndexError
                # that names neither the dataset nor the directory.
                raise SystemExit(f"no Baran repaired CSVs for {name!r} in {d}")
            m = {"original": _mean_rows([s["original"] for s in per_seed]),
                 "churn_neutral": _mean_rows([s["churn_neutral"] for s in per_seed])}
            rows.append({"dataset": name, "n_seeds": len(per_seed), **m})
            print(f" {'baran_oracle%d' % n_labels:<24}{name:<10} "
                  f"orig={m['original']['f1']:.3f} "
                  f"cn={m['churn_neutral']['f1']:.3f}", flush=True)
        out["systems"][f"baran_oracle{n_labels}"] = {"per_dataset": rows}

    for system in out["systems"].values():
        for fam in ("original", "churn_neutral"):
            system[f"macro_f1_{fam}"] = _mean_rows(
                [r[fam] for r in system["per_dataset"]])["f1"]

    # acceptance: churn-neutral rows == raha_per_dataset.json (exact)
    with open(RESULTS / "raha_per_dataset.json") as fh:
        ref = json.load(fh)
    checks = []
    for key, ref_key in [("grounded", "grounded"),
                         ("openrefine_fingerprint", "openrefine_fingerprint"),
                         ("openrefine_knn", "openrefine_knn"),
                         ("verified_union_v6_tau0.5", "verified_union_v6_tau0.5"),
                         ("baran_oracle20", "baran_oracle20")]:
        # NOTE(review): rows are paired by position and zip() stops at the
        # shorter list, so a reference with fewer datasets is not detected.
        for got, want in zip(out["systems"][key]["per_dataset"],
                             ref["systems"][ref_key]["per_dataset"]):
            for k in ("f1", "precision", "recall", "damage"):
                ok = abs(got["churn_neutral"][k] - want[k]) < 1e-9
                checks.append(ok)
                if not ok:
                    print(f"MISMATCH {key}/{got['dataset']}/{k}: "
                          f"{got['churn_neutral'][k]} vs {want[k]}")
    out["acceptance"] = {"vs": "raha_per_dataset.json", "n_cells": len(checks),
                         "pass": all(checks)}
    print(f"\nacceptance: {sum(checks)}/{len(checks)} cells match "
          f"-> {'PASS' if all(checks) else 'FAIL'}")
    if not all(checks):
        raise SystemExit("acceptance FAILED")

    # Kendall tau-b between system rankings under the two F1s
    primary = ["grounded", "verified_union_v6_tau0.5", "openrefine_fingerprint",
               "openrefine_knn", "baran_oracle20"]
    extended = primary + ["baran_oracle0", "baran_oracle5"]
    taus = {}
    for label, sysset in [("primary", primary), ("extended", extended)]:
        per_ds = {}
        for i, (name, _dom) in enumerate(RAHA):
            xs = [out["systems"][s]["per_dataset"][i]["original"]["f1"] for s in sysset]
            ys = [out["systems"][s]["per_dataset"][i]["churn_neutral"]["f1"] for s in sysset]
            per_ds[name] = kendall_tau(xs, ys)
        xs = [out["systems"][s]["macro_f1_original"] for s in sysset]
        ys = [out["systems"][s]["macro_f1_churn_neutral"] for s in sysset]
        taus[label] = {"systems": sysset, "per_dataset": per_ds,
                       "macro": kendall_tau(xs, ys)}
        print(f"tau-b ({label}): macro={taus[label]['macro']:.3f} " +
              " ".join(f"{n}={t:.3f}" for n, t in per_ds.items()))
    out["kendall_tau_b"] = taus

    # calibration: our Baran oracle+20 repro (ORIGINAL metric) vs published Table 3
    cal = []
    b20 = {r["dataset"]: r for r in out["systems"]["baran_oracle20"]["per_dataset"]}
    for name, pub in PUBLISHED.items():
        ours = b20[name]["original"]
        cal.append({"dataset": name, "published_f1": pub["f1"],
                    "published_precision": pub["precision"],
                    "published_recall": pub["recall"],
                    "repro_f1": ours["f1"], "repro_precision": ours["precision"],
                    "repro_recall": ours["recall"],
                    "delta_f1": ours["f1"] - pub["f1"]})
        print(f"calibration {name:<10} published F1={pub['f1']:.2f} "
              f"repro F1={ours['f1']:.3f} (d={ours['f1'] - pub['f1']:+.3f})")
    out["calibration"] = {
        "source": "Mahdavi & Abedjan, PVLDB 13(12) p1948, Table 3 row 'Baran' "
                  "(no TL): complete error set given (oracle detection), budget 20, "
                  "mean of 10 runs; PDF read 2026-06-12",
        "notes": "their runs: full datasets, 10 label seeds, Wikipedia value models "
                 "available in package but Table-3 row is without TL; ours: 3 label "
                 "seeds, no pretraining, movies_1 not in their paper; our "
                 "churn-neutral macro for this row is the paper's 0.811",
        "rows": cal}

    dest = RESULTS / "cross_scoring.json"
    with open(dest, "w") as fh:
        json.dump(out, fh, indent=1)
    print(f"written to {dest}")
    print(latex(out))
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
LABELS = [("grounded", "Grounded (ours, deterministic)"),
|
| 252 |
+
("verified_union_v6_tau0.5", r"Verified union (v6, $\tau{=}0.5$)"),
|
| 253 |
+
("openrefine_fingerprint", "OpenRefine fingerprint"),
|
| 254 |
+
("openrefine_knn", "OpenRefine kNN"),
|
| 255 |
+
("baran_oracle20", r"Baran (oracle det.\ + 20 labels)")]
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def latex(out: dict) -> str:
    """Booktabs rows: per system x dataset, original P/R/F1 next to churn-neutral
    P/R/F1 + damage.

    Emits one row per dataset for every system in ``LABELS``, a macro row per
    system, then two footer rows: Kendall tau-b for the primary system set and
    the calibration comparison.

    Args:
        out: Result dict with ``systems``, ``kendall_tau_b`` and
            ``calibration`` filled in, as assembled by ``main``.

    Returns:
        The complete ``tabular`` environment as a newline-joined string.
    """
    def esc(text: str) -> str:
        # Escape underscores for LaTeX. Kept out of the f-strings because a
        # backslash inside a replacement field is a SyntaxError before 3.12.
        return text.replace("_", r"\_")

    L = [r"\begin{tabular}{llrrrrrrr}", r"\toprule",
         r" & & \multicolumn{3}{c}{Original (Baran) metric} & "
         r"\multicolumn{4}{c}{Churn-neutral (ours)} \\",
         r"\cmidrule(lr){3-5}\cmidrule(lr){6-9}",
         r"System & Dataset & Prec. & Rec. & F1 & Prec. & Rec. & F1 & Damage \\",
         r"\midrule"]
    for key, label in LABELS:
        system = out["systems"][key]
        for i, r in enumerate(system["per_dataset"]):
            o, c = r["original"], r["churn_neutral"]
            # The system label is printed on the first dataset row only.
            L.append(f"{label if i == 0 else ''} & "
                     f"{esc(r['dataset'])} & "
                     f"{o['precision']:.3f} & {o['recall']:.3f} & {o['f1']:.3f} & "
                     f"{c['precision']:.3f} & {c['recall']:.3f} & {c['f1']:.3f} & "
                     f"{c['damage']:.3f} \\\\")
        L.append(f" & \\emph{{macro}} & & & "
                 f"\\emph{{{system['macro_f1_original']:.3f}}} & & & "
                 f"\\emph{{{system['macro_f1_churn_neutral']:.3f}}} & \\\\")
        L.append(r"\midrule")
    t = out["kendall_tau_b"]["primary"]
    L.append(r"\multicolumn{9}{l}{Kendall $\tau_b$ between system rankings: "
             f"macro {t['macro']:.2f}; per dataset " +
             ", ".join(f"{esc(n)} {v:.2f}"
                       for n, v in t["per_dataset"].items()) + r"} \\")
    cal = ", ".join(f"{esc(r['dataset'])} {r['repro_f1']:.3f} vs "
                    f"{r['published_f1']:.2f}" for r in out["calibration"]["rows"])
    L.append(r"\multicolumn{9}{l}{Calibration, original metric (our Baran oracle+20 "
             r"repro vs PVLDB'20 Table~3): " + cal + r"} \\")
    L.append(r"\bottomrule")
    L.append(r"\end{tabular}")
    return "\n".join(L)
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
if __name__ == "__main__":
|
| 294 |
+
main()
|
eval/degenerate.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""W4.3 + W4.4 — degenerate baselines + cost-weighted damage over the paired sets.
|
| 2 |
+
|
| 3 |
+
Four scorer-pinning policies over the same dirty/clean pairs eval/paired_bench.py
|
| 4 |
+
walks: no-op (output = dirty), abstain-all (no-op + flags; score-identical at the
|
| 5 |
+
cell level — the repair metric is flag-blind by design, flags surface in audit
|
| 6 |
+
metrics), random-edit (seeded vandalism: 5% of cells replaced with another value
|
| 7 |
+
from the same column) and oracle (output = clean, headers realigned to dirty's —
|
| 8 |
+
23/42 pairs differ in header naming only; cell alignment is positional). They pin
|
| 9 |
+
the metric's floor (no-op F1 = 0, damage = 0), ceiling (oracle F1 = 1, damage = 0)
|
| 10 |
+
and show it punishes vandalism. Also reruns the SHIPPED pipeline (mock_plan) to
|
| 11 |
+
capture raw fix/damage cell counts and reports Effective-Reliability-style
|
| 12 |
+
cost-weighted scores score_c = fixes - c*damage_cells for c in {1, 5, 10}.
|
| 13 |
+
|
| 14 |
+
uv run python -m eval.degenerate
|
| 15 |
+
Writes eval/results/degenerate.json + docs/DEGENERATE_BASELINES.md. Per-pair rows
|
| 16 |
+
are cached incrementally (eval/results/degenerate_pairs.json) so a killed run
|
| 17 |
+
resumes where it stopped.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import json
|
| 24 |
+
import random
|
| 25 |
+
import time
|
| 26 |
+
from pathlib import Path
|
| 27 |
+
|
| 28 |
+
from scrubdata.executor import apply_plan
|
| 29 |
+
from scrubdata.planner import mock_plan
|
| 30 |
+
|
| 31 |
+
from .paired_bench import _load, pairs
|
| 32 |
+
from .run_real_multi import _cell_only, score
|
| 33 |
+
|
| 34 |
+
ROOT = Path(__file__).resolve().parent.parent
|
| 35 |
+
EDIT_FRAC = 0.05
|
| 36 |
+
SEED = 7
|
| 37 |
+
COSTS = (1, 5, 10)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _noop(dirty, clean):
|
| 41 |
+
return dirty
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _abstain_all(dirty, clean):
|
| 45 |
+
return dirty.copy() # + flags conceptually; the cell metric is flag-blind
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def _random_edit(dirty, clean, seed=SEED):
    """Seeded vandalism: overwrite a fraction of cells with another column value.

    ``max(1, int(cells * EDIT_FRAC))`` distinct cell positions are sampled.
    Each is replaced by a randomly chosen different value taken from the same
    column of ``dirty``; a cell whose column has no alternative value is left
    alone.

    Args:
        dirty: Table to vandalise; it is copied, not modified.
        clean: Unused, present for the common policy signature.
        seed: Seed for the private random generator.

    Returns:
        The edited copy. An empty table (no rows or no columns) is returned
        as an unedited copy.
    """
    rng = random.Random(seed)
    out = dirty.copy()
    n, m = out.shape
    if n * m == 0:
        # rng.sample cannot draw the minimum of 1 cell from an empty range.
        return out
    # Distinct values per column in first-seen order, so runs are reproducible.
    uniq = [list(dict.fromkeys(out.iloc[:, j])) for j in range(m)]
    for idx in rng.sample(range(n * m), max(1, int(n * m * EDIT_FRAC))):
        i, j = divmod(idx, m)
        alts = [v for v in uniq[j] if v != out.iat[i, j]]
        if alts:
            out.iat[i, j] = rng.choice(alts)
    return out
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _oracle(dirty, clean):
|
| 62 |
+
out = clean.copy()
|
| 63 |
+
out.columns = dirty.columns # header-naming variants only; alignment is positional
|
| 64 |
+
return out
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _shipped(dirty, clean):
    """Shipped-pipeline policy: apply the cell-only ``mock_plan`` to the dirty table.

    Returns the first element of ``apply_plan``'s result.
    """
    plan = _cell_only(mock_plan(dirty))
    applied = apply_plan(dirty, plan)
    return applied[0]
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
POLICIES = [("no-op", _noop), ("abstain-all", _abstain_all),
|
| 72 |
+
("random-edit", _random_edit), ("oracle", _oracle),
|
| 73 |
+
("shipped", _shipped)]
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _mean(xs):
|
| 77 |
+
xs = list(xs)
|
| 78 |
+
return sum(xs) / len(xs) if xs else 0.0
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def main() -> None:
    """Score every policy on every pair, aggregate, and write JSON + markdown.

    Per-pair results are cached in ``--cache`` after each pair, so an
    interrupted run resumes where it stopped. ``--only`` restricts which pair
    is (re)computed; aggregation always covers everything in the cache.
    Outputs go to ``--out`` and ``docs/DEGENERATE_BASELINES.md``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--only", default=None)
    ap.add_argument("--out", default="eval/results/degenerate.json")
    ap.add_argument("--cache", default="eval/results/degenerate_pairs.json")
    args = ap.parse_args()

    cache_path = Path(args.cache)
    if cache_path.exists():
        with open(cache_path) as fh:
            cache = json.load(fh)
    else:
        cache = {}
    for p in pairs():
        if args.only and p.name != args.only:
            continue
        if p.name in cache:
            continue
        try:
            dirty, clean = _load(p)
        except Exception as e:  # noqa: BLE001
            # Best-effort: an unreadable pair is reported and skipped.
            print(f" {p.name}: LOAD FAILED {type(e).__name__}")
            continue
        entry = {}
        for name, policy in POLICIES:
            t0 = time.perf_counter()
            m = score(dirty, clean, policy(dirty, clean))
            n = min(len(dirty), len(clean))
            # Cells that were not errors, used to turn the damage rate back
            # into a cell count.
            clean_cells = n * dirty.shape[1] - m["_errors"]
            entry[name] = {
                "name": p.name, "errors": m["_errors"],
                "f1": m["f1"], "precision": m["precision"], "recall": m["recall"],
                "damage": m["damage"], "fixed": m["_fixed"], "changed": m["_changed"],
                "damage_cells": round(m["damage"] * clean_cells),
                "sec": round(time.perf_counter() - t0, 1)}
        cache[p.name] = entry
        # Create the cache directory on first use and close the handle after
        # every incremental write.
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_path, "w") as fh:
            json.dump(cache, fh, indent=1)
        print(f" {p.name:<46} " + " ".join(
            f"{name}={entry[name]['f1']:.3f}" for name, _ in POLICIES), flush=True)
    res = {name: [cache[k][name] for k in sorted(cache)] for name, _ in POLICIES}

    out = {"n_pairs": len(res["no-op"]), "edit_frac": EDIT_FRAC, "seed": SEED,
           "policies": {}, "acceptance": {}}
    for name, _ in POLICIES:
        rows = res[name]
        E, F, D = (sum(r[k] for r in rows) for k in ("errors", "fixed", "damage_cells"))
        out["policies"][name] = {
            "macro": {k: round(_mean(r[k] for r in rows), 4)
                      for k in ("f1", "precision", "recall", "damage")},
            "micro": {"errors": E, "fixed": F, "changed": sum(r["changed"] for r in rows),
                      "damage_cells": D},
            # E is 0 when nothing was scored (e.g. --only matched no pair and
            # the cache is empty); report 0.0 rather than dividing by zero.
            "score_c": {f"c={c}": {"raw": F - c * D,
                                   "per_error": round((F - c * D) / E, 4) if E else 0.0}
                        for c in COSTS},
            "sec": round(sum(r["sec"] for r in rows), 1),
            "per_pair": rows}
    bad_oracle = [r["name"] for r in res["oracle"] if r["f1"] != 1.0]
    bad_noop = [r["name"] for r in res["no-op"] if r["damage"] != 0.0]
    out["acceptance"] = {"oracle_f1_all_exactly_1": not bad_oracle,
                         "noop_damage_all_exactly_0": not bad_noop,
                         "violations": {"oracle": bad_oracle, "no-op": bad_noop}}
    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    with open(args.out, "w") as fh:
        json.dump(out, fh, indent=1)

    P = out["policies"]
    L = ["# Degenerate baselines + cost-weighted damage (W4.3 + W4.4)", "",
         f"Same {out['n_pairs']} dirty/clean pairs as `eval/paired_bench.py`, scored with "
         "`run_real_multi.score()` (churn-neutral F1 + damage). The degenerate policies pin",
         "the metric: no-op = floor (F1 0, damage 0), oracle = ceiling (F1 1, damage 0),",
         "random-edit (seeded, 5% of cells) = vandalism the metric must punish. Abstain-all",
         "is score-identical to no-op — the repair metric is flag-blind by design.", "",
         "| policy | macro F1 | macro P | macro R | macro damage | fixed | damage cells |",
         "|---|---|---|---|---|---|---|"]
    for name, _ in POLICIES:
        ma, mi = P[name]["macro"], P[name]["micro"]
        L.append(f"| {name} | {ma['f1']:.3f} | {ma['precision']:.3f} | {ma['recall']:.3f} "
                 f"| {ma['damage']:.4f} | {mi['fixed']} | {mi['damage_cells']} |")
    L += ["", "## Cost-weighted scores (Effective-Reliability style, W4.4)", "",
          "score_c = fixes − c·damage_cells, micro-summed over all pairs; per-error =",
          f"score_c / {P['shipped']['micro']['errors']} total benchmark errors.", "",
          "| policy | " + " | ".join(f"c={c} (per-error)" for c in COSTS) + " |",
          "|---|" + "---|" * len(COSTS)]
    for name, _ in POLICIES:
        sc = P[name]["score_c"]
        L.append(f"| {name} | " + " | ".join(
            f"{sc[f'c={c}']['raw']} ({sc[f'c={c}']['per_error']:+.3f})" for c in COSTS) + " |")
    a = out["acceptance"]
    L += ["", f"Acceptance: oracle F1 = 1.0 on all pairs: **{a['oracle_f1_all_exactly_1']}** · "
          f"no-op damage = 0.0 on all pairs: **{a['noop_damage_all_exactly_0']}**",
          f"Repro: `uv run python -m eval.degenerate` (seed {SEED}, edit fraction {EDIT_FRAC})."]
    # The report contains non-ASCII characters (−, ·, —); pin the encoding so
    # the write does not depend on the platform default.
    (ROOT / "docs" / "DEGENERATE_BASELINES.md").write_text("\n".join(L) + "\n",
                                                           encoding="utf-8")
    print(f"{out['n_pairs']} pairs x {len(POLICIES)} policies -> {args.out} "
          "+ docs/DEGENERATE_BASELINES.md")
    print("acceptance:", out["acceptance"])
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
if __name__ == "__main__":
|
| 172 |
+
main()
|
eval/diagnose_model.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Diagnose vanilla-model failures: truncation vs genuine schema violation.
|
| 2 |
+
|
| 3 |
+
Runs N examples through an Ollama Cloud model, categorizing each output:
|
| 4 |
+
empty / no_json / truncated / json_but_schema_invalid / valid
|
| 5 |
+
and reading `oll`'s stderr token counts to detect output hitting the cap.
|
| 6 |
+
|
| 7 |
+
uv run eval/diagnose_model.py --n 12 --model glm-5.1 --max-tokens 8000
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import argparse
|
| 13 |
+
import json
|
| 14 |
+
import random
|
| 15 |
+
import re
|
| 16 |
+
import subprocess
|
| 17 |
+
from collections import Counter
|
| 18 |
+
|
| 19 |
+
from jsonschema import Draft202012Validator
|
| 20 |
+
|
| 21 |
+
from scrubdata.prompt import SYSTEM_PROMPT, build_user_prompt
|
| 22 |
+
from scrubdata.profiler import profile_dataframe
|
| 23 |
+
from training.generate import make_example
|
| 24 |
+
|
| 25 |
+
from .metrics import PLAN_SCHEMA
|
| 26 |
+
|
| 27 |
+
_V = Draft202012Validator(PLAN_SCHEMA)
|
| 28 |
+
_TOK = re.compile(r"out\s+(\d+)\s*tok", re.I)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _call(user: str, model: str, max_tokens: int):
    """Run one prompt through the `oll` CLI.

    The user prompt is piped on stdin; the system prompt, token cap and a
    temperature of 0 are passed as command-line flags.

    Returns:
        (stdout, out_tok) where out_tok is the integer matched by `_TOK` in
        the process's stderr, or None when stderr has no match.
    """
    command = [
        "oll", "--model", model, "--system", SYSTEM_PROMPT,
        "--max-tokens", str(max_tokens), "--temperature", "0",
    ]
    proc = subprocess.run(command, input=user, capture_output=True,
                          text=True, timeout=300)
    match = _TOK.search(proc.stderr or "")
    tokens = int(match.group(1)) if match else None
    return proc.stdout, tokens
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _categorize(out: str, out_tok: int | None, max_tokens: int):
|
| 44 |
+
s = out.strip()
|
| 45 |
+
if not s:
|
| 46 |
+
return "empty", None
|
| 47 |
+
i, j = s.find("{"), s.rfind("}")
|
| 48 |
+
if i == -1:
|
| 49 |
+
return "no_json", None
|
| 50 |
+
near_cap = out_tok is not None and out_tok >= max_tokens - 50
|
| 51 |
+
if j < i:
|
| 52 |
+
return ("truncated" if near_cap else "no_close_brace"), None
|
| 53 |
+
try:
|
| 54 |
+
plan = json.loads(s[i:j + 1])
|
| 55 |
+
except json.JSONDecodeError:
|
| 56 |
+
return ("truncated" if near_cap else "malformed_json"), None
|
| 57 |
+
errs = sorted(_V.iter_errors(plan), key=lambda e: e.path)
|
| 58 |
+
if not errs:
|
| 59 |
+
return "valid", None
|
| 60 |
+
return "schema_invalid", errs[0].message[:90]
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def main() -> None:
    """CLI entry point: run N generated examples through a model and tally outcomes.

    For each example a dirty table is generated with `make_example`, profiled,
    turned into a user prompt, sent through `_call`, and the raw output is
    classified by `_categorize`. One line is printed per example, followed by a
    category breakdown and a summary line.

    Flags: --n (examples), --model, --max-tokens, --seed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=12)
    ap.add_argument("--model", type=str, default="glm-5.1")
    ap.add_argument("--max-tokens", type=int, default=8000)
    ap.add_argument("--seed", type=int, default=4242)
    args = ap.parse_args()

    rng = random.Random(args.seed)
    cats = Counter()
    print(f"Diagnosing {args.model} @ max_tokens={args.max_tokens} on {args.n} examples\n")
    for k in range(args.n):
        ex = make_example(rng)
        user = build_user_prompt(profile_dataframe(ex["dirty_df"]), ex["dirty_df"])
        out, out_tok = _call(user, args.model, args.max_tokens)
        cat, detail = _categorize(out, out_tok, args.max_tokens)
        cats[cat] += 1
        print(f" ex{k:2d}: {cat:<16} out_tok={out_tok}"
              + (f" [{detail}]" if detail else ""))

    print("\nBreakdown:", dict(cats))
    valid = cats.get("valid", 0)
    trunc = cats.get("truncated", 0)
    # With --n 0 (or a negative n) no example runs; report 0% instead of
    # dividing by zero in the summary line.
    valid_share = valid / args.n if args.n > 0 else 0.0
    print(f"valid={valid}/{args.n} ({valid_share:.0%}) | truncated={trunc} "
          f"| schema_invalid={cats.get('schema_invalid', 0)}")
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
if __name__ == "__main__":
|
| 91 |
+
main()
|
eval/equivalence.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""W2.d — TOST equivalence statistics for the SFT null (the bounded negative claim).
|
| 2 |
+
|
| 3 |
+
Operationalizes "weight interventions did not move held-out repair": paired
|
| 4 |
+
per-dataset GEN-F1 deltas (retrain minus champion v6) over the 3 held-out EVAL
|
| 5 |
+
sources x the 5-retrain SFT series (challenger seed31, v7 seed32, v8 seed33,
|
| 6 |
+
v9 seed34, v10 seed35), pooled (n=15). DISCLOSED granularity: the retrain series
|
| 7 |
+
was scored per held-out SOURCE only (eval/results/generalization_*.json) — the
|
| 8 |
+
42-pair paired bench exists for the shipped pipeline, not per retrain — so the
|
| 9 |
+
unit here is per-dataset, not per-pair, and within-retrain deltas are clustered
|
| 10 |
+
(flights/rayyan deltas are near-identical across retrains). A retrain-level
|
| 11 |
+
robustness check (n=5 macro deltas, one per retrain) is reported alongside.
|
| 12 |
+
|
| 13 |
+
PRE-REGISTERED (docs/ROADMAP_PUBLICATION.md W2.d, before this analysis ran):
|
| 14 |
+
SESOI delta = +/-0.05 GEN-F1, justified as smaller than the gain deterministic
|
| 15 |
+
grounding provides. TOST per Lakens'17: two one-sided t-tests against the SESOI
|
| 16 |
+
bounds; equivalence p = max of the two. Bootstrap: 10k resamples, seed 42, 90% CI.
|
| 17 |
+
|
| 18 |
+
uv run python -m eval.equivalence
|
| 19 |
+
Writes eval/results/equivalence.json.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
import json
|
| 25 |
+
from pathlib import Path
|
| 26 |
+
|
| 27 |
+
import numpy as np
|
| 28 |
+
from scipy import stats
|
| 29 |
+
|
| 30 |
+
RESULTS = Path(__file__).resolve().parent / "results"
|
| 31 |
+
SESOI = 0.05 # pre-registered (roadmap W2.d) — do not change post hoc
|
| 32 |
+
N_BOOT = 10_000
|
| 33 |
+
SEED = 42
|
| 34 |
+
|
| 35 |
+
CHAMPION = "generalization_champion.json" # champion v6/seed21 (union)
|
| 36 |
+
RETRAINS = [ # the five SFT retrains (paper sec:negative)
|
| 37 |
+
("generalization_challenger.json", "challenger seed31"),
|
| 38 |
+
("generalization_v7.json", "v7 seed32 (unicode-punct archetype)"),
|
| 39 |
+
("generalization_v8.json", "v8 seed33 (+109k harvested alias vocabs)"),
|
| 40 |
+
("generalization_v9.json", "v9 seed34 (+MusicBrainz hints, gidcl pairs)"),
|
| 41 |
+
("generalization_v10.json", "v10 seed35 (suspects-contract)"),
|
| 42 |
+
]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _per_source_f1(fname: str) -> tuple[dict[str, float], float]:
    """Load one generalization result file from RESULTS.

    Only the first record of the JSON list in the file is read.

    Returns:
        (per_source, macro): per_source maps each entry's "source" to its "f1";
        macro is the record's "gen_f1". Callers unpack both values, so the
        annotation is a tuple (it previously claimed a bare dict).
    """
    rec = json.loads((RESULTS / fname).read_text())[0]
    return {s["source"]: s["f1"] for s in rec["per_source"]}, rec["gen_f1"]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _tost(deltas: np.ndarray) -> dict:
    """TOST equivalence test of `deltas` against the interval [-SESOI, +SESOI].

    Runs two one-sided one-sample t-tests: mean greater than -SESOI, and mean
    less than +SESOI. The equivalence p-value is the larger of the two.

    Returns a dict with p_lower, p_upper, p_tost, n, mean and sd (sample
    standard deviation, ddof=1), all as plain Python numbers.
    """
    lower_test = stats.ttest_1samp(deltas, -SESOI, alternative="greater")
    upper_test = stats.ttest_1samp(deltas, +SESOI, alternative="less")
    p_lower = lower_test.pvalue
    p_upper = upper_test.pvalue
    return {
        "p_lower": float(p_lower),
        "p_upper": float(p_upper),
        "p_tost": float(max(p_lower, p_upper)),
        "n": int(len(deltas)),
        "mean": float(deltas.mean()),
        "sd": float(deltas.std(ddof=1)),
    }
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def main() -> dict:
    """Run the equivalence analysis, write equivalence.json, and return the result.

    Steps:
      1. Load the champion's per-source F1 and each retrain's per-source F1.
      2. Pool the per-source deltas (retrain minus champion) over all retrains.
      3. Bootstrap the mean of the pooled deltas (N_BOOT resamples, seeded with
         SEED) and take the 5th/95th percentiles as a 90% interval.
      4. Run TOST on the pooled deltas and, separately, on the per-retrain
         macro deltas.
      5. Write the full result to RESULTS / "equivalence.json" and print a
         three-key excerpt.

    Returns:
        The full result dict that was written to disk.
    """
    champ, champ_macro = _per_source_f1(CHAMPION)
    pooled, per_retrain = [], []
    for fname, label in RETRAINS:
        ps, macro = _per_source_f1(fname)
        # Every retrain must have been scored on exactly the champion's sources,
        # otherwise the paired deltas below are not well defined.
        assert set(ps) == set(champ), f"{fname}: source mismatch vs champion"
        per_retrain.append({
            "retrain": label, "file": fname,
            "macro_gen_f1": round(macro, 6),
            "macro_delta": round(macro - champ_macro, 6),
            "per_dataset_delta": {s: round(ps[s] - champ[s], 6) for s in champ},
        })
        # The pooled sample keeps the UNROUNDED deltas, in sorted-source order.
        pooled += [ps[s] - champ[s] for s in sorted(champ)]
    deltas = np.array(pooled)

    # Seeded bootstrap of the pooled mean; resamples are the same size as the data.
    rng = np.random.default_rng(SEED)
    boot = np.array([rng.choice(deltas, size=len(deltas), replace=True).mean()
                     for _ in range(N_BOOT)])
    # 5th and 95th percentiles bound a 90% interval.
    ci = (float(np.percentile(boot, 5)), float(np.percentile(boot, 95)))

    # Retrain-level check uses the macro deltas already rounded to 6 decimals above.
    macro_deltas = np.array([r["macro_delta"] for r in per_retrain])
    out = {
        "spec": {"sesoi": SESOI, "sesoi_preregistered": "docs/ROADMAP_PUBLICATION.md W2.d",
                 "n_boot": N_BOOT, "seed": SEED, "ci_level": 0.90,
                 "champion": CHAMPION, "champion_macro_gen_f1": round(champ_macro, 6)},
        "granularity": ("per-dataset (3 held-out sources x 5 retrains = 15 paired "
                        "deltas). Per-pair rows do not exist for the retrain series "
                        "(only the shipped pipeline was scored on the 42-pair bench); "
                        "within-retrain deltas are clustered, hence the retrain-level "
                        "robustness check below."),
        "per_retrain": per_retrain,
        "pooled_per_dataset": {
            **_tost(deltas),
            "ci90_bootstrap": [round(ci[0], 6), round(ci[1], 6)],
            "ci90_width": round(ci[1] - ci[0], 6),
            # True only when the whole bootstrap interval lies strictly inside
            # (-SESOI, +SESOI).
            "equivalent_at_sesoi": bool(-SESOI < ci[0] and ci[1] < SESOI),
        },
        "retrain_level_robustness": _tost(macro_deltas),
        # NOTE(review): the figures in this caveat (0.015, ~0.004) are hard-coded
        # text, not derived from the values computed above - confirm they still
        # match when inputs change.
        "caveat": ("GEN-F1 sits near floor (champion 0.015 absolute), so the bound "
                   "certifies absence of movement on a low-dynamic-range metric; "
                   "the CI width (~0.004) shows the data could have detected effects "
                   "an order of magnitude smaller than the 0.05 SESOI."),
    }
    p = out["pooled_per_dataset"]
    # LaTeX-ready sentence; "\\%" and "$\\pm$" are escaped for the paper source.
    # NOTE(review): "n=5 macro deltas" and "0.05" are literal text here rather
    # than len(RETRAINS) / SESOI.
    out["paper_sentence"] = (
        f"Across the five-retrain series the mean held-out GEN-F1 delta (retrain "
        f"minus champion, per-dataset, n={p['n']}) is {p['mean']:+.4f} (90\\% "
        f"bootstrap CI [{ci[0]:+.4f}, {ci[1]:+.4f}]); TOST rejects effects larger "
        f"than the pre-registered $\\pm$0.05 SESOI (p = {p['p_tost']:.1e}), and the "
        f"retrain-level check (n=5 macro deltas) agrees "
        f"(p = {out['retrain_level_robustness']['p_tost']:.1e}).")

    (RESULTS / "equivalence.json").write_text(json.dumps(out, indent=2) + "\n")
    print(json.dumps({k: out[k] for k in ("pooled_per_dataset",
                                          "retrain_level_robustness",
                                          "paper_sentence")}, indent=2))
    return out
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
|
| 119 |
+
main()
|
eval/generalization.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""D1 — the GENERALIZATION metric: held-out-source real-error evaluation.
|
| 2 |
+
|
| 3 |
+
The wide-suite REAL slice mixes sources whose pairs are IN the champion's training mix
|
| 4 |
+
(hospital/beers/movies_1 -> mixA), so it part-measures memorization. This metric fixes
|
| 5 |
+
that and one more honesty problem:
|
| 6 |
+
|
| 7 |
+
* HELD-OUT SOURCES ONLY: a model is scored only on real-error benchmarks whose pairs
|
| 8 |
+
were never used to train it. The split is explicit and committed (TRAIN_SOURCES);
|
| 9 |
+
new harvested sources must be assigned to exactly one side.
|
| 10 |
+
* ERROR-CLASS BREAKDOWN: benchmark errors split by the SAME variant gate the training
|
| 11 |
+
derivation uses (training.real_data._is_variant — one source of truth). A
|
| 12 |
+
canonicalization system claims competence on the VARIANT class (typos / casing /
|
| 13 |
+
aliases); imputation-class errors (missing or non-variant rewrites) are reported,
|
| 14 |
+
never hidden, but a system that abstains on them is behaving correctly.
|
| 15 |
+
|
| 16 |
+
Headline numbers per system:
|
| 17 |
+
GEN-F1 churn-neutral F1 over ALL errors, macro over held-out sources
|
| 18 |
+
VARIANT-RECALL share of variant-class errors repaired (claimed competence)
|
| 19 |
+
VARIANT-PREC of committed changes on variant cells, share correct
|
| 20 |
+
damage clean cells corrupted (churn-neutral)
|
| 21 |
+
|
| 22 |
+
DISCLOSED class imperfection: the string-variant gate over-counts on flights —
|
| 23 |
+
single-digit time differences ('7:59 p.m.' vs '7:58 p.m.') pass the similarity
|
| 24 |
+
threshold but are cross-source VALUE disagreements (need per-entity cross-row
|
| 25 |
+
voting, a different capability), not surface canonicalization. ~950 of flights'
|
| 26 |
+
1049 "variant" errors are of this kind; treat flights' variant-recall as a
|
| 27 |
+
lower-bound stress number, not addressable headroom.
|
| 28 |
+
|
| 29 |
+
uv run python -m eval.generalization # grounded heuristic baseline
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import argparse
|
| 35 |
+
import json
|
| 36 |
+
|
| 37 |
+
from scrubdata.executor import apply_plan
|
| 38 |
+
from scrubdata.planner import mock_plan
|
| 39 |
+
from training.real_data import _is_variant
|
| 40 |
+
|
| 41 |
+
from .metrics import _cell_equal
|
| 42 |
+
from .run_real_multi import _cell_only, _fetch, _sem_equal, score
|
| 43 |
+
|
| 44 |
+
# pairs used to train the current champion (v6 = mixA) — anything here is OFF-LIMITS
|
| 45 |
+
# for generalization scoring of that model. Update per training run.
|
| 46 |
+
TRAIN_SOURCES = {"v6": {"hospital", "beers", "movies_1"}}
|
| 47 |
+
|
| 48 |
+
# held-out real-error sources. Harvested D1 sources get appended here OR to the
|
| 49 |
+
# training side — never both. ed2_restaurants (stage-2 harvest): real NYC-restaurant
|
| 50 |
+
# typos, in-regime, EVAL-ONLY — its sibling domain source (fodors_zagats) trains, so
|
| 51 |
+
# this measures cross-source same-domain transfer. dblp_scholar was REJECTED as an
|
| 52 |
+
# eval source: its gold systematically prefers the opposite case convention from the
|
| 53 |
+
# dirty side (Scholar lowercase vs DBLP Title Case), which measures convention
|
| 54 |
+
# preference, not cleaning — the artifact this metric is designed against.
|
| 55 |
+
EVAL_SOURCES = ["flights", "rayyan", "ed2_restaurants"]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def variant_breakdown(dirty, clean, out) -> dict:
    """Split benchmark errors by class and count repairs per class (churn-neutral).

    A benchmark error is a cell where the dirty and clean values differ per
    `_cell_equal`. Each error is classed as "variant" (both values are
    non-blank as strings and `_is_variant` accepts the pair) or "other".

    Args:
        dirty: the input table.
        clean: the ground-truth table; read positionally, so it is assumed to
            share dirty's column order - TODO confirm against callers.
        out: the system's output table; columns are looked up by name.

    Returns:
        Counter dict with keys variant_errors, variant_fixed, variant_changed,
        variant_good, other_errors, other_fixed.
    """
    # Only rows present in all three frames are compared.
    n = min(len(dirty), len(out), len(clean))
    c = {"variant_errors": 0, "variant_fixed": 0, "variant_changed": 0,
         "variant_good": 0, "other_errors": 0, "other_fixed": 0}
    for j, col in enumerate(dirty.columns):
        present = col in out.columns
        for i in range(n):
            dv, cv = dirty.iat[i, j], clean.iat[i, j]
            if _cell_equal(dv, cv):
                continue  # not a benchmark error
            # A column missing from the output counts as "left untouched".
            ov = out.iloc[i][col] if present else dv
            chg = present and not _cell_equal(ov, dv)
            # A rewrite that `_sem_equal` still equates with the dirty value, and
            # that is not exactly the clean value, is not counted as a change.
            if chg and _sem_equal(ov, dv) and not _cell_equal(ov, cv):
                chg = False  # churn: ignore
            # Fixed = exact match to clean, or a counted change that
            # `_sem_equal` equates with clean.
            fixed = _cell_equal(ov, cv) or (_sem_equal(ov, cv) and chg)
            # Truthy only when both sides are non-blank strings AND pass the gate.
            is_variant = (str(dv).strip() and str(cv).strip()
                          and _is_variant(str(dv), str(cv)))
            if is_variant:
                c["variant_errors"] += 1
                c["variant_fixed"] += int(fixed)
                if chg:
                    # Denominator / numerator of the variant-precision rate.
                    c["variant_changed"] += 1
                    c["variant_good"] += int(_sem_equal(ov, cv))
            else:
                c["other_errors"] += 1
                c["other_fixed"] += int(fixed)
    return c
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def evaluate_generalization(planner, sources=None, label: str = "system") -> dict:
    """Score `planner` on each source and aggregate the per-source rows.

    Args:
        planner: callable taking the dirty table and returning a plan.
        sources: source names to evaluate; defaults to EVAL_SOURCES when falsy.
        label: system name carried into the aggregate and its printed summary.

    Returns:
        The dict produced by `_aggregate` for the collected rows.
    """
    sources = sources or EVAL_SOURCES
    metric_keys = ("f1", "precision", "recall", "damage")
    per_source = []
    for src in sources:
        # FULL tables, no truncation — ed2_restaurants' real errors are concentrated
        # outside the first 2k rows (_raha_pair's head(2000) hid 473 of 477).
        dirty, clean = _fetch(src)
        plan = _cell_only(planner(dirty))
        cleaned, _ = apply_plan(dirty, plan)
        scores = score(dirty, clean, cleaned)
        counts = variant_breakdown(dirty, clean, cleaned)

        row = {"source": src}
        row.update((key, scores[key]) for key in metric_keys)
        row.update(counts)
        per_source.append(row)

        print(f" {src:<10} F1={scores['f1']:.3f} dmg={scores['damage']:.3f} | variant: "
              f"{counts['variant_fixed']}/{counts['variant_errors']} fixed, "
              f"{counts['variant_good']}/{counts['variant_changed']} changes good | "
              f"other: {counts['other_fixed']}/{counts['other_errors']}", flush=True)

    return _aggregate(per_source, sources, label)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def evaluate_captured_union(plans: dict, sources, label: str, tau: float = 0.5) -> dict:
    """Score the SHIPPED pipeline from captured raw model plans (Modal --capture).

    For every source, the captured plan `plans[source]` is passed through
    `verify_plan` at threshold `tau`, then unioned with the grounded heuristic
    plan (`mock_plan`). The original note describes this composition as
    byte-identical to scrubdata/active.py.

    Returns:
        The dict produced by `_aggregate` for the collected rows.
    """
    from scrubdata.verifier import union_plans, verify_plan

    def shipped_plan(df, source):
        # Verify first, then build the heuristic plan, then union the two.
        verified = verify_plan(df, plans[source], tau=tau)
        return union_plans(verified, mock_plan(df))

    metric_keys = ("f1", "precision", "recall", "damage")
    per_source = []
    for src in sources:
        dirty, clean = _fetch(src)
        cleaned, _ = apply_plan(dirty, _cell_only(shipped_plan(dirty, src)))
        scores = score(dirty, clean, cleaned)
        counts = variant_breakdown(dirty, clean, cleaned)

        row = {"source": src}
        row.update((key, scores[key]) for key in metric_keys)
        row.update(counts)
        per_source.append(row)

        print(f" {src:<16} F1={scores['f1']:.3f} dmg={scores['damage']:.3f} | variant: "
              f"{counts['variant_fixed']}/{counts['variant_errors']} fixed", flush=True)
    return _aggregate(per_source, sources, label)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _aggregate(rows, sources, label) -> dict:
|
| 133 |
+
def mean(xs):
|
| 134 |
+
xs = list(xs)
|
| 135 |
+
return sum(xs) / len(xs) if xs else 0.0
|
| 136 |
+
|
| 137 |
+
def rate(num, den):
|
| 138 |
+
return num / den if den else 0.0
|
| 139 |
+
|
| 140 |
+
out = {
|
| 141 |
+
"system": label, "sources": list(sources),
|
| 142 |
+
"gen_f1": mean(r["f1"] for r in rows),
|
| 143 |
+
"variant_recall": mean(rate(r["variant_fixed"], r["variant_errors"]) for r in rows),
|
| 144 |
+
"variant_precision": mean(rate(r["variant_good"], r["variant_changed"])
|
| 145 |
+
if r["variant_changed"] else 1.0 for r in rows),
|
| 146 |
+
"other_recall": mean(rate(r["other_fixed"], r["other_errors"]) for r in rows),
|
| 147 |
+
"damage": mean(r["damage"] for r in rows),
|
| 148 |
+
"per_source": rows,
|
| 149 |
+
}
|
| 150 |
+
print(f"{label}: GEN-F1={out['gen_f1']:.3f} VARIANT-RECALL={out['variant_recall']:.3f} "
|
| 151 |
+
f"VARIANT-PREC={out['variant_precision']:.3f} dmg={out['damage']:.3f}")
|
| 152 |
+
return out
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def main() -> None:
    """CLI entry point: score held-out sources and write the results as JSON.

    With --plans, the shipped union pipeline is scored from the captured plans
    in that JSON file. Without it, two local baselines are scored: the grounded
    heuristic (`mock_plan`) and a no-op planner that returns an empty plan.

    Flags: --sources (comma-separated), --plans, --label, --out.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--sources", default=",".join(EVAL_SOURCES))
    ap.add_argument("--plans", default=None,
                    help="JSON file {source: captured raw model plan} -> score the "
                         "shipped union pipeline instead of the local baselines")
    ap.add_argument("--label", default="captured union")
    ap.add_argument("--out", default="eval/results/generalization_baseline.json")
    args = ap.parse_args()
    sources = args.sources.split(",")
    if args.plans:
        # Context manager so the plans file is closed even if parsing fails.
        with open(args.plans, encoding="utf-8") as fh:
            plans = json.load(fh)
        results = [evaluate_captured_union(plans, sources, args.label)]
    else:
        results = [
            evaluate_generalization(mock_plan, sources, "grounded heuristic"),
            evaluate_generalization(
                lambda df: {"table_operations": [], "columns": [], "flags": []},
                sources, "no-op"),
        ]
    # Opened only after scoring succeeded; closing on exit guarantees the JSON
    # is flushed to disk before "written to" is printed.
    with open(args.out, "w", encoding="utf-8") as fh:
        json.dump(results, fh, indent=1)
    print(f"written to {args.out}")
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
if __name__ == "__main__":
|
| 180 |
+
main()
|
eval/gittables_audit.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""N=250 GitTables audit — the at-scale trust + repair board.
|
| 2 |
+
|
| 3 |
+
250 real GitHub tables (LUH-DBS Matelda GitTables-subsets, Apache-2.0; injected
|
| 4 |
+
typos on real heterogeneous tables) scored end-to-end with the shipped pipeline:
|
| 5 |
+
schema validity, SILENT-EDIT attribution (the trust contract at scale), and the
|
| 6 |
+
churn-neutral repair metric. No inject-recovery here (these pairs carry their own
|
| 7 |
+
errors). Summary feeds docs/GITTABLES_AUDIT.md.
|
| 8 |
+
|
| 9 |
+
uv run python -m eval.gittables_audit
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
import time
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
import pandas as pd
|
| 19 |
+
|
| 20 |
+
from scrubdata.executor import apply_plan
|
| 21 |
+
from scrubdata.planner import mock_plan
|
| 22 |
+
|
| 23 |
+
from .metrics import is_valid
|
| 24 |
+
from .run_real_multi import _cell_only, score
|
| 25 |
+
from .wild_bench import behavioral
|
| 26 |
+
|
| 27 |
+
ROOT = Path(__file__).resolve().parent.parent
|
| 28 |
+
DIR = ROOT / "data" / "gittables250"
|
| 29 |
+
N_CAP = 3000
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _load(p: Path):
    """Read one CSV as an all-string DataFrame, capped at N_CAP rows.

    Empty cells stay empty strings (no NaN conversion), malformed lines are
    skipped, and undecodable bytes are replaced. If the default parser raises,
    the read is retried once with pandas' Python engine; an error from the
    retry propagates to the caller.
    """
    # encoding_errors lives in the shared kwargs so the fallback read tolerates
    # bad bytes too (it was previously passed to the first attempt only).
    kw = dict(dtype=str, keep_default_na=False, nrows=N_CAP, on_bad_lines="skip",
              encoding_errors="replace")
    try:
        return pd.read_csv(p, **kw)
    except Exception:  # noqa: BLE001
        # Deliberate best-effort: any parser failure gets one retry on the
        # more lenient Python engine.
        return pd.read_csv(p, engine="python", **kw)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def main() -> None:
    """Audit every dirty/clean table pair under DIR and write the summary files.

    For each `t*_dirty.csv` / `*_clean.csv` pair: load both, trim to the common
    row count, skip pairs with fewer than 3 rows or fewer than 2 columns, then
    record the behavioral checks and the repair score of the heuristic plan.
    A pair that raises is recorded in `failures` (slug plus exception type) and
    does not stop the run.

    Writes eval/results/gittables_audit.json and docs/GITTABLES_AUDIT.md, and
    prints the summary.
    """
    slugs = sorted({p.name.split("_")[0] for p in DIR.glob("t*_dirty.csv")})
    rows, failures = [], []
    t0 = time.perf_counter()
    for slug in slugs:
        try:
            dirty = _load(DIR / f"{slug}_dirty.csv")
            clean = _load(DIR / f"{slug}_clean.csv")
            n = min(len(dirty), len(clean))
            if n < 3 or dirty.shape[1] < 2:
                continue
            dirty, clean = dirty.head(n), clean.head(n)
            b = behavioral(dirty)
            plan = _cell_only(mock_plan(dirty))
            cleaned, _ = apply_plan(dirty, plan)
            m = score(dirty, clean, cleaned)
            rows.append({"table": slug, "rows": n, "cols": dirty.shape[1],
                         "plan_valid": b["plan_valid"],
                         "silent_edit_columns": len(b["silent_edit_columns"]),
                         "errors": m["_errors"], "f1": round(m["f1"], 3),
                         "damage": round(m["damage"], 4)})
        except Exception as e:  # noqa: BLE001
            # Best-effort audit: one broken table must not abort the other 249.
            failures.append(f"{slug}: {type(e).__name__}")
    dt = time.perf_counter() - t0

    n = len(rows)
    valid = sum(r["plan_valid"] for r in rows)
    silent = sum(1 for r in rows if r["silent_edit_columns"])
    # F1 is averaged only over tables that actually contain benchmark errors;
    # damage is averaged over every audited table.
    scored = [r for r in rows if r["errors"] > 0]
    f1s = [r["f1"] for r in scored]
    dmgs = [r["damage"] for r in rows]
    summary = {
        "tables_audited": n, "pipeline_failures": len(failures),
        "plan_valid": valid, "tables_with_silent_edits": silent,
        "tables_with_errors": len(scored),
        "macro_f1_on_errored": round(sum(f1s) / len(f1s), 3) if f1s else None,
        # Guarded like macro_f1_on_errored: an empty or missing data directory
        # yields None instead of a ZeroDivisionError.
        "macro_damage": round(sum(dmgs) / len(dmgs), 4) if dmgs else None,
        "zero_damage_tables": sum(1 for d in dmgs if d == 0),
        "seconds": round(dt, 1),
    }
    # Context manager so the results file is flushed and closed deterministically.
    with open(ROOT / "eval" / "results" / "gittables_audit.json", "w") as fh:
        json.dump({"summary": summary, "rows": rows, "failures": failures},
                  fh, indent=1)
    L = ["# GitTables N=250 audit — trust contract at scale", "",
         f"Shipped pipeline over {n} real GitHub tables (Matelda GitTables-subsets,",
         "Apache-2.0; injected typos on real heterogeneous tables).", "",
         "| metric | value |", "|---|---|"]
    for k, v in summary.items():
        L.append(f"| {k} | {v} |")
    (ROOT / "docs" / "GITTABLES_AUDIT.md").write_text("\n".join(L) + "\n")
    print(json.dumps(summary, indent=1))
    if failures:
        print("failures:", failures[:8])
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
if __name__ == "__main__":
|
| 95 |
+
main()
|
eval/gold.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval/gold.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Frozen held-out gold eval set (committed to eval/gold.jsonl).
|
| 2 |
+
|
| 3 |
+
A FIXED test set so every fine-tune iteration (and generator change) is scored on the
|
| 4 |
+
same examples — v1 vs v2 stay comparable. Regenerate intentionally with `build_gold`.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import random
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
import pandas as pd
|
| 14 |
+
|
| 15 |
+
from training.generate import make_example
|
| 16 |
+
|
| 17 |
+
from . import metrics
|
| 18 |
+
|
| 19 |
+
GOLD_PATH = Path(__file__).resolve().parent / "gold.jsonl"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def build_gold(n: int = 300, seed: int = 4242, path: Path = GOLD_PATH) -> list[dict]:
    """Generate `n` gold examples from a seeded RNG and write them to `path` as JSONL.

    Examples are drawn one at a time from `make_example`; only those whose
    reference plan scores a recovery of at least 0.999 are kept. Each kept
    example becomes one JSON line holding the dirty and clean records, both
    column orders, and the plan. Missing values in the clean table are written
    as null.

    Returns:
        The kept examples, as produced by `make_example`.
    """
    rng = random.Random(seed)
    kept = []
    while len(kept) < n:
        candidate = make_example(rng)
        recovered = metrics.recovery(
            candidate["clean_df"], candidate["dirty_df"], candidate["plan"])
        if recovered >= 0.999:
            kept.append(candidate)

    with Path(path).open("w", encoding="utf-8") as fh:
        for example in kept:
            dirty_df = example["dirty_df"]
            clean_df = example["clean_df"]
            clean_nulled = clean_df.where(pd.notna(clean_df), None)
            record = {
                "dirty": dirty_df.to_dict("records"),
                "clean": clean_nulled.to_dict("records"),
                "dirty_cols": list(dirty_df.columns),
                "clean_cols": list(clean_df.columns),
                "plan": example["plan"],
            }
            fh.write(json.dumps(record, ensure_ascii=False, default=str) + "\n")
    return kept
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def load_gold(path: Path = GOLD_PATH) -> list[dict]:
    """Load the frozen gold set from `path`, building it first if the file is missing.

    Each JSON line is turned back into a dict with "dirty_df", "clean_df" and
    "plan". Frames are rebuilt with the stored column order; a table with no
    records becomes an empty frame that still carries its columns.
    """
    gold_file = Path(path)
    if not gold_file.exists():
        return build_gold(path=gold_file)

    def _frame(records, columns):
        if records:
            return pd.DataFrame(records)[columns]
        return pd.DataFrame(columns=columns)

    examples = []
    for raw_line in gold_file.read_text(encoding="utf-8").splitlines():
        rec = json.loads(raw_line)
        dirty_df = _frame(rec["dirty"], rec["dirty_cols"])
        clean_df = _frame(rec["clean"], rec["clean_cols"])
        examples.append({"dirty_df": dirty_df, "clean_df": clean_df,
                         "plan": rec["plan"]})
    return examples
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
if __name__ == "__main__":
|
| 58 |
+
import argparse
|
| 59 |
+
ap = argparse.ArgumentParser()
|
| 60 |
+
ap.add_argument("--n", type=int, default=300)
|
| 61 |
+
ap.add_argument("--seed", type=int, default=4242)
|
| 62 |
+
args = ap.parse_args()
|
| 63 |
+
g = build_gold(args.n, args.seed)
|
| 64 |
+
print(f"Wrote {len(g)} frozen gold examples to {GOLD_PATH}")
|
eval/inject.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Seeded, self-verifying error injection — turns any CLEAN table into dirty/clean
|
| 2 |
+
validation. This is the de-biasing core of the north-star: our 20+ harvested clean
|
| 3 |
+
domains become per-cell-ground-truth validation across error types, far beyond any one
|
| 4 |
+
published benchmark.
|
| 5 |
+
|
| 6 |
+
Self-contained (no nlpaug/BART deps): we inject a KNOWN corruption into a clean cell, so
|
| 7 |
+
the (dirty -> clean) ground truth is exact and the run is reproducible (fixed seed).
|
| 8 |
+
|
| 9 |
+
Injects RECOVERABLE error types (the cleaner can restore the clean value): typo, ocr,
|
| 10 |
+
case, whitespace — i.e. the canonicalization + format axes. Targets CATEGORICAL text
|
| 11 |
+
columns (recurring values), where canonicalization is the task.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import random
|
| 17 |
+
import string
|
| 18 |
+
|
| 19 |
+
_OCR = {"O": "0", "o": "0", "l": "1", "I": "1", "S": "5", "s": "5",
|
| 20 |
+
"B": "8", "Z": "2", "z": "2", "g": "9", "G": "6", "b": "6"}
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _typo(s: str, rng: random.Random) -> str:
|
| 24 |
+
if len(s) < 4:
|
| 25 |
+
return s
|
| 26 |
+
i = rng.randrange(1, len(s) - 1)
|
| 27 |
+
if not s[i].isalpha():
|
| 28 |
+
return s
|
| 29 |
+
m = rng.random()
|
| 30 |
+
if m < 0.55: # substitute (the classic 'birminghxm')
|
| 31 |
+
pool = string.ascii_uppercase if s[i].isupper() else string.ascii_lowercase
|
| 32 |
+
return s[:i] + rng.choice([c for c in pool if c != s[i].lower()]) + s[i + 1:]
|
| 33 |
+
if m < 0.78: # delete
|
| 34 |
+
return s[:i] + s[i + 1:]
|
| 35 |
+
return s[:i] + s[i + 1] + s[i] + s[i + 2:] # transpose
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _ocr(s: str, rng: random.Random) -> str:
    """Swap one character for its OCR look-alike from `_OCR`.

    One position is drawn at random among the characters that have a
    look-alike. When the string has none, the call falls back to `_typo`.
    """
    candidates = [pos for pos, ch in enumerate(s) if ch in _OCR]
    if candidates:
        pos = rng.choice(candidates)
        return s[:pos] + _OCR[s[pos]] + s[pos + 1:]
    return _typo(s, rng)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _case(s: str, rng: random.Random) -> str:
|
| 47 |
+
return rng.choice([s.upper(), s.lower(), s.title()])
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _ws(s: str, rng: random.Random) -> str:
|
| 51 |
+
return rng.choice([" " * rng.randint(1, 2) + s, s + " " * rng.randint(1, 2),
|
| 52 |
+
s.replace(" ", " ", 1) if " " in s else " " + s])
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
INJECTORS = {"typo": _typo, "ocr": _ocr, "case": _case, "whitespace": _ws}
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def _categorical_text_cols(df, max_cols: int = 12) -> list[str]:
|
| 59 |
+
"""Text columns whose values RECUR (canonicalization is meaningful)."""
|
| 60 |
+
out = []
|
| 61 |
+
for c in df.columns:
|
| 62 |
+
vals = [str(v).strip() for v in df[c].tolist() if str(v).strip()]
|
| 63 |
+
if len(vals) < 20:
|
| 64 |
+
continue
|
| 65 |
+
alpha = sum(1 for v in vals if any(ch.isalpha() for ch in v)) / len(vals)
|
| 66 |
+
nonnum = 0
|
| 67 |
+
for v in vals:
|
| 68 |
+
try:
|
| 69 |
+
float(v.replace(",", ""))
|
| 70 |
+
except ValueError:
|
| 71 |
+
nonnum += 1
|
| 72 |
+
if alpha < 0.7 or nonnum / len(vals) < 0.7:
|
| 73 |
+
continue
|
| 74 |
+
if len(set(vals)) / len(vals) > 0.5: # must recur (categorical)
|
| 75 |
+
continue
|
| 76 |
+
out.append(c)
|
| 77 |
+
if len(out) >= max_cols:
|
| 78 |
+
break
|
| 79 |
+
return out
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def inject(clean_df, error_type: str, seed: int, rate: float = 0.07):
    """Build a dirty copy of `clean_df` with `error_type` errors injected.

    Each non-blank cell of the categorical-text columns is corrupted with
    probability `rate`, using a `random.Random(seed)` stream. `clean_df` itself
    is not modified (the work happens on a copy) and serves as the ground truth.

    Returns:
        The dirty frame, or None when there is no eligible column or when no
        cell actually changed.

    Raises:
        KeyError: if `error_type` is not a key of `INJECTORS`.
    """
    corrupt = INJECTORS[error_type]
    targets = _categorical_text_cols(clean_df)
    if not targets:
        return None

    rng = random.Random(seed)
    dirty = clean_df.copy()
    n_changed = 0
    for name in targets:
        cells = dirty[name].tolist()
        for pos, cell in enumerate(cells):
            text = str(cell)
            if not text.strip():
                # Blank cells are skipped without consuming a random draw.
                continue
            if rng.random() < rate:
                mutated = corrupt(text, rng)
                if mutated != text:  # an injector may return its input unchanged
                    cells[pos] = mutated
                    n_changed += 1
        dirty[name] = cells
    if n_changed:
        return dirty
    return None
|
eval/inject_validity.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""W4.5 inject-validity (TableEG-style) — does the injected slice LOOK like and RANK
|
| 2 |
+
like the real slice?
|
| 3 |
+
|
| 4 |
+
(1) Classifies every real dirty->gold cell error (hospital's 509 included, across all 42 paired
|
| 5 |
+
sources eval/paired_bench.py walks) with a deterministic taxonomy (typo/edit-dist<=2,
|
| 6 |
+
case-only, whitespace, encoding/mojibake, numeric, date-format, token-swap, missing,
|
| 7 |
+
other); (2) classifies the suite's INJECTED errors at the money-table seeds (7/17/27);
|
| 8 |
+
(3) reports Jensen-Shannon divergence (base 2) between injected and real type
|
| 9 |
+
distributions, pooled and per real source; (4) reports Kendall tau-b between system
|
| 10 |
+
rankings on the injected vs real F1 slices of money_table_head.json, with degenerate
|
| 11 |
+
policies (abstain-all / random-edit / oracle) run through the same suite as anchors.
|
| 12 |
+
Honesty rule: if the injector is far from real (high JSD), that IS the result — the
|
| 13 |
+
paper's mitigation (both slices reported separately) already stands.
|
| 14 |
+
|
| 15 |
+
uv run python -m eval.inject_validity # full run (~15 min CPU)
|
| 16 |
+
uv run python -m eval.inject_validity --tex-only # rebuild the snippet from JSON
|
| 17 |
+
Writes eval/results/inject_validity.json + eval/results/inject_validity_appendix.tex.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import collections
|
| 23 |
+
import json
|
| 24 |
+
import math
|
| 25 |
+
import time
|
| 26 |
+
from datetime import datetime
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
|
| 29 |
+
from .degenerate import _abstain_all, _oracle, _random_edit
|
| 30 |
+
from .metrics import _cell_equal
|
| 31 |
+
from .paired_bench import _load, pairs
|
| 32 |
+
from .run_real_multi import build_suite, score
|
| 33 |
+
|
| 34 |
+
# Repository root: this module sits one directory below it.
ROOT = Path(__file__).resolve().parent.parent

SEEDS = (7, 17, 27)  # money-table seeds (run_real_multi.main)

# Error taxonomy, in reporting order.
CATS = [
    "typo",
    "case",
    "whitespace",
    "encoding",
    "numeric",
    "date-format",
    "token-swap",
    "missing",
    "other",
]

# Injector name -> taxonomy class its errors are expected to fall into.
EXPECT = {
    "typo": "typo",
    "ocr": "typo",
    "case": "case",
    "whitespace": "whitespace",
}

# Substrings whose presence marks a cell as mojibake.
_MOJI = ("�", "Ã", "Â", "â€", "ï¿")

# strptime formats tried, in order, by `_date`.
_DATE_FMTS = (
    "%Y-%m-%d",
    "%m/%d/%Y",
    "%d/%m/%Y",
    "%m/%d/%y",
    "%Y/%m/%d",
    "%d-%m-%Y",
    "%b %d, %Y",
    "%B %d, %Y",
    "%d %b %Y",
    "%Y%m%d",
)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _num(s: str):
|
| 45 |
+
t = s.strip().replace(",", "").lstrip("$").rstrip("%")
|
| 46 |
+
try:
|
| 47 |
+
return float(t)
|
| 48 |
+
except ValueError:
|
| 49 |
+
return None
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _date(s: str):
    """Parse `s` with the first matching format in `_DATE_FMTS`.

    Returns a `datetime.date`, or None when no format matches.
    """
    text = s.strip()
    for fmt in _DATE_FMTS:
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            continue
        return parsed.date()
    return None
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _lev_gt2(a: str, b: str) -> bool:
|
| 62 |
+
"""True iff Levenshtein(a, b) > 2 (banded DP, O(len*5))."""
|
| 63 |
+
k = 2
|
| 64 |
+
la, lb = len(a), len(b)
|
| 65 |
+
if abs(la - lb) > k:
|
| 66 |
+
return True
|
| 67 |
+
INF = k + 1
|
| 68 |
+
prev = [min(j, INF) for j in range(lb + 1)]
|
| 69 |
+
for i in range(1, la + 1):
|
| 70 |
+
lo, hi = max(1, i - k), min(lb, i + k)
|
| 71 |
+
cur = [INF] * (lb + 1)
|
| 72 |
+
if i <= k:
|
| 73 |
+
cur[0] = i
|
| 74 |
+
for j in range(lo, hi + 1):
|
| 75 |
+
cur[j] = min(prev[j] + 1, cur[j - 1] + 1,
|
| 76 |
+
prev[j - 1] + (a[i - 1] != b[j - 1]), INF)
|
| 77 |
+
prev = cur
|
| 78 |
+
if min(prev[max(0, lo - 1):hi + 1]) >= INF:
|
| 79 |
+
return True
|
| 80 |
+
return prev[lb] > k
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def classify(d, g) -> str:
    """Assign a (dirty, gold) cell pair to one taxonomy class.

    The checks run in a fixed order and the first match wins: missing,
    whitespace, case, encoding, numeric, date-format, token-swap, then typo
    (edit distance <= 2); anything left is "other".
    """
    dirty, gold = str(d), str(g)
    if not (dirty.strip() and gold.strip()):
        return "missing"

    dirty_squeezed = "".join(dirty.split())
    gold_squeezed = "".join(gold.split())
    if dirty_squeezed == gold_squeezed:
        return "whitespace"
    if dirty_squeezed.casefold() == gold_squeezed.casefold():
        return "case"

    # Mojibake markers present on exactly one side.
    dirty_garbled = any(mark in dirty for mark in _MOJI)
    gold_garbled = any(mark in gold for mark in _MOJI)
    if dirty_garbled != gold_garbled:
        return "encoding"

    if _num(dirty) is not None and _num(gold) is not None:
        return "numeric"

    dirty_date, gold_date = _date(dirty), _date(gold)
    if dirty_date is not None and dirty_date == gold_date:
        return "date-format"

    dirty_tokens = sorted(dirty.casefold().split())
    gold_tokens = sorted(gold.casefold().split())
    if dirty_tokens == gold_tokens and len(dirty_tokens) > 1:
        return "token-swap"

    if _lev_gt2(dirty.strip(), gold.strip()):
        return "other"
    return "typo"
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _classify_pair(dirty, clean) -> collections.Counter:
    """Count taxonomy classes over the cells where `dirty` and `clean` differ.

    Cells are compared positionally: every column of `dirty`, over the rows
    both frames have. Equality is decided by `_cell_equal`.
    """
    n_rows = min(len(dirty), len(clean))
    n_cols = dirty.shape[1]
    tally = collections.Counter()
    for col in range(n_cols):
        for row in range(n_rows):
            observed = dirty.iat[row, col]
            truth = clean.iat[row, col]
            if _cell_equal(observed, truth):
                continue
            tally[classify(observed, truth)] += 1
    return tally
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _jsd(p: dict, q: dict) -> float:
|
| 120 |
+
"""Jensen-Shannon divergence, base 2 (0 = identical, 1 = disjoint)."""
|
| 121 |
+
sp, sq = sum(p.values()), sum(q.values())
|
| 122 |
+
out = 0.0
|
| 123 |
+
for k in set(p) | set(q):
|
| 124 |
+
a, b = p.get(k, 0) / sp, q.get(k, 0) / sq
|
| 125 |
+
m = (a + b) / 2
|
| 126 |
+
if a:
|
| 127 |
+
out += 0.5 * a * math.log2(a / m)
|
| 128 |
+
if b:
|
| 129 |
+
out += 0.5 * b * math.log2(b / m)
|
| 130 |
+
return out
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def _tau_b(xs, ys) -> float:
|
| 134 |
+
"""Kendall tau-b (tie-corrected); n is small, O(n^2) is fine."""
|
| 135 |
+
n0 = nc = nd = tx = ty = 0
|
| 136 |
+
for i in range(len(xs)):
|
| 137 |
+
for j in range(i + 1, len(xs)):
|
| 138 |
+
n0 += 1
|
| 139 |
+
a, b = xs[i] - xs[j], ys[i] - ys[j]
|
| 140 |
+
tx += a == 0
|
| 141 |
+
ty += b == 0
|
| 142 |
+
nc += a * b > 0
|
| 143 |
+
nd += a * b < 0
|
| 144 |
+
den = ((n0 - tx) * (n0 - ty)) ** 0.5
|
| 145 |
+
return (nc - nd) / den if den else 0.0
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def _dist(counter) -> dict:
    """Turn class counts into shares over `CATS`, rounded to 4 decimals.

    Classes absent from `counter` get 0. An empty (zero-total) counter yields {}.
    """
    total = sum(counter.values())
    if not total:
        return {}
    return {cat: round(counter.get(cat, 0) / total, 4) for cat in CATS}
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _suite_slices(cleaner) -> tuple[float, float]:
    """Score a degenerate cleaner on the real and the injected slice of the suite.

    `cleaner(dirty, clean)` returns the cleaned table. The real slice is the
    mean F1 over the "real" specs of the suite built with the first seed. The
    injected slice is the mean, over SEEDS, of each seed's mean F1 across its
    "injected" specs (specs whose loader returns None are skipped).

    Returns:
        (real-slice mean F1, injected-slice mean F1).
    """

    def _f1(dirty, clean) -> float:
        return score(dirty, clean, cleaner(dirty, clean))["f1"]

    real_scores = []
    for spec in build_suite(seed=SEEDS[0]):
        if spec["source"] == "real":
            dirty, clean = spec["load"]()
            real_scores.append(_f1(dirty, clean))

    seed_means = []
    for seed in SEEDS:
        seed_scores = []
        for spec in build_suite(seed=seed):
            if spec["source"] != "injected":
                continue
            loaded = spec["load"]()
            if loaded is not None:
                dirty, clean = loaded
                seed_scores.append(_f1(dirty, clean))
        seed_means.append(sum(seed_scores) / len(seed_scores))

    real_mean = sum(real_scores) / len(real_scores)
    injected_mean = sum(seed_means) / len(seed_means)
    return real_mean, injected_mean
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def _write_tex(out: dict, res: Path) -> None:
    """Render the appendix LaTeX snippet from a results dict and write it to disk.

    Args:
        out: the results dict assembled by `main` (or reloaded from
            inject_validity.json); the "real", "injected", "jsd", "ranking"
            and "seeds" entries are read.
        res: directory that receives `inject_validity_appendix.tex`.

    The file is written as UTF-8 regardless of the platform default, because
    the header comment contains a non-ASCII dash.
    """
    rd, jd = out["real"]["pooled_dist"], out["injected"]["pooled_dist"]
    j, rk = out["jsd"], out["ranking"]
    # Report the number of sources that were actually classified (sources whose
    # load failed are skipped upstream); 42 is the fallback for result files
    # without the field.
    n_sources = out["real"].get("n_sources", 42)
    lines = [
        r"% Auto-generated by eval/inject_validity.py — do not edit by hand.",
        r"\subsection{Validity of the Injected Slice}\label{app:inject-validity}",
        r"Following the TableEG-style audit, we classify every error cell (dirty vs.\ gold)",
        r"with a deterministic taxonomy and compare the suite's injected errors (money-table",
        r"seeds " + "/".join(map(str, out["seeds"])) + r", $n=" +
        f"{out['injected']['n']:,}".replace(",", r"{,}") + r"$) against the $" +
        f"{out['real']['n']:,}".replace(",", r"{,}") +
        r"$ real errors across the " + f"{n_sources}" +
        r" paired sources (hospital's " +
        f"{out['real']['hospital_n']}" + r" included).",
        r"\begin{table}[t]\centering\small",
        r"\caption{Error-type distributions, real vs.\ injected (pooled).}",
        r"\label{tab:inject-validity}",
        r"\begin{tabular}{lrr}\toprule",
        r"error type & real & injected \\ \midrule",
    ]
    for c in CATS:
        lines.append(f"{c} & {rd.get(c, 0):.3f} & {jd.get(c, 0):.3f} " + r"\\")
    # .get(..., 0) mirrors the table rows above, so an empty distribution
    # renders as zeros instead of raising KeyError.
    lines += [
        r"\bottomrule\end{tabular}\end{table}",
        r"The injector covers only the recoverable surface classes it targets by design",
        r"(typo/case/whitespace; injector--taxonomy agreement " +
        f"{out['injected']['injector_taxonomy_agreement']:.3f}" + r"), whereas real errors",
        r"are dominated by substitutions beyond edit distance~2 (other, " +
        f"{rd.get('other', 0):.3f}" + r") and short typos (" +
        f"{rd.get('typo', 0):.3f}" +
        r"), with numeric (" + f"{rd.get('numeric', 0):.3f}" + r"), missing-value (" +
        f"{rd.get('missing', 0):.3f}" + r"), and encoding classes the injector never produces.",
        r"Pooled Jensen--Shannon divergence is " + f"{j['pooled']:.3f}" +
        r"~bits (per-source median " + f"{j['median']:.3f}" + r", range " +
        f"{j['min']:.3f}" + r"--" + f"{j['max']:.3f}" + r"; hospital " +
        f"{j['hospital_vs_injected']:.3f}" + r"): the two slices are \emph{not}",
        r"interchangeable, which is why the paper reports them separately and localizes",
        r"the grounding claim in the real slice. Ranking preservation is partial: Kendall",
        r"$\tau_b$ between system rankings on the injected vs.\ real F1 slices is $" +
        f"{rk['kendall_tau_b_money_table']:.2f}" + r"$ over the four cross-system rows and $" +
        f"{rk['kendall_tau_b_with_anchors']:.2f}" + r"$ with the degenerate anchors",
        r"(abstain-all, random-edit, oracle) included. The injected slice preserves the",
        r"floor/ceiling ordering but ranks OpenRefine fingerprint above both our system",
        r"and OpenRefine kNN, the reverse of the real slice --- frequency clustering looks",
        r"strong exactly where the canonical form is present and dominant by construction.",
        r"Injected-only evaluation would therefore overstate frequency-clustering",
        r"baselines.",
    ]
    (res / "inject_validity_appendix.tex").write_text(
        "\n".join(lines) + "\n", encoding="utf-8")
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def main() -> None:
    """Run the full inject-validity audit and write its JSON + LaTeX outputs.

    Steps: (1) classify the real errors of every paired source, (2) classify
    the injected errors at each seed in SEEDS, (3) compute Jensen-Shannon
    divergences between the real and injected class distributions, (4) compute
    Kendall tau-b between system rankings on the real vs. injected F1 slices,
    with the degenerate policies added as anchors.

    Reads eval/results/money_table_head.json; writes
    eval/results/inject_validity.json and, via `_write_tex`,
    eval/results/inject_validity_appendix.tex. Progress is printed to stdout.
    """
    t0 = time.perf_counter()
    # (1) real errors: all 42 paired sources (hospital included -> its 509)
    real_per: dict[str, collections.Counter] = {}
    for p in pairs():
        try:
            dirty, clean = _load(p)
        except Exception as e:  # noqa: BLE001
            # Best effort: a source that fails to load is reported and skipped.
            print(f" {p.name}: LOAD FAILED {type(e).__name__}")
            continue
        real_per[p.name] = _classify_pair(dirty, clean)
        print(f" real {p.name:<46} n={sum(real_per[p.name].values())}", flush=True)
    real_pool = sum(real_per.values(), collections.Counter())
    t_real = time.perf_counter() - t0

    # (2) injected errors at the money-table seeds, via the SAME suite generator
    inj_pool = collections.Counter()
    inj_per_injector: dict[str, collections.Counter] = collections.defaultdict(collections.Counter)
    inj_per_seed = {}
    for s in SEEDS:
        cs = collections.Counter()
        for spec in build_suite(seed=s):
            if spec["source"] != "injected":
                continue
            loaded = spec["load"]()
            if loaded is None:
                continue
            dirty, clean = loaded
            c = _classify_pair(dirty, clean)
            cs += c
            # The injector name is the second ':'-separated field of the spec name.
            inj_per_injector[spec["name"].split(":")[1]] += c
        inj_per_seed[s] = sum(cs.values())
        inj_pool += cs
        print(f" injected seed={s} n={inj_per_seed[s]}", flush=True)
    # Injected cells whose taxonomy class matches the class their injector targets.
    agree = sum(inj_per_injector[et][want] for et, want in EXPECT.items())
    t_inj = time.perf_counter() - t0 - t_real

    # (3) distribution similarity
    jsd_per_source = {k: round(_jsd(real_per[k], inj_pool), 4)
                      for k in sorted(real_per) if real_per[k]}
    jsd_vals = sorted(jsd_per_source.values())
    # (4) ranking preservation: money-table systems + degenerate anchors
    # Context manager so the handle is closed deterministically.
    with open(ROOT / "eval" / "results" / "money_table_head.json",
              encoding="utf-8") as fh:
        money = json.load(fh)
    systems = [{"system": r["system"], "real_f1": r["real_f1"], "inj_f1": r["inj_f1"],
                "anchor": False} for r in money]
    for name, fn in [("abstain-all", _abstain_all), ("random-edit", _random_edit),
                     ("oracle", _oracle)]:
        rf, jf = _suite_slices(fn)
        systems.append({"system": name, "real_f1": rf, "inj_f1": jf, "anchor": True})
        print(f" anchor {name:<12} real={rf:.3f} inj={jf:.3f}", flush=True)
    tau_money = _tau_b([s["real_f1"] for s in systems if not s["anchor"]],
                       [s["inj_f1"] for s in systems if not s["anchor"]])
    tau_all = _tau_b([s["real_f1"] for s in systems], [s["inj_f1"] for s in systems])

    out = {
        "taxonomy": CATS, "seeds": list(SEEDS),
        "real": {"n": sum(real_pool.values()), "n_sources": len(real_per),
                 "hospital_n": sum(real_per.get("hospital", {}).values()),
                 "pooled_counts": dict(real_pool), "pooled_dist": _dist(real_pool),
                 "per_source": {k: {"n": sum(v.values()), "dist": _dist(v)}
                                for k, v in sorted(real_per.items())}},
        "injected": {"n": sum(inj_pool.values()), "per_seed_n": inj_per_seed,
                     "pooled_counts": dict(inj_pool), "pooled_dist": _dist(inj_pool),
                     "per_injector_dist": {k: _dist(v)
                                           for k, v in sorted(inj_per_injector.items())},
                     "injector_taxonomy_agreement": round(agree / sum(inj_pool.values()), 4)},
        "jsd": {"pooled": round(_jsd(real_pool, inj_pool), 4),
                "hospital_vs_injected": round(_jsd(real_per["hospital"], inj_pool), 4),
                "per_real_source_vs_injected": jsd_per_source,
                # NOTE: for an even number of sources this is the upper middle value.
                "min": jsd_vals[0], "median": jsd_vals[len(jsd_vals) // 2],
                "max": jsd_vals[-1]},
        "ranking": {"systems": systems,
                    "kendall_tau_b_money_table": round(tau_money, 4),
                    "kendall_tau_b_with_anchors": round(tau_all, 4)},
        "sec": {"real_classify": round(t_real, 1), "injected_classify": round(t_inj, 1),
                "total": round(time.perf_counter() - t0, 1)},
    }
    res = ROOT / "eval" / "results"
    # Close (and thereby flush) the results file before anything else reads it.
    with open(res / "inject_validity.json", "w", encoding="utf-8") as fh:
        json.dump(out, fh, indent=1)
    _write_tex(out, res)
    print(f"JSD pooled={out['jsd']['pooled']} tau(money)={tau_money:.3f} "
          f"tau(+anchors)={tau_all:.3f} -> {res / 'inject_validity.json'} "
          f"+ inject_validity_appendix.tex ({out['sec']['total']}s)")
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
if __name__ == "__main__":
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("--tex-only", action="store_true",
                    help="rebuild the LaTeX snippet from the existing JSON")
    if ap.parse_args().tex_only:
        # Re-render the appendix from the saved results without re-running the audit.
        res = ROOT / "eval" / "results"
        # Context manager so the JSON handle is closed instead of leaked.
        with open(res / "inject_validity.json", encoding="utf-8") as fh:
            saved = json.load(fh)
        _write_tex(saved, res)
        print(f"-> {res / 'inject_validity_appendix.tex'}")
    else:
        main()
|