File size: 3,008 Bytes
16dc556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""ScrubData β€” hands-off data cleaning (Gradio app).

Runnable MOCK demo on gr.Blocks: upload β†’ profile β†’ plan β†’ clean β†’ diff +
report β†’ download. The planner is a heuristic stand-in for the fine-tuned ≀4B
model; the rest of the pipeline is real. Final version will port this flow to
gr.Server + a custom HTML frontend for the Off-Brand bonus quest.
"""

from __future__ import annotations

import logging
import tempfile
from pathlib import Path

import gradio as gr
import pandas as pd

from scrubdata import apply_plan, mock_plan, profile_dataframe, render_report
from scrubdata.active import get_planner
from scrubdata.trace import log_run

# Resolved once at import time: the fine-tuned model if SCRUBDATA_MODEL is set,
# else the heuristic planner.
PLANNER = get_planner()

# Sample spreadsheet shipped next to this module, under samples/.
SAMPLE = Path(__file__).parent.joinpath("samples", "dirty_contacts.csv")


def _read_any(path: str) -> pd.DataFrame:
    """Read a CSV or Excel file with every cell kept as a raw string.

    Cleaning decides the real types later, so nothing is coerced here: values
    are loaded as ``str`` and pandas' default NA detection is switched off for
    both formats, so blank cells and markers such as ``"NA"`` are not turned
    into NaN.

    Args:
        path: Location of the file. A ``.xlsx``/``.xls`` suffix
            (case-insensitive) selects the Excel reader; anything else is
            parsed as CSV.

    Returns:
        The file contents as a DataFrame of strings.
    """
    p = Path(path)
    # Same options for both readers so Excel and CSV inputs behave alike;
    # previously only the CSV branch disabled NA detection.
    raw_opts = {"dtype": str, "keep_default_na": False}
    if p.suffix.lower() in {".xlsx", ".xls"}:
        return pd.read_excel(p, **raw_opts)
    return pd.read_csv(p, **raw_opts)


def clean(file_path: str):
    """Run the whole cleaning pipeline on one uploaded file.

    Steps: read the file, profile it, ask ``PLANNER`` for a plan, apply the
    plan, profile the result, render the report, and write the cleaned data
    to a CSV for download.

    Args:
        file_path: Path of the uploaded CSV/Excel file; falsy when nothing has
            been uploaded yet.

    Returns:
        A 4-tuple ``(raw, cleaned, report, csv_path)`` matching the click
        handler's outputs. When ``file_path`` is falsy, the tuple is two
        ``gr.update()`` values, a prompt message and ``None``.
    """
    if not file_path:
        return (gr.update(), gr.update(), "Upload a CSV or Excel file to begin.", None)

    raw = _read_any(file_path)
    before = profile_dataframe(raw)
    plan = PLANNER(raw)
    cleaned, log = apply_plan(raw, plan)
    after = profile_dataframe(cleaned)
    report = render_report(plan, log, before, after)

    # Write into a fresh per-run directory: a single shared
    # "<tmp>/scrubbed.csv" would be overwritten by concurrent runs, handing
    # one user another user's data. The file name itself stays "scrubbed.csv".
    out_dir = Path(tempfile.mkdtemp(prefix="scrubdata_"))
    out = out_dir / "scrubbed.csv"
    cleaned.to_csv(out, index=False)

    try:  # best-effort agent-trace capture (Open trace bonus quest)
        log_run(before, raw, plan, log, model=plan.get("_generated_by", "mock_planner"))
    except Exception:
        # Trace capture must never break cleaning, but record the failure
        # instead of hiding it.
        logging.getLogger(__name__).warning(
            "ScrubData trace capture failed", exc_info=True
        )

    return raw, cleaned, report, str(out)


def load_sample():
    """Return the sample spreadsheet's location as a plain string."""
    sample_path = str(SAMPLE)
    return sample_path


with gr.Blocks(title="ScrubData") as demo:
    gr.Markdown(
        "# 🧽 ScrubData\n"
        "**Upload your dirty spreadsheet. Get clean data back. No config.**\n\n"
        "_Mock demo β€” heuristic planner standing in for the fine-tuned model._"
    )

    with gr.Row():
        file_in = gr.File(label="Upload CSV / Excel", file_types=[".csv", ".xlsx", ".xls"],
                          type="filepath")
        with gr.Column():
            run_btn = gr.Button("🧽 Clean it", variant="primary")
            sample_btn = gr.Button("Use the messy sample")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Before")
            before_df = gr.Dataframe(label="Original", interactive=False, wrap=True)
        with gr.Column():
            gr.Markdown("### After")
            after_df = gr.Dataframe(label="Cleaned", interactive=False, wrap=True)

    report_md = gr.Markdown()
    download = gr.File(label="Download cleaned file")

    run_btn.click(clean, inputs=file_in, outputs=[before_df, after_df, report_md, download])
    sample_btn.click(load_sample, outputs=file_in)


if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft())