Spaces:
Running
Running
File size: 3,008 Bytes
16dc556 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | """ScrubData — hands-off data cleaning (Gradio app).
Runnable MOCK demo on gr.Blocks: upload → profile → plan → clean → diff +
report → download. The planner is a heuristic stand-in for the fine-tuned ≤4B
model; the rest of the pipeline is real. Final version will port this flow to
gr.Server + a custom HTML frontend for the Off-Brand bonus quest.
"""
from __future__ import annotations
import tempfile
from pathlib import Path
import gradio as gr
import pandas as pd
from scrubdata import apply_plan, mock_plan, profile_dataframe, render_report
from scrubdata.active import get_planner
from scrubdata.trace import log_run
# Planner callable used by clean(): the fine-tuned model if SCRUBDATA_MODEL is
# set, else the heuristic.
PLANNER = get_planner()
# Demo spreadsheet shipped in the "samples" folder next to this module.
SAMPLE = Path(__file__).parent.joinpath("samples", "dirty_contacts.csv")
def _read_any(path: str) -> pd.DataFrame:
    """Read a CSV or Excel file with every cell kept as a raw string.

    Paths whose suffix is ``.xlsx`` or ``.xls`` (case-insensitive) are read
    with ``pd.read_excel``; everything else is read as CSV. Cleaning decides
    the real types later, so no type or missing-value inference is wanted.

    Args:
        path: Location of the file to load.

    Returns:
        A DataFrame loaded with ``dtype=str`` and default NA detection off.
    """
    source = Path(path)
    # Same options for both readers: without keep_default_na=False the Excel
    # branch would turn blank / "NA" / "null" cells into NaN while the CSV
    # branch kept them as text, so the two formats would disagree downstream.
    raw_text_options = {"dtype": str, "keep_default_na": False}
    if source.suffix.lower() in {".xlsx", ".xls"}:
        return pd.read_excel(source, **raw_text_options)
    return pd.read_csv(source, **raw_text_options)
def clean(file_path: str):
    """Run the whole cleaning pipeline on one uploaded file.

    Reads the file as raw strings, profiles it, asks ``PLANNER`` for a plan,
    applies the plan, profiles the result, renders the report and writes the
    cleaned frame to a CSV for download.

    Args:
        file_path: Path of the uploaded file; falsy when nothing was uploaded.

    Returns:
        A 4-tuple ``(raw, cleaned, report, csv_path)``. When ``file_path`` is
        falsy the two frames are no-op ``gr.update()`` values, the report is a
        prompt to upload a file, and the path is ``None``.
    """
    import logging  # local import: only needed to report a failed trace capture

    if not file_path:
        return (gr.update(), gr.update(), "Upload a CSV or Excel file to begin.", None)

    raw = _read_any(file_path)
    before = profile_dataframe(raw)
    plan = PLANNER(raw)
    cleaned, log = apply_plan(raw, plan)
    after = profile_dataframe(cleaned)
    report = render_report(plan, log, before, after)

    # Write into a fresh directory per call: a single shared
    # "<tmp>/scrubbed.csv" would be overwritten by concurrent requests, so one
    # user could end up downloading another user's data. The file name is kept
    # so the download is still called "scrubbed.csv".
    out = Path(tempfile.mkdtemp(prefix="scrubdata_")) / "scrubbed.csv"
    cleaned.to_csv(out, index=False)

    try:  # best-effort agent-trace capture (Open trace bonus quest)
        log_run(before, raw, plan, log, model=plan.get("_generated_by", "mock_planner"))
    except Exception:
        # Tracing must never break the user-facing result, but a failure
        # should be visible in the logs rather than silently dropped.
        logging.getLogger(__name__).warning("trace capture failed", exc_info=True)
    return raw, cleaned, report, str(out)
def load_sample() -> str:
    """Return the path of the bundled sample file as a plain string."""
    sample_path = str(SAMPLE)
    return sample_path
# UI layout and event wiring for the demo.
with gr.Blocks(title="ScrubData") as demo:
    # Header (emoji and dash restored from mis-encoded text).
    gr.Markdown(
        "# 🧽 ScrubData\n"
        "**Upload your dirty spreadsheet. Get clean data back. No config.**\n\n"
        "_Mock demo — heuristic planner standing in for the fine-tuned model._"
    )

    # Input row: file upload on the left, action buttons on the right.
    with gr.Row():
        file_in = gr.File(
            label="Upload CSV / Excel",
            file_types=[".csv", ".xlsx", ".xls"],
            type="filepath",
        )
        with gr.Column():
            run_btn = gr.Button("🧽 Clean it", variant="primary")
            sample_btn = gr.Button("Use the messy sample")

    # Side-by-side before/after tables.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Before")
            before_df = gr.Dataframe(label="Original", interactive=False, wrap=True)
        with gr.Column():
            gr.Markdown("### After")
            after_df = gr.Dataframe(label="Cleaned", interactive=False, wrap=True)

    report_md = gr.Markdown()
    download = gr.File(label="Download cleaned file")

    # clean() returns (raw, cleaned, report, csv_path) in this output order.
    run_btn.click(clean, inputs=file_in, outputs=[before_df, after_df, report_md, download])
    # The sample button only fills the upload slot; the user still clicks "Clean it".
    sample_btn.click(load_sample, outputs=file_in)
# Start the app only when this file is run as a script.
if __name__ == "__main__":
    soft_theme = gr.themes.Soft()
    demo.launch(theme=soft_theme)
|