# DataSense_E2B / app.py
# sanjaymalladi's picture
# Rollback app.py to 7d1f6af (last stable before chart experiment)
# d1fcb83 verified
# Raw
# History Blame Contribute Delete
# 15 kB
"""
DataSense E2B — Hugging Face Space demo
Execution-grounded data agent (SFT v1) with bundled or uploaded CSVs/Excel.
"""
from __future__ import annotations
import html
import re
import spaces # must be first — before any torch/CUDA import
from pathlib import Path
import gradio as gr
import pandas as pd
from agent import run_agent
from config import ADAPTER_MODEL, AGENT_MAX_STEPS, DATA_DIR
from examples import DEMO_DATASETS, DEMO_EXAMPLES
# Shared model/tokenizer pair; both stay None until _load_model() fills them in.
MODEL, TOKENIZER = None, None
# External links rendered in the page header by build_ui().
STORY_URL = "https://datasense-e2b.netlify.app/"
DEMO_VIDEO_URL = "https://youtu.be/ucFoCdMK7sE"
LINKEDIN_POST_URL = (
    "https://www.linkedin.com/posts/sanjaymalladi_buildsmall-huggingface-modal-share-7471993638814654464-47hY/"
)
# Stylesheet handed to `launch(css=...)` at the bottom of this file.
# It defines the --ds-* colour variables on .gradio-container and styles the
# elements addressed by the elem_id values used in build_ui() (#ds-header,
# #ds-panel, #ds-results, #ds-preview-box, #run-btn) as well as the .ds-* classes
# emitted by the HTML helper functions (_progress_html, _format_answer_html,
# _wrap_trace).
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500&family=Newsreader:ital,opsz,wght@0,6..72,400;0,6..72,600;1,6..72,400&display=swap');
.gradio-container {
--ds-bg: #0c1117;
--ds-surface: #151c26;
--ds-border: #2a3544;
--ds-accent: #3ecfae;
--ds-accent-dim: #1f6f5c;
--ds-text: #e8eef5;
--ds-muted: #8b9cb3;
max-width: 1280px !important;
font-family: 'Newsreader', Georgia, serif !important;
}
#ds-header {
background: linear-gradient(135deg, #0f1a24 0%, #122a2a 55%, #0c1117 100%);
border: 1px solid var(--ds-border);
border-radius: 16px;
padding: 1.5rem 1.75rem;
margin-bottom: 1rem;
}
#ds-header h1 {
font-family: 'Newsreader', Georgia, serif;
font-size: 2rem;
font-weight: 600;
margin: 0 0 0.35rem 0;
color: var(--ds-text);
}
#ds-badge {
display: inline-block;
font-family: 'IBM Plex Mono', monospace;
font-size: 0.72rem;
letter-spacing: 0.08em;
text-transform: uppercase;
color: var(--ds-accent);
border: 1px solid var(--ds-accent-dim);
border-radius: 999px;
padding: 0.2rem 0.65rem;
margin-bottom: 0.75rem;
}
#ds-panel, #ds-results {
background: var(--ds-surface);
border: 1px solid var(--ds-border);
border-radius: 14px;
padding: 1.1rem;
min-height: 520px;
}
#ds-preview-box {
margin-top: 0.5rem;
border: 1px solid var(--ds-border);
border-radius: 10px;
overflow: hidden;
}
#ds-preview-box .label-wrap { padding: 0.5rem 0.75rem !important; }
#run-btn {
background: linear-gradient(90deg, #1f6f5c, #3ecfae) !important;
border: none !important;
font-weight: 600 !important;
letter-spacing: 0.02em;
}
#ds-progress-label {
font-family: 'IBM Plex Mono', monospace;
font-size: 0.82rem;
color: var(--ds-accent);
margin: 0 0 0.35rem 0;
}
.ds-progress-wrap {
margin: 0 0 1rem 0;
padding: 0.85rem 1rem;
background: #0f1419;
border: 1px solid #2a3544;
border-radius: 10px;
}
.ds-progress-text {
font-family: 'IBM Plex Mono', monospace;
font-size: 0.82rem;
color: #3ecfae;
margin-bottom: 0.55rem;
line-height: 1.4;
}
.ds-progress-text .ds-pct {
color: #8b9cb3;
font-size: 0.75rem;
}
.ds-progress-track {
height: 6px;
background: #1a2330;
border-radius: 999px;
overflow: hidden;
}
.ds-progress-fill {
height: 100%;
background: linear-gradient(90deg, #1f6f5c, #3ecfae);
border-radius: 999px;
transition: width 0.35s ease;
}
.ds-progress-wrap.ds-idle .ds-progress-track { display: none; }
#ds-results .tabitem { padding-top: 0.75rem !important; }
.ds-answer-card {
background: linear-gradient(145deg, #122a2a 0%, #151c26 100%);
border: 1px solid #2a3544;
border-left: 4px solid #3ecfae;
border-radius: 12px;
padding: 1.5rem 1.75rem;
margin: 0;
min-height: 200px;
}
.ds-answer-label {
font-family: 'IBM Plex Mono', monospace;
font-size: 0.72rem;
letter-spacing: 0.1em;
text-transform: uppercase;
color: #3ecfae;
margin-bottom: 0.75rem;
}
.ds-answer-value {
font-family: 'Newsreader', Georgia, serif;
font-size: 1.35rem;
font-weight: 400;
color: #c5d0de;
line-height: 1.55;
word-break: break-word;
}
.ds-answer-value strong {
color: #ffffff;
font-weight: 700;
}
.ds-summary {
margin-top: 1.25rem;
padding-top: 1rem;
border-top: 1px solid #2a3544;
font-size: 1.05rem;
color: #8b9cb3;
line-height: 1.55;
}
.ds-answer-empty {
color: #8b9cb3;
font-style: italic;
padding: 1rem 0;
}
.ds-trace-wrap { margin: 0; padding: 0; }
.ds-trace-wrap h3 { margin-top: 1rem; color: #3ecfae; font-size: 1rem; }
footer { visibility: hidden; }
"""
def build_theme() -> gr.Theme:
    """Build the dark DataSense theme.

    Starts from ``gr.themes.Base`` with emerald/slate/gray hues and the
    Newsreader / IBM Plex Mono Google fonts, then pins the body, block, text
    and primary-button colours to the same values for light and dark mode.

    Returns:
        The configured theme object returned by ``Base.set``.
    """
    palette = gr.themes.colors
    base_theme = gr.themes.Base(
        primary_hue=palette.emerald,
        secondary_hue=palette.slate,
        neutral_hue=palette.gray,
        font=gr.themes.GoogleFont("Newsreader"),
        font_mono=gr.themes.GoogleFont("IBM Plex Mono"),
    )
    # Every *_dark key mirrors its light counterpart so the look is identical
    # regardless of the visitor's colour-scheme preference.
    overrides = {
        "body_background_fill": "#0c1117",
        "body_background_fill_dark": "#0c1117",
        "block_background_fill": "#151c26",
        "block_background_fill_dark": "#151c26",
        "block_border_width": "1px",
        "block_border_color": "#2a3544",
        "block_border_color_dark": "#2a3544",
        "body_text_color": "#e8eef5",
        "body_text_color_dark": "#e8eef5",
        "button_primary_background_fill": "linear-gradient(90deg, #1f6f5c, #3ecfae)",
        "button_primary_background_fill_hover": "linear-gradient(90deg, #2a8a72, #4de0c0)",
    }
    return base_theme.set(**overrides)
def _load_model():
    """Return the shared ``(model, tokenizer)`` pair, loading it on first use.

    The pair is cached in the module globals ``MODEL`` and ``TOKENIZER``; the
    loader is only invoked while either of them is still ``None``.

    Returns:
        Tuple ``(MODEL, TOKENIZER)``.
    """
    global MODEL, TOKENIZER
    already_loaded = MODEL is not None and TOKENIZER is not None
    if not already_loaded:
        # Imported here rather than at module top — presumably to keep the
        # heavy model dependencies out of plain module import (TODO confirm).
        from model_loader import load_model_and_tokenizer

        loaded_pair = load_model_and_tokenizer()
        MODEL, TOKENIZER = loaded_pair
    return MODEL, TOKENIZER
def _resolve_data_path(data_mode: str, dataset_name: str, upload_file) -> Path | None:
    """Return the dataset file to analyse, or ``None`` when none is available.

    Args:
        data_mode: Value of the "Data source" radio. ``"Upload your file"``
            selects the uploaded file; any other value selects a bundled
            dataset.
        dataset_name: Key looked up in ``DEMO_DATASETS`` for bundled data.
        upload_file: Uploaded file value: ``None``, a path string, or a
            list/tuple of path strings (only the first entry is used).

    Returns:
        For uploads, a ``Path`` to an existing regular file, else ``None``.
        For bundled data, whatever ``DEMO_DATASETS.get(dataset_name)`` yields
        (presumably a ``Path``; ``None`` for an unknown name).
    """
    if data_mode != "Upload your file":
        return DEMO_DATASETS.get(dataset_name)

    candidate = upload_file
    if isinstance(candidate, (list, tuple)):
        # An empty selection must count as "nothing uploaded" instead of
        # raising IndexError on candidate[0].
        candidate = candidate[0] if candidate else None
    if not candidate:
        return None

    path = Path(candidate)
    # Guard against stale or removed temp files and against directories.
    return path if path.is_file() else None
def _load_preview(data_mode: str, dataset_name: str, upload_file):
    """Load up to 100 rows of the selected dataset for the preview table.

    Args:
        data_mode: Data-source radio value, forwarded to ``_resolve_data_path``.
        dataset_name: Bundled dataset name, forwarded to ``_resolve_data_path``.
        upload_file: Uploaded file value, forwarded to ``_resolve_data_path``.

    Returns:
        ``(dataframe, markdown)``. On success the markdown names the file and
        reports the row/column counts of the loaded preview; when no file is
        resolved or reading fails the dataframe is empty and the markdown
        carries an italic hint or error message.
    """
    source = _resolve_data_path(data_mode, dataset_name, upload_file)
    if source is None:
        return pd.DataFrame(), "_Select or upload a file to preview._"

    try:
        # Excel extensions go through read_excel; everything else is read as CSV.
        is_excel = source.suffix.lower() in (".xlsx", ".xls")
        reader = pd.read_excel if is_excel else pd.read_csv
        frame = reader(source, nrows=100)
        caption = f"**{source.name}** · {len(frame)} rows · {len(frame.columns)} columns"
    except Exception as exc:
        # Preview is best-effort: surface the problem in the UI instead of raising.
        return pd.DataFrame(), f"_Could not preview file: {exc}_"
    return frame, caption
def _inline_markdown_to_html(text: str) -> str:
    """Render ``**bold**`` spans as ``<strong>`` and HTML-escape all other text.

    Any ``**`` markers left over after pairing (unbalanced ones) are removed
    from the result. Falsy input yields an empty string.
    """
    if not text:
        return ""
    # re.split with one capture group alternates plain text (even indices)
    # and captured bold content (odd indices).
    pieces = re.split(r"\*\*(.+?)\*\*", text)
    rendered = "".join(
        f"<strong>{html.escape(piece)}</strong>" if index % 2 == 1 else html.escape(piece)
        for index, piece in enumerate(pieces)
    )
    return rendered.replace("**", "")
def _format_answer_html(answer: str, summary: str = "") -> str:
    """Build the HTML card shown in the Answer tab.

    Args:
        answer: Final answer text; ``**bold**`` is rendered, the rest escaped.
        summary: Optional extra text rendered in a ``ds-summary`` paragraph
            under the answer.

    Returns:
        A ``ds-answer-card`` div, or a ``ds-answer-empty`` placeholder when
        ``answer`` is falsy.
    """
    if not answer:
        return '<div class="ds-answer-empty">Could not parse an answer — check the execution trace tab.</div>'

    sections = [
        '<div class="ds-answer-card">',
        '<div class="ds-answer-label">Verified answer</div>',
        f'<div class="ds-answer-value">{_inline_markdown_to_html(answer)}</div>',
    ]
    if summary:
        sections.append(f'<p class="ds-summary">{_inline_markdown_to_html(summary)}</p>')
    sections.append("</div>")
    return "".join(sections)
def _toggle_data_inputs(data_mode: str):
    """Return visibility updates for the dataset dropdown and the file upload.

    Exactly one of the two is visible: the upload when ``data_mode`` is
    ``"Upload your file"``, otherwise the dropdown.

    Returns:
        ``(dropdown_update, upload_update)`` in that order.
    """
    show_upload = data_mode == "Upload your file"
    dropdown_update = gr.update(visible=not show_upload)
    upload_update = gr.update(visible=show_upload)
    return dropdown_update, upload_update
def _wrap_trace(trace: str) -> str:
    """Wrap trace markdown in a ``ds-trace-wrap`` div; falsy input gives ``""``.

    The trace is separated from the div tags by blank lines on both sides.
    """
    if trace:
        return '<div class="ds-trace-wrap">\n\n{}\n\n</div>'.format(trace)
    return ""
def _progress_html(pct: int | None, label: str) -> str:
    """Render the progress panel as an HTML string.

    Args:
        pct: Completion percentage, clamped to 0..100. ``None`` renders the
            idle variant (label only, ``ds-idle`` class, no bar).
        label: Status text; HTML-escaped before insertion.

    Returns:
        A ``ds-progress-wrap`` div.
    """
    text = html.escape(label)
    if pct is None:
        idle_parts = (
            '<div class="ds-progress-wrap ds-idle">',
            f'<div class="ds-progress-text">{text}</div>',
            "</div>",
        )
        return "".join(idle_parts)

    pct = max(0, min(100, pct))
    bar_parts = [
        '<div class="ds-progress-wrap">',
        f'<div class="ds-progress-text">{text} <span class="ds-pct">{pct}%</span></div>',
        f'<div class="ds-progress-track"><div class="ds-progress-fill" style="width:{pct}%"></div></div>',
        "</div>",
    ]
    return "".join(bar_parts)
def _progress_update(pct: int | None, label: str, trace: str = "", answer: str = ""):
    """Bundle one UI update for the three outputs of ``run_task``.

    Returns:
        ``(progress_html, wrapped_trace, answer)`` — the progress panel built
        from ``pct``/``label``, the trace wrapped by ``_wrap_trace``, and the
        answer HTML passed through unchanged.
    """
    progress_panel = _progress_html(pct, label)
    trace_panel = _wrap_trace(trace)
    return progress_panel, trace_panel, answer
# Initial content of the progress panel (idle variant: label only, no bar).
IDLE_PROGRESS_HTML = _progress_html(None, "Ready — click Run DataSense")
@spaces.GPU(duration=180)
def run_task(
    data_mode: str,
    dataset_name: str,
    upload_file,
    task: str,
    max_steps: int,
    progress=gr.Progress(),
):
    """Run the agent on the selected dataset and stream UI updates.

    This is a generator: every yielded value is the
    ``(progress_html, trace_html, answer_html)`` triple produced by
    ``_progress_update``.

    Args:
        data_mode: Data-source radio value.
        dataset_name: Bundled dataset name (used when not uploading).
        upload_file: Uploaded file value (used in upload mode).
        task: The user's question; surrounding whitespace is stripped.
        max_steps: Step budget forwarded to ``run_agent`` as an ``int``.
        progress: Gradio progress tracker, also forwarded to ``run_agent``.

    Yields:
        Progress updates while the agent runs, then a final update with the
        answer card. Validation problems and exceptions are reported as an
        update carrying a ``ds-answer-empty`` message rather than raised.
    """
    question = task.strip()
    if not question:
        yield _progress_update(None, "Enter a question to run DataSense", "", '<div class="ds-answer-empty">Enter a task question.</div>')
        return

    yield _progress_update(5, "Reading your dataset…")
    data_path = _resolve_data_path(data_mode, dataset_name, upload_file)
    if data_path is None:
        if data_mode == "Upload your file":
            msg = "⚠️ Upload a `.csv` or `.xlsx` file first."
        else:
            msg = f"⚠️ Dataset not found: {dataset_name}"
        yield _progress_update(0, "Dataset missing", "", f'<div class="ds-answer-empty">{html.escape(msg)}</div>')
        return

    try:
        yield _progress_update(12, "Loading DataSense on GPU…")
        progress(0.15, desc="Loading DataSense…")
        model, tokenizer = _load_model()
        yield _progress_update(22, "DataSense is starting…")

        events = run_agent(
            model,
            tokenizer,
            data_path,
            question,
            max_steps=int(max_steps),
            progress=progress,
            stream=True,
        )
        for event in events:
            kind = event[0]
            if kind == "progress":
                _kind, step, total, trace_md = event
                # Agent steps are mapped onto the 22%..95% span of the bar;
                # max(total, 1) avoids division by zero.
                pct = int(22 + (73 * step / max(total, 1)))
                status = f"DataSense · step {step}/{total}"
                yield _progress_update(pct, status, trace_md, "")
            elif kind == "final":
                result = event[1]
                answer_html = _format_answer_html(result.get("answer", ""), result.get("summary", ""))
                yield _progress_update(100, "DataSense finished", result["steps_markdown"], answer_html)
    except Exception as exc:
        # UI boundary: show the failure in the answer panel instead of crashing the event.
        yield _progress_update(0, "Error", "", f'<div class="ds-answer-empty">Error: {html.escape(str(exc))}</div>')
@spaces.GPU(duration=300)
def preload_model() -> None:
    """Warm the model cache by calling ``_load_model()`` and discarding the result.

    Wrapped in ``spaces.GPU`` with ``duration=300`` — presumably so the load
    runs with a GPU attached for up to 300 s (TODO confirm against the
    ``spaces`` package documentation).
    """
    _load_model()
def build_ui() -> gr.Blocks:
    """Assemble the DataSense Gradio interface and wire its events.

    Layout: a header column, then a row with a "Configure" column (data
    source, preview, question, step slider, run button, examples) and a
    "Results" column (progress panel plus trace/answer tabs).

    Returns:
        The ``gr.Blocks`` app; it is not launched here.
    """
    # NOTE(review): the nesting of the layout contexts below is inferred from
    # the statement order — confirm against the deployed layout.
    dataset_choices = list(DEMO_DATASETS.keys())
    # Preview of the first bundled dataset, used as the initial component values.
    default_df, default_meta = _load_preview("Bundled examples", dataset_choices[0], None)
    with gr.Blocks(title="DataSense E2B") as demo:
        # --- Header: badge, title blurb and external links -------------------
        with gr.Column(elem_id="ds-header"):
            gr.HTML('<div id="ds-badge">Execution-verified · Gemma / DataBench</div>')
            gr.Markdown(
                """
# DataSense E2B
**Live inference** — Gemma-4 2B + SFT v1 writes Python, runs it on your data, reads real stdout/errors.
"""
            )
            gr.Markdown(
                f"📖 [Full project story]({STORY_URL}) · "
                f"🎬 [Demo video]({DEMO_VIDEO_URL}) · "
                f"[LinkedIn post]({LINKEDIN_POST_URL}) · "
                f"LoRA [`DataSense-Modal-E2B-SFT`](https://huggingface.co/{ADAPTER_MODEL})",
            )
        with gr.Row(equal_height=False):
            # --- Left column: inputs ----------------------------------------
            with gr.Column(scale=5, elem_id="ds-panel"):
                gr.Markdown("### Configure")
                data_mode = gr.Radio(
                    choices=["Bundled examples", "Upload your file"],
                    value="Bundled examples",
                    label="Data source",
                )
                dataset = gr.Dropdown(
                    choices=dataset_choices,
                    value=dataset_choices[0],
                    label="Demo dataset",
                )
                # Hidden until the radio switches to upload mode (see _toggle_data_inputs).
                upload = gr.File(
                    label="Your CSV or Excel file",
                    file_types=[".csv", ".xlsx", ".xls"],
                    type="filepath",
                    visible=False,
                )
                gr.Markdown("### Data preview")
                preview_meta = gr.Markdown(default_meta)
                with gr.Group(elem_id="ds-preview-box"):
                    preview_df = gr.Dataframe(
                        value=default_df,
                        interactive=False,
                        wrap=True,
                        max_height=280,
                    )
                task = gr.Textbox(
                    label="Question / task",
                    placeholder="e.g. Which product had the highest total revenue?",
                    lines=3,
                )
                max_steps = gr.Slider(
                    minimum=3,
                    maximum=12,
                    value=AGENT_MAX_STEPS,
                    step=1,
                    label="Max agent steps",
                )
                run_btn = gr.Button("▶ Run DataSense", variant="primary", elem_id="run-btn")
                # Each example fills the dataset dropdown and the question box.
                gr.Examples(
                    examples=DEMO_EXAMPLES,
                    inputs=[dataset, task],
                    label="Quick examples (bundled data)",
                )
            # --- Right column: outputs --------------------------------------
            with gr.Column(scale=7, elem_id="ds-results"):
                gr.Markdown("### Results")
                progress_out = gr.HTML(value=IDLE_PROGRESS_HTML)
                with gr.Tabs():
                    with gr.Tab("🔍 Execution trace", id="trace_tab"):
                        steps_out = gr.Markdown()
                    with gr.Tab("✅ Answer", id="answer_tab"):
                        answer_out = gr.HTML()
        # --- Event wiring ----------------------------------------------------
        preview_inputs = [data_mode, dataset, upload]
        # Switching the data source first flips component visibility, then refreshes the preview.
        data_mode.change(_toggle_data_inputs, data_mode, [dataset, upload]).then(
            _load_preview, preview_inputs, [preview_df, preview_meta]
        )
        dataset.change(_load_preview, preview_inputs, [preview_df, preview_meta])
        upload.change(_load_preview, preview_inputs, [preview_df, preview_meta])
        # run_task yields (progress, trace, answer) triples matching these three outputs.
        run_btn.click(
            fn=run_task,
            inputs=[data_mode, dataset, upload, task, max_steps],
            outputs=[progress_out, steps_out, answer_out],
            show_progress="hidden",
        )
        # Refresh the preview once more when the page loads.
        demo.load(_load_preview, preview_inputs, [preview_df, preview_meta])
    return demo
# Best-effort warm-up at import time: a failure is only logged, because
# run_task() calls _load_model() itself and will load the model on first use.
try:
    preload_model()
except Exception as exc:
    print(f"Startup preload skipped (will load on first run): {exc}")
# Built at import time so `demo` exists as a module-level name.
demo = build_ui()
if __name__ == "__main__":
    # Theme and stylesheet are passed to launch() here, not to gr.Blocks().
    demo.queue(max_size=8).launch(theme=build_theme(), css=CUSTOM_CSS)