""" DataSense E2B — Hugging Face Space demo Execution-grounded data agent (SFT v1) with bundled or uploaded CSVs/Excel. """ from __future__ import annotations import html import re import spaces # must be first — before any torch/CUDA import from pathlib import Path import gradio as gr import pandas as pd from agent import run_agent from config import ADAPTER_MODEL, AGENT_MAX_STEPS, DATA_DIR from examples import DEMO_DATASETS, DEMO_EXAMPLES MODEL, TOKENIZER = None, None STORY_URL = "https://datasense-e2b.netlify.app/" DEMO_VIDEO_URL = "https://youtu.be/ucFoCdMK7sE" LINKEDIN_POST_URL = ( "https://www.linkedin.com/posts/sanjaymalladi_buildsmall-huggingface-modal-share-7471993638814654464-47hY/" ) CUSTOM_CSS = """ @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500&family=Newsreader:ital,opsz,wght@0,6..72,400;0,6..72,600;1,6..72,400&display=swap'); .gradio-container { --ds-bg: #0c1117; --ds-surface: #151c26; --ds-border: #2a3544; --ds-accent: #3ecfae; --ds-accent-dim: #1f6f5c; --ds-text: #e8eef5; --ds-muted: #8b9cb3; max-width: 1280px !important; font-family: 'Newsreader', Georgia, serif !important; } #ds-header { background: linear-gradient(135deg, #0f1a24 0%, #122a2a 55%, #0c1117 100%); border: 1px solid var(--ds-border); border-radius: 16px; padding: 1.5rem 1.75rem; margin-bottom: 1rem; } #ds-header h1 { font-family: 'Newsreader', Georgia, serif; font-size: 2rem; font-weight: 600; margin: 0 0 0.35rem 0; color: var(--ds-text); } #ds-badge { display: inline-block; font-family: 'IBM Plex Mono', monospace; font-size: 0.72rem; letter-spacing: 0.08em; text-transform: uppercase; color: var(--ds-accent); border: 1px solid var(--ds-accent-dim); border-radius: 999px; padding: 0.2rem 0.65rem; margin-bottom: 0.75rem; } #ds-panel, #ds-results { background: var(--ds-surface); border: 1px solid var(--ds-border); border-radius: 14px; padding: 1.1rem; min-height: 520px; } #ds-preview-box { margin-top: 0.5rem; border: 1px solid var(--ds-border); border-radius: 10px; overflow: hidden; } #ds-preview-box .label-wrap { padding: 0.5rem 0.75rem !important; } #run-btn { background: linear-gradient(90deg, #1f6f5c, #3ecfae) !important; border: none !important; font-weight: 600 !important; letter-spacing: 0.02em; } #ds-progress-label { font-family: 'IBM Plex Mono', monospace; font-size: 0.82rem; color: var(--ds-accent); margin: 0 0 0.35rem 0; } .ds-progress-wrap { margin: 0 0 1rem 0; padding: 0.85rem 1rem; background: #0f1419; border: 1px solid #2a3544; border-radius: 10px; } .ds-progress-text { font-family: 'IBM Plex Mono', monospace; font-size: 0.82rem; color: #3ecfae; margin-bottom: 0.55rem; line-height: 1.4; } .ds-progress-text .ds-pct { color: #8b9cb3; font-size: 0.75rem; } .ds-progress-track { height: 6px; background: #1a2330; border-radius: 999px; overflow: hidden; } .ds-progress-fill { height: 100%; background: linear-gradient(90deg, #1f6f5c, #3ecfae); border-radius: 999px; transition: width 0.35s ease; } .ds-progress-wrap.ds-idle .ds-progress-track { display: none; } #ds-results .tabitem { padding-top: 0.75rem !important; } .ds-answer-card { background: linear-gradient(145deg, #122a2a 0%, #151c26 100%); border: 1px solid #2a3544; border-left: 4px solid #3ecfae; border-radius: 12px; padding: 1.5rem 1.75rem; margin: 0; min-height: 200px; } .ds-answer-label { font-family: 'IBM Plex Mono', monospace; font-size: 0.72rem; letter-spacing: 0.1em; text-transform: uppercase; color: #3ecfae; margin-bottom: 0.75rem; } .ds-answer-value { font-family: 'Newsreader', Georgia, serif; font-size: 1.35rem; font-weight: 400; color: #c5d0de; line-height: 1.55; word-break: break-word; } .ds-answer-value strong { color: #ffffff; font-weight: 700; } .ds-summary { margin-top: 1.25rem; padding-top: 1rem; border-top: 1px solid #2a3544; font-size: 1.05rem; color: #8b9cb3; line-height: 1.55; } .ds-answer-empty { color: #8b9cb3; font-style: italic; padding: 1rem 0; } .ds-trace-wrap { margin: 0; padding: 0; } .ds-trace-wrap h3 { margin-top: 1rem; color: #3ecfae; font-size: 1rem; } footer { visibility: hidden; } """ def build_theme() -> gr.Theme: return ( gr.themes.Base( primary_hue=gr.themes.colors.emerald, secondary_hue=gr.themes.colors.slate, neutral_hue=gr.themes.colors.gray, font=gr.themes.GoogleFont("Newsreader"), font_mono=gr.themes.GoogleFont("IBM Plex Mono"), ) .set( body_background_fill="#0c1117", body_background_fill_dark="#0c1117", block_background_fill="#151c26", block_background_fill_dark="#151c26", block_border_width="1px", block_border_color="#2a3544", block_border_color_dark="#2a3544", body_text_color="#e8eef5", body_text_color_dark="#e8eef5", button_primary_background_fill="linear-gradient(90deg, #1f6f5c, #3ecfae)", button_primary_background_fill_hover="linear-gradient(90deg, #2a8a72, #4de0c0)", ) ) def _load_model(): global MODEL, TOKENIZER if MODEL is None or TOKENIZER is None: from model_loader import load_model_and_tokenizer MODEL, TOKENIZER = load_model_and_tokenizer() return MODEL, TOKENIZER def _resolve_data_path(data_mode: str, dataset_name: str, upload_file) -> Path | None: if data_mode == "Upload your file": if upload_file is None: return None path_str = upload_file[0] if isinstance(upload_file, list) else upload_file if not path_str: return None path = Path(path_str) return path if path.is_file() else None return DEMO_DATASETS.get(dataset_name) def _load_preview(data_mode: str, dataset_name: str, upload_file): path = _resolve_data_path(data_mode, dataset_name, upload_file) if path is None: return pd.DataFrame(), "_Select or upload a file to preview._" try: suffix = path.suffix.lower() if suffix in (".xlsx", ".xls"): df = pd.read_excel(path, nrows=100) else: df = pd.read_csv(path, nrows=100) meta = f"**{path.name}** · {len(df)} rows · {len(df.columns)} columns" return df, meta except Exception as exc: return pd.DataFrame(), f"_Could not preview file: {exc}_" def _inline_markdown_to_html(text: str) -> str: """Turn **bold** into ; escape everything else.""" if not text: return "" parts = re.split(r"\*\*(.+?)\*\*", text) chunks: list[str] = [] for i, part in enumerate(parts): safe = html.escape(part) chunks.append(f"{safe}" if i % 2 == 1 else safe) return "".join(chunks).replace("**", "") def _format_answer_html(answer: str, summary: str = "") -> str: if not answer: return '
Could not parse an answer — check the execution trace tab.
' answer_html = _inline_markdown_to_html(answer) summary_block = "" if summary: summary_block = f'

{_inline_markdown_to_html(summary)}

' return ( f'
' f'
Verified answer
' f'
{answer_html}
' f"{summary_block}" f"
" ) def _toggle_data_inputs(data_mode: str): is_upload = data_mode == "Upload your file" return ( gr.update(visible=not is_upload), gr.update(visible=is_upload), ) def _wrap_trace(trace: str) -> str: if not trace: return "" return f'
\n\n{trace}\n\n
' def _progress_html(pct: int | None, label: str) -> str: safe_label = html.escape(label) if pct is None: return f'
{safe_label}
' pct = max(0, min(100, pct)) return ( f'
' f'
{safe_label} {pct}%
' f'
' f"
" ) def _progress_update(pct: int | None, label: str, trace: str = "", answer: str = ""): return ( _progress_html(pct, label), _wrap_trace(trace), answer, ) IDLE_PROGRESS_HTML = _progress_html(None, "Ready — click Run DataSense") @spaces.GPU(duration=180) def run_task( data_mode: str, dataset_name: str, upload_file, task: str, max_steps: int, progress=gr.Progress(), ): if not task.strip(): yield _progress_update(None, "Enter a question to run DataSense", "", '
Enter a task question.
') return yield _progress_update(5, "Reading your dataset…") data_path = _resolve_data_path(data_mode, dataset_name, upload_file) if data_path is None: msg = "⚠️ Upload a `.csv` or `.xlsx` file first." if data_mode == "Upload your file" else f"⚠️ Dataset not found: {dataset_name}" yield _progress_update(0, "Dataset missing", "", f'
{html.escape(msg)}
') return try: yield _progress_update(12, "Loading DataSense on GPU…") progress(0.15, desc="Loading DataSense…") model, tokenizer = _load_model() yield _progress_update(22, "DataSense is starting…") agent_stream = run_agent( model, tokenizer, data_path, task.strip(), max_steps=int(max_steps), progress=progress, stream=True, ) for event in agent_stream: if event[0] == "progress": _, step, total, trace_md = event pct = int(22 + (73 * step / max(total, 1))) yield _progress_update( pct, f"DataSense · step {step}/{total}", trace_md, "", ) elif event[0] == "final": result = event[1] answer_html = _format_answer_html(result.get("answer", ""), result.get("summary", "")) yield _progress_update( 100, "DataSense finished", result["steps_markdown"], answer_html, ) except Exception as exc: yield _progress_update(0, "Error", "", f'
Error: {html.escape(str(exc))}
') @spaces.GPU(duration=300) def preload_model(): _load_model() def build_ui() -> gr.Blocks: dataset_choices = list(DEMO_DATASETS.keys()) default_df, default_meta = _load_preview("Bundled examples", dataset_choices[0], None) with gr.Blocks(title="DataSense E2B") as demo: with gr.Column(elem_id="ds-header"): gr.HTML('
Execution-verified · Gemma / DataBench
') gr.Markdown( """ # DataSense E2B **Live inference** — Gemma-4 2B + SFT v1 writes Python, runs it on your data, reads real stdout/errors. """ ) gr.Markdown( f"📖 [Full project story]({STORY_URL}) · " f"🎬 [Demo video]({DEMO_VIDEO_URL}) · " f"[LinkedIn post]({LINKEDIN_POST_URL}) · " f"LoRA [`DataSense-Modal-E2B-SFT`](https://huggingface.co/{ADAPTER_MODEL})", ) with gr.Row(equal_height=False): with gr.Column(scale=5, elem_id="ds-panel"): gr.Markdown("### Configure") data_mode = gr.Radio( choices=["Bundled examples", "Upload your file"], value="Bundled examples", label="Data source", ) dataset = gr.Dropdown( choices=dataset_choices, value=dataset_choices[0], label="Demo dataset", ) upload = gr.File( label="Your CSV or Excel file", file_types=[".csv", ".xlsx", ".xls"], type="filepath", visible=False, ) gr.Markdown("### Data preview") preview_meta = gr.Markdown(default_meta) with gr.Group(elem_id="ds-preview-box"): preview_df = gr.Dataframe( value=default_df, interactive=False, wrap=True, max_height=280, ) task = gr.Textbox( label="Question / task", placeholder="e.g. Which product had the highest total revenue?", lines=3, ) max_steps = gr.Slider( minimum=3, maximum=12, value=AGENT_MAX_STEPS, step=1, label="Max agent steps", ) run_btn = gr.Button("▶ Run DataSense", variant="primary", elem_id="run-btn") gr.Examples( examples=DEMO_EXAMPLES, inputs=[dataset, task], label="Quick examples (bundled data)", ) with gr.Column(scale=7, elem_id="ds-results"): gr.Markdown("### Results") progress_out = gr.HTML(value=IDLE_PROGRESS_HTML) with gr.Tabs(): with gr.Tab("🔍 Execution trace", id="trace_tab"): steps_out = gr.Markdown() with gr.Tab("✅ Answer", id="answer_tab"): answer_out = gr.HTML() preview_inputs = [data_mode, dataset, upload] data_mode.change(_toggle_data_inputs, data_mode, [dataset, upload]).then( _load_preview, preview_inputs, [preview_df, preview_meta] ) dataset.change(_load_preview, preview_inputs, [preview_df, preview_meta]) upload.change(_load_preview, preview_inputs, [preview_df, preview_meta]) run_btn.click( fn=run_task, inputs=[data_mode, dataset, upload, task, max_steps], outputs=[progress_out, steps_out, answer_out], show_progress="hidden", ) demo.load(_load_preview, preview_inputs, [preview_df, preview_meta]) return demo try: preload_model() except Exception as exc: print(f"Startup preload skipped (will load on first run): {exc}") demo = build_ui() if __name__ == "__main__": demo.queue(max_size=8).launch(theme=build_theme(), css=CUSTOM_CSS)