Spaces:

build-small-hackathon
/

DataSense_E2B

Running on Zero

App Files Files Community

DataSense_E2B / app.py

sanjaymalladi

Rollback app.py to 7d1f6af (last stable before chart experiment)

d1fcb83 verified 18 days ago

Raw

History Blame Contribute Delete

15 kB

	"""
	DataSense E2B — Hugging Face Space demo
	Execution-grounded data agent (SFT v1) with bundled or uploaded CSVs/Excel.
	"""

	from __future__ import annotations

	import html
	import re
	import spaces # must be first — before any torch/CUDA import

	from pathlib import Path

	import gradio as gr
	import pandas as pd

	from agent import run_agent
	from config import ADAPTER_MODEL, AGENT_MAX_STEPS, DATA_DIR
	from examples import DEMO_DATASETS, DEMO_EXAMPLES

	# Process-wide model/tokenizer cache; both stay None until _load_model() fills them.
	MODEL, TOKENIZER = None, None
	# External links interpolated into the header Markdown assembled in build_ui().
	STORY_URL = "https://datasense-e2b.netlify.app/"
	DEMO_VIDEO_URL = "https://youtu.be/ucFoCdMK7sE"
	LINKEDIN_POST_URL = (
	"https://www.linkedin.com/posts/sanjaymalladi_buildsmall-huggingface-modal-share-7471993638814654464-47hY/"
	)

	# Stylesheet passed to launch(css=...) in the __main__ guard at the bottom of the file.
	# It declares the --ds-* palette variables on .gradio-container, styles the elem_id
	# hooks assigned in build_ui() (ds-header, ds-panel, ds-results, ds-preview-box,
	# run-btn) and the raw-HTML #ds-badge, and styles the ds-progress-*, ds-answer-*,
	# ds-summary and ds-trace-wrap classes emitted by the HTML helper functions.
	# NOTE(review): #ds-progress-label is not referenced by any code visible in this file.
	CUSTOM_CSS = """
	@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500&family=Newsreader:ital,opsz,wght@0,6..72,400;0,6..72,600;1,6..72,400&display=swap');

	.gradio-container {
	--ds-bg: #0c1117;
	--ds-surface: #151c26;
	--ds-border: #2a3544;
	--ds-accent: #3ecfae;
	--ds-accent-dim: #1f6f5c;
	--ds-text: #e8eef5;
	--ds-muted: #8b9cb3;
	max-width: 1280px !important;
	font-family: 'Newsreader', Georgia, serif !important;
	}
	#ds-header {
	background: linear-gradient(135deg, #0f1a24 0%, #122a2a 55%, #0c1117 100%);
	border: 1px solid var(--ds-border);
	border-radius: 16px;
	padding: 1.5rem 1.75rem;
	margin-bottom: 1rem;
	}
	#ds-header h1 {
	font-family: 'Newsreader', Georgia, serif;
	font-size: 2rem;
	font-weight: 600;
	margin: 0 0 0.35rem 0;
	color: var(--ds-text);
	}
	#ds-badge {
	display: inline-block;
	font-family: 'IBM Plex Mono', monospace;
	font-size: 0.72rem;
	letter-spacing: 0.08em;
	text-transform: uppercase;
	color: var(--ds-accent);
	border: 1px solid var(--ds-accent-dim);
	border-radius: 999px;
	padding: 0.2rem 0.65rem;
	margin-bottom: 0.75rem;
	}
	#ds-panel, #ds-results {
	background: var(--ds-surface);
	border: 1px solid var(--ds-border);
	border-radius: 14px;
	padding: 1.1rem;
	min-height: 520px;
	}
	#ds-preview-box {
	margin-top: 0.5rem;
	border: 1px solid var(--ds-border);
	border-radius: 10px;
	overflow: hidden;
	}
	#ds-preview-box .label-wrap { padding: 0.5rem 0.75rem !important; }
	#run-btn {
	background: linear-gradient(90deg, #1f6f5c, #3ecfae) !important;
	border: none !important;
	font-weight: 600 !important;
	letter-spacing: 0.02em;
	}
	#ds-progress-label {
	font-family: 'IBM Plex Mono', monospace;
	font-size: 0.82rem;
	color: var(--ds-accent);
	margin: 0 0 0.35rem 0;
	}
	.ds-progress-wrap {
	margin: 0 0 1rem 0;
	padding: 0.85rem 1rem;
	background: #0f1419;
	border: 1px solid #2a3544;
	border-radius: 10px;
	}
	.ds-progress-text {
	font-family: 'IBM Plex Mono', monospace;
	font-size: 0.82rem;
	color: #3ecfae;
	margin-bottom: 0.55rem;
	line-height: 1.4;
	}
	.ds-progress-text .ds-pct {
	color: #8b9cb3;
	font-size: 0.75rem;
	}
	.ds-progress-track {
	height: 6px;
	background: #1a2330;
	border-radius: 999px;
	overflow: hidden;
	}
	.ds-progress-fill {
	height: 100%;
	background: linear-gradient(90deg, #1f6f5c, #3ecfae);
	border-radius: 999px;
	transition: width 0.35s ease;
	}
	.ds-progress-wrap.ds-idle .ds-progress-track { display: none; }
	#ds-results .tabitem { padding-top: 0.75rem !important; }
	.ds-answer-card {
	background: linear-gradient(145deg, #122a2a 0%, #151c26 100%);
	border: 1px solid #2a3544;
	border-left: 4px solid #3ecfae;
	border-radius: 12px;
	padding: 1.5rem 1.75rem;
	margin: 0;
	min-height: 200px;
	}
	.ds-answer-label {
	font-family: 'IBM Plex Mono', monospace;
	font-size: 0.72rem;
	letter-spacing: 0.1em;
	text-transform: uppercase;
	color: #3ecfae;
	margin-bottom: 0.75rem;
	}
	.ds-answer-value {
	font-family: 'Newsreader', Georgia, serif;
	font-size: 1.35rem;
	font-weight: 400;
	color: #c5d0de;
	line-height: 1.55;
	word-break: break-word;
	}
	.ds-answer-value strong {
	color: #ffffff;
	font-weight: 700;
	}
	.ds-summary {
	margin-top: 1.25rem;
	padding-top: 1rem;
	border-top: 1px solid #2a3544;
	font-size: 1.05rem;
	color: #8b9cb3;
	line-height: 1.55;
	}
	.ds-answer-empty {
	color: #8b9cb3;
	font-style: italic;
	padding: 1rem 0;
	}
	.ds-trace-wrap { margin: 0; padding: 0; }
	.ds-trace-wrap h3 { margin-top: 1rem; color: #3ecfae; font-size: 1rem; }
	footer { visibility: hidden; }
	"""


	def build_theme() -> gr.Theme:
	    """Return the dark Gradio theme used when the app is launched.

	    Starts from ``gr.themes.Base`` with emerald/slate/gray hues and the
	    Newsreader / IBM Plex Mono Google fonts, then pins the background, block,
	    text and primary-button colours to the same hex values for both the light
	    and ``_dark`` variants.
	    """
	    palette = gr.themes.colors
	    base_theme = gr.themes.Base(
	        primary_hue=palette.emerald,
	        secondary_hue=palette.slate,
	        neutral_hue=palette.gray,
	        font=gr.themes.GoogleFont("Newsreader"),
	        font_mono=gr.themes.GoogleFont("IBM Plex Mono"),
	    )
	    # Light and dark variants get the same values so the page looks the same in both.
	    overrides = {
	        "body_background_fill": "#0c1117",
	        "body_background_fill_dark": "#0c1117",
	        "block_background_fill": "#151c26",
	        "block_background_fill_dark": "#151c26",
	        "block_border_width": "1px",
	        "block_border_color": "#2a3544",
	        "block_border_color_dark": "#2a3544",
	        "body_text_color": "#e8eef5",
	        "body_text_color_dark": "#e8eef5",
	        "button_primary_background_fill": "linear-gradient(90deg, #1f6f5c, #3ecfae)",
	        "button_primary_background_fill_hover": "linear-gradient(90deg, #2a8a72, #4de0c0)",
	    }
	    return base_theme.set(**overrides)


	def _load_model():
	    """Return the cached ``(MODEL, TOKENIZER)`` pair, loading it on first use.

	    The module-level globals are filled at most once per process; later calls
	    return the cached objects without touching ``model_loader`` again.
	    """
	    global MODEL, TOKENIZER
	    already_loaded = MODEL is not None and TOKENIZER is not None
	    if already_loaded:
	        return MODEL, TOKENIZER

	    # Imported here, not at module top, so it only runs when a load is needed.
	    # NOTE(review): presumably this keeps heavy model dependencies out of app import — confirm.
	    from model_loader import load_model_and_tokenizer

	    MODEL, TOKENIZER = load_model_and_tokenizer()
	    return MODEL, TOKENIZER


	def _resolve_data_path(data_mode: str, dataset_name: str, upload_file) -> Path \| None:
	if data_mode == "Upload your file":
	if upload_file is None:
	return None
	path_str = upload_file[0] if isinstance(upload_file, list) else upload_file
	if not path_str:
	return None
	path = Path(path_str)
	return path if path.is_file() else None
	return DEMO_DATASETS.get(dataset_name)


	def _load_preview(data_mode: str, dataset_name: str, upload_file):
	    """Load up to 100 rows of the selected file for the preview table.

	    Returns a ``(DataFrame, markdown_caption)`` pair. When no file resolves, or
	    reading fails for any reason, the frame is empty and the caption carries an
	    italic hint or error message instead of raising into the UI.
	    """
	    source = _resolve_data_path(data_mode, dataset_name, upload_file)
	    if source is None:
	        return pd.DataFrame(), "_Select or upload a file to preview._"

	    try:
	        # Excel extensions use read_excel; everything else is read as CSV.
	        is_excel = source.suffix.lower() in (".xlsx", ".xls")
	        reader = pd.read_excel if is_excel else pd.read_csv
	        frame = reader(source, nrows=100)
	        caption = f"{source.name} · {len(frame)} rows · {len(frame.columns)} columns"
	    except Exception as exc:
	        # UI boundary: report the failure in the caption rather than crash the page.
	        return pd.DataFrame(), f"_Could not preview file: {exc}_"
	    return frame, caption


	def _inline_markdown_to_html(text: str) -> str:
	"""Turn bold into <strong>; escape everything else."""
	if not text:
	return ""
	parts = re.split(r"\\(.+?)\\", text)
	chunks: list[str] = []
	for i, part in enumerate(parts):
	safe = html.escape(part)
	chunks.append(f"<strong>{safe}</strong>" if i % 2 == 1 else safe)
	return "".join(chunks).replace("**", "")


	def _format_answer_html(answer: str, summary: str = "") -> str:
	if not answer:
	return '<div class="ds-answer-empty">Could not parse an answer — check the execution trace tab.</div>'
	answer_html = _inline_markdown_to_html(answer)
	summary_block = ""
	if summary:
	summary_block = f'<p class="ds-summary">{_inline_markdown_to_html(summary)}</p>'
	return (
	f'<div class="ds-answer-card">'
	f'<div class="ds-answer-label">Verified answer</div>'
	f'<div class="ds-answer-value">{answer_html}</div>'
	f"{summary_block}"
	f"</div>"
	)


	def _toggle_data_inputs(data_mode: str):
	    """Return visibility updates for the dataset dropdown and the upload widget.

	    Exactly one of the two is visible: the upload widget when ``data_mode`` is
	    ``"Upload your file"``, the dropdown otherwise. The tuple order matches the
	    ``[dataset, upload]`` outputs wired in ``build_ui``.
	    """
	    show_upload = data_mode == "Upload your file"
	    dropdown_update = gr.update(visible=not show_upload)
	    upload_update = gr.update(visible=show_upload)
	    return dropdown_update, upload_update


	def _wrap_trace(trace: str) -> str:
	if not trace:
	return ""
	return f'<div class="ds-trace-wrap">\n\n{trace}\n\n</div>'


	def _progress_html(pct: int \| None, label: str) -> str:
	safe_label = html.escape(label)
	if pct is None:
	return f'<div class="ds-progress-wrap ds-idle"><div class="ds-progress-text">{safe_label}</div></div>'
	pct = max(0, min(100, pct))
	return (
	f'<div class="ds-progress-wrap">'
	f'<div class="ds-progress-text">{safe_label} <span class="ds-pct">{pct}%</span></div>'
	f'<div class="ds-progress-track"><div class="ds-progress-fill" style="width:{pct}%"></div></div>'
	f"</div>"
	)


	def _progress_update(pct: int | None, label: str, trace: str = "", answer: str = ""):
	    """Assemble one UI update triple for ``run_task`` to yield.

	    Args:
	        pct: Percentage passed to ``_progress_html`` (``None`` for the idle look).
	        label: Status text passed to ``_progress_html``.
	        trace: Trace markdown, wrapped by ``_wrap_trace``.
	        answer: Answer HTML, passed through unchanged.

	    Returns:
	        ``(progress_html, wrapped_trace, answer)`` in the order of the
	        ``[progress_out, steps_out, answer_out]`` outputs wired in ``build_ui``.
	    """
	    return (
	        _progress_html(pct, label),
	        _wrap_trace(trace),
	        answer,
	    )


	# Idle-state progress markup (pct=None, so no track); initial value of the progress HTML in build_ui().
	IDLE_PROGRESS_HTML = _progress_html(None, "Ready — click Run DataSense")


	@spaces.GPU(duration=180)
	def run_task(
	    data_mode: str,
	    dataset_name: str,
	    upload_file,
	    task: str,
	    max_steps: int,
	    progress=gr.Progress(),
	):
	    """Run the agent on the selected data and stream UI updates.

	    This is a generator. Every yielded value is the
	    ``(progress_html, trace, answer_html)`` triple built by ``_progress_update``.

	    Args:
	        data_mode: Data-source radio value, forwarded to ``_resolve_data_path``.
	        dataset_name: Bundled dataset key, forwarded to ``_resolve_data_path``.
	        upload_file: Upload widget value, forwarded to ``_resolve_data_path``.
	        task: The user's question; blank or ``None`` stops the run with a hint.
	        max_steps: Step budget, cast to ``int`` before it reaches ``run_agent``.
	        progress: Gradio progress tracker, also handed to ``run_agent``.

	    Yields:
	        Progress at 5 %, 12 % and 22 % during setup, then one update per
	        ``"progress"`` event scaled into 22-95 %, and a 100 % update for the
	        ``"final"`` event. Any exception raised while loading or running is
	        reported as an "Error" update rather than propagated.
	    """
	    # Coalesce None so the blank check cannot raise before the try block below.
	    question = (task or "").strip()
	    if not question:
	        yield _progress_update(None, "Enter a question to run DataSense", "", '<div class="ds-answer-empty">Enter a task question.</div>')
	        return

	    yield _progress_update(5, "Reading your dataset…")
	    data_path = _resolve_data_path(data_mode, dataset_name, upload_file)
	    if data_path is None:
	        msg = "⚠️ Upload a `.csv` or `.xlsx` file first." if data_mode == "Upload your file" else f"⚠️ Dataset not found: {dataset_name}"
	        yield _progress_update(0, "Dataset missing", "", f'<div class="ds-answer-empty">{html.escape(msg)}</div>')
	        return

	    # Most recent trace shown to the user; kept so a failure does not blank it.
	    last_trace = ""
	    try:
	        yield _progress_update(12, "Loading DataSense on GPU…")
	        progress(0.15, desc="Loading DataSense…")
	        model, tokenizer = _load_model()

	        yield _progress_update(22, "DataSense is starting…")
	        agent_stream = run_agent(
	            model,
	            tokenizer,
	            data_path,
	            question,
	            max_steps=int(max_steps),
	            progress=progress,
	            stream=True,
	        )

	        for event in agent_stream:
	            kind = event[0]
	            if kind == "progress":
	                _, step, total, trace_md = event
	                last_trace = trace_md or last_trace
	                # Agent steps fill the 22..95 % band; max() guards a zero total.
	                pct = int(22 + (73 * step / max(total, 1)))
	                yield _progress_update(
	                    pct,
	                    f"DataSense · step {step}/{total}",
	                    trace_md,
	                    "",
	                )
	            elif kind == "final":
	                result = event[1]
	                answer_html = _format_answer_html(result.get("answer", ""), result.get("summary", ""))
	                final_trace = result["steps_markdown"]
	                last_trace = final_trace or last_trace
	                yield _progress_update(
	                    100,
	                    "DataSense finished",
	                    final_trace,
	                    answer_html,
	                )
	    except Exception as exc:
	        # UI boundary: show the error and keep the trace gathered so far visible.
	        yield _progress_update(0, "Error", last_trace, f'<div class="ds-answer-empty">Error: {html.escape(str(exc))}</div>')


	@spaces.GPU(duration=300)
	def preload_model():
	    """Fill the module-level model cache by calling ``_load_model()``.

	    The loaded pair is not returned; callers get it later from ``_load_model()``.
	    """
	    _load_model()


	def build_ui() -> gr.Blocks:
	    """Assemble the Gradio Blocks app and wire its events.

	    Layout: a header column, then a row with a configuration column (data
	    source, preview, question, step slider, run button, examples) and a results
	    column (progress HTML plus trace/answer tabs). The preview is loaded once
	    here for the first bundled dataset and refreshed on page load and whenever
	    the data source, dataset or upload changes.

	    NOTE(review): the container nesting was reconstructed from a listing whose
	    indentation was lost — confirm against the deployed layout, in particular
	    that ``gr.Examples`` belongs inside the configuration column.
	    """
	    bundled_names = list(DEMO_DATASETS.keys())
	    first_dataset = bundled_names[0]
	    initial_frame, initial_caption = _load_preview("Bundled examples", first_dataset, None)

	    header_links = (
	        f"📖 [Full project story]({STORY_URL}) · "
	        f"🎬 [Demo video]({DEMO_VIDEO_URL}) · "
	        f"[LinkedIn post]({LINKEDIN_POST_URL}) · "
	        f"LoRA [`DataSense-Modal-E2B-SFT`](https://huggingface.co/{ADAPTER_MODEL})"
	    )

	    with gr.Blocks(title="DataSense E2B") as demo:
	        # --- Header -----------------------------------------------------
	        with gr.Column(elem_id="ds-header"):
	            gr.HTML('<div id="ds-badge">Execution-verified · Gemma / DataBench</div>')
	            gr.Markdown(
	                """
	# DataSense E2B
	Live inference — Gemma-4 2B + SFT v1 writes Python, runs it on your data, reads real stdout/errors.
	"""
	            )
	            gr.Markdown(header_links)

	        with gr.Row(equal_height=False):
	            # --- Left: configuration --------------------------------------
	            with gr.Column(scale=5, elem_id="ds-panel"):
	                gr.Markdown("### Configure")
	                data_mode = gr.Radio(
	                    choices=["Bundled examples", "Upload your file"],
	                    value="Bundled examples",
	                    label="Data source",
	                )
	                dataset = gr.Dropdown(
	                    choices=bundled_names,
	                    value=first_dataset,
	                    label="Demo dataset",
	                )
	                # Hidden until the radio switches to upload mode (see _toggle_data_inputs).
	                upload = gr.File(
	                    label="Your CSV or Excel file",
	                    file_types=[".csv", ".xlsx", ".xls"],
	                    type="filepath",
	                    visible=False,
	                )

	                gr.Markdown("### Data preview")
	                preview_meta = gr.Markdown(initial_caption)
	                with gr.Group(elem_id="ds-preview-box"):
	                    preview_df = gr.Dataframe(
	                        value=initial_frame,
	                        interactive=False,
	                        wrap=True,
	                        max_height=280,
	                    )

	                task = gr.Textbox(
	                    label="Question / task",
	                    placeholder="e.g. Which product had the highest total revenue?",
	                    lines=3,
	                )
	                max_steps = gr.Slider(
	                    minimum=3,
	                    maximum=12,
	                    value=AGENT_MAX_STEPS,
	                    step=1,
	                    label="Max agent steps",
	                )
	                run_btn = gr.Button("▶ Run DataSense", variant="primary", elem_id="run-btn")

	                gr.Examples(
	                    examples=DEMO_EXAMPLES,
	                    inputs=[dataset, task],
	                    label="Quick examples (bundled data)",
	                )

	            # --- Right: results -------------------------------------------
	            with gr.Column(scale=7, elem_id="ds-results"):
	                gr.Markdown("### Results")
	                progress_out = gr.HTML(value=IDLE_PROGRESS_HTML)
	                with gr.Tabs():
	                    with gr.Tab("🔍 Execution trace", id="trace_tab"):
	                        steps_out = gr.Markdown()
	                    with gr.Tab("✅ Answer", id="answer_tab"):
	                        answer_out = gr.HTML()

	        # --- Event wiring -------------------------------------------------
	        preview_inputs = [data_mode, dataset, upload]
	        preview_outputs = [preview_df, preview_meta]

	        # Switching the source first swaps the visible widget, then reloads the preview.
	        mode_switched = data_mode.change(_toggle_data_inputs, data_mode, [dataset, upload])
	        mode_switched.then(_load_preview, preview_inputs, preview_outputs)
	        for source_widget in (dataset, upload):
	            source_widget.change(_load_preview, preview_inputs, preview_outputs)

	        # NOTE(review): the built-in indicator is hidden, presumably because
	        # run_task streams its own progress bar into progress_out — confirm.
	        run_btn.click(
	            fn=run_task,
	            inputs=[data_mode, dataset, upload, task, max_steps],
	            outputs=[progress_out, steps_out, answer_out],
	            show_progress="hidden",
	        )

	        demo.load(_load_preview, preview_inputs, preview_outputs)

	    return demo


	# Best-effort warm-up at import time: if preloading fails for any reason the
	# app still starts, and _load_model() runs on the first request instead.
	try:
	    preload_model()
	except Exception as preload_error:
	    print(f"Startup preload skipped (will load on first run): {preload_error}")

	# Module-level handle to the Blocks app, built at import time.
	demo = build_ui()

	if __name__ == "__main__":
	    # Queue holds at most 8 pending requests; theme and CSS are applied at launch.
	    demo.queue(max_size=8).launch(
	        theme=build_theme(),
	        css=CUSTOM_CSS,
	    )