Spaces:

melikakheirieh
/

nl2sql-copilot

Sleeping

nl2sql-copilot / ui /benchmark_app.py

Melika Kheirieh

style: fix Ruff issues (F401 unused import, E741 ambiguous var)

3c7eebd about 2 months ago

1.71 kB

	import json
	import pandas as pd
	import streamlit as st
	import plotly.express as px
	from pathlib import Path

	# Configure the browser tab title and the wide layout, then draw the heading.
	st.set_page_config(
	    layout="wide",
	    page_title="NL2SQL Benchmark Dashboard",
	)
	st.title("📊 NL2SQL Copilot – Benchmark Dashboard")

	# 1. Load results
	# Sort the matches so the select box order is stable between reruns;
	# glob() makes no ordering guarantee.
	result_files = sorted(Path("benchmarks/results").glob("*.jsonl"))
	if not result_files:
	    st.warning("No benchmark result files found in benchmarks/results/")
	    st.stop()

	file = st.selectbox("Select benchmark file", result_files)

	# Read through a context manager so the handle is closed on every rerun,
	# decode explicitly as UTF-8, and skip blank lines (such as a trailing empty
	# line) that json.loads would reject.
	with open(file, encoding="utf-8") as handle:
	    rows = [json.loads(line) for line in handle if line.strip()]
	df = pd.DataFrame(rows)

	# A file with no records yields a frame with no columns; stop here with a
	# message rather than failing on the column lookups further down.
	if df.empty:
	    st.warning(f"No benchmark rows found in {file}")
	    st.stop()

	# 2. Summary metrics
	st.subheader("Aggregate Metrics")


	def _mean_metric(column, fmt, scale=1):
	    """Return the formatted mean of ``df[column]``, or "n/a" if it is absent.

	    ``fmt`` is a ``str.format`` template applied to ``mean * scale``.
	    """
	    # Result files are not guaranteed to carry every column, so a missing
	    # one is reported as "n/a" instead of raising KeyError.
	    if column not in df.columns:
	        return "n/a"
	    return fmt.format(df[column].mean() * scale)


	col1, col2, col3, col4 = st.columns(4)
	col1.metric("Total Queries", len(df))
	col2.metric("Execution Accuracy", _mean_metric("exec_acc", "{:.1f}%", 100))
	col3.metric("Safety Violations", _mean_metric("safe_fail", "{:.1f}%", 100))
	col4.metric("Average Latency (ms)", _mean_metric("latency_ms", "{:.0f}"))

	# 3. Latency Distribution
	st.subheader("Latency Distribution")
	# Histogram of the latency_ms column, split into 30 bins.
	latency_hist = px.histogram(
	    df,
	    x="latency_ms",
	    nbins=30,
	    title="Latency Histogram",
	)
	st.plotly_chart(latency_hist, use_container_width=True)

	# 4. Cost vs Accuracy
	st.subheader("Cost vs Execution Accuracy")
	# The scatter reads four columns (axes, colour and hover text). Plotly Express
	# raises when a referenced column is absent, so check up front and say what
	# is missing instead of crashing the whole page.
	scatter_columns = ["cost_usd", "exec_acc", "provider", "query"]
	missing_columns = [name for name in scatter_columns if name not in df.columns]
	if missing_columns:
	    st.info(
	        "Cost vs accuracy chart skipped; missing columns: "
	        + ", ".join(missing_columns)
	    )
	else:
	    fig2 = px.scatter(
	        df,
	        x="cost_usd",
	        y="exec_acc",
	        color="provider",
	        title="Trade-off: Cost vs Accuracy",
	        hover_data=["query"],
	    )
	    st.plotly_chart(fig2, use_container_width=True)

	# 5. Repair Stats
	# Only drawn when the selected results file recorded repair attempts.
	if "repair_attempts" in df.columns:
	    st.subheader("Repair Attempts")
	    # One row per distinct attempt count, with the number of queries in each.
	    attempt_counts = (
	        df.groupby("repair_attempts")
	        .size()
	        .reset_index(name="count")
	    )
	    fig3 = px.bar(
	        attempt_counts,
	        x="repair_attempts",
	        y="count",
	        title="Number of Repair Attempts per Query",
	    )
	    st.plotly_chart(fig3, use_container_width=True)