# Source: nl2sql-copilot/ui/benchmark_app.py
# Melika Kheirieh - style: fix Ruff issues (F401 unused import, E741 ambiguous var) [3c7eebd]
# (repository viewer residue: raw / history blame / 1.71 kB)
import json
import pandas as pd
import streamlit as st
import plotly.express as px
from pathlib import Path
# Page-level settings for the dashboard: browser tab title and wide layout.
st.set_page_config(page_title="NL2SQL Benchmark Dashboard", layout="wide")
# The heading text was mojibake (UTF-8 bytes of the bar-chart emoji and of an
# en dash decoded with the wrong codec); the intended characters are restored.
st.title("📊 NL2SQL Copilot – Benchmark Dashboard")
# 1. Load results
def _read_jsonl(path):
    """Parse a JSON Lines file into a list of decoded records.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle as soon as parsing finishes; the
    # explicit encoding keeps decoding independent of the platform default.
    with open(path, encoding="utf-8") as handle:
        # Blank lines (stray empty lines between or after records) carry no
        # record and would otherwise make json.loads raise.
        return [json.loads(line) for line in handle if line.strip()]


# glob() yields paths in arbitrary order; sorting keeps the selectbox stable.
result_files = sorted(Path("benchmarks/results").glob("*.jsonl"))
if not result_files:
    st.warning("No benchmark result files found in benchmarks/results/")
    st.stop()

file = st.selectbox("Select benchmark file", result_files)
rows = _read_jsonl(file)
df = pd.DataFrame(rows)
if df.empty:
    # A file without records produces a frame with no columns, so the metric
    # and chart sections below would fail on their column lookups.
    st.warning(f"No benchmark records found in {file}")
    st.stop()
# 2. Summary metrics
# Four side-by-side cards: row count plus the means of three result columns.
st.subheader("Aggregate Metrics")
col1, col2, col3, col4 = st.columns(4)

total_queries = len(df)
col1.metric("Total Queries", total_queries)

# exec_acc and safe_fail are averaged and scaled by 100 for percent display.
exec_accuracy_pct = df["exec_acc"].mean() * 100
col2.metric("Execution Accuracy", f"{exec_accuracy_pct:.1f}%")

safety_violation_pct = df["safe_fail"].mean() * 100
col3.metric("Safety Violations", f"{safety_violation_pct:.1f}%")

mean_latency_ms = df["latency_ms"].mean()
col4.metric("Average Latency (ms)", f"{mean_latency_ms:.0f}")
# 3. Latency Distribution
st.subheader("Latency Distribution")
# Histogram over the latency_ms column of the loaded results.
latency_histogram_options = {
    "x": "latency_ms",
    "nbins": 30,
    "title": "Latency Histogram",
}
fig1 = px.histogram(df, **latency_histogram_options)
st.plotly_chart(fig1, use_container_width=True)
# 4. Cost vs Accuracy
st.subheader("Cost vs Execution Accuracy")
# Scatter of cost_usd against exec_acc, coloured by provider; the query text
# is attached as hover data for each point.
scatter_options = {
    "x": "cost_usd",
    "y": "exec_acc",
    "color": "provider",
    "title": "Trade-off: Cost vs Accuracy",
    "hover_data": ["query"],
}
fig2 = px.scatter(df, **scatter_options)
st.plotly_chart(fig2, use_container_width=True)
# 5. Repair Stats
# This section is optional: it is rendered only when the loaded results
# include a repair_attempts column.
has_repair_column = "repair_attempts" in df.columns
if has_repair_column:
    st.subheader("Repair Attempts")
    # Count how many rows share each repair_attempts value.
    repair_counts = (
        df.groupby("repair_attempts")
        .size()
        .reset_index(name="count")
    )
    fig3 = px.bar(
        repair_counts,
        x="repair_attempts",
        y="count",
        title="Number of Repair Attempts per Query",
    )
    st.plotly_chart(fig3, use_container_width=True)