"""Streamlit dashboard for browsing NL2SQL benchmark results.

Lets the user pick one ``*.jsonl`` file from ``benchmarks/results/``, loads it
into a DataFrame (one JSON value per non-blank line) and renders aggregate
metrics plus latency, cost-vs-accuracy and repair-attempt charts.

Sections whose input columns are absent from the selected file are skipped
with a notice instead of aborting the whole page.
"""

import json
from pathlib import Path

import pandas as pd
import plotly.express as px
import streamlit as st

RESULTS_DIR = Path("benchmarks/results")


def _load_rows(path):
    """Parse a JSON Lines file into a list of decoded records.

    Blank lines (e.g. a trailing newline at end of file) are skipped.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If a non-blank line is not valid JSON; the message names
            the file and the 1-based line number.
    """
    rows = []
    # Context manager closes the handle even if a line fails to parse.
    with open(path, encoding="utf-8") as handle:
        for line_no, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                raise ValueError(
                    f"{path}:{line_no}: invalid JSON ({err.msg})"
                ) from err
    return rows


def _missing_columns(frame, required):
    """Return the names in *required* that are not columns of *frame*."""
    return [name for name in required if name not in frame.columns]


def _format_mean(frame, column, render):
    """Render the mean of *column*, or ``"n/a"`` when it cannot be shown.

    Args:
        frame: DataFrame holding the benchmark rows.
        column: Name of the column to average.
        render: Callable turning the mean into the display string.

    Returns:
        ``render(mean)``, or ``"n/a"`` if the column is absent or its mean
        is NaN (e.g. every value is missing).
    """
    if column not in frame.columns:
        return "n/a"
    mean = frame[column].mean()
    if pd.isna(mean):
        return "n/a"
    return render(mean)


st.set_page_config(page_title="NL2SQL Benchmark Dashboard", layout="wide")
st.title("📊 NL2SQL Copilot – Benchmark Dashboard")

# 1. Load results
# Sorted so the selectbox order is stable across reruns (glob order is
# filesystem-dependent).
result_files = sorted(RESULTS_DIR.glob("*.jsonl"))
if not result_files:
    st.warning("No benchmark result files found in benchmarks/results/")
    st.stop()

file = st.selectbox("Select benchmark file", result_files)
try:
    rows = _load_rows(file)
except (OSError, ValueError) as err:
    st.error(f"Could not load {file}: {err}")
    st.stop()

df = pd.DataFrame(rows)
if df.empty:
    # An empty frame has no columns, so none of the sections below can render.
    st.warning(f"{file} contains no benchmark rows.")
    st.stop()

# 2. Summary metrics
st.subheader("Aggregate Metrics")
col1, col2, col3, col4 = st.columns(4)
col1.metric("Total Queries", len(df))
# exec_acc / safe_fail means are shown as percentages — presumably the
# columns hold 0/1 flags or fractions; confirm against the benchmark writer.
col2.metric(
    "Execution Accuracy",
    _format_mean(df, "exec_acc", lambda m: f"{m * 100:.1f}%"),
)
col3.metric(
    "Safety Violations",
    _format_mean(df, "safe_fail", lambda m: f"{m * 100:.1f}%"),
)
col4.metric(
    "Average Latency (ms)",
    _format_mean(df, "latency_ms", lambda m: f"{m:.0f}"),
)

# 3. Latency Distribution
st.subheader("Latency Distribution")
missing = _missing_columns(df, ["latency_ms"])
if missing:
    st.info(f"Skipped: missing column(s) {', '.join(missing)}")
else:
    fig1 = px.histogram(
        df, x="latency_ms", nbins=30, title="Latency Histogram"
    )
    st.plotly_chart(fig1, use_container_width=True)

# 4. Cost vs Accuracy
st.subheader("Cost vs Execution Accuracy")
missing = _missing_columns(df, ["cost_usd", "exec_acc", "provider", "query"])
if missing:
    st.info(f"Skipped: missing column(s) {', '.join(missing)}")
else:
    fig2 = px.scatter(
        df,
        x="cost_usd",
        y="exec_acc",
        color="provider",
        title="Trade-off: Cost vs Accuracy",
        hover_data=["query"],
    )
    st.plotly_chart(fig2, use_container_width=True)

# 5. Repair Stats
if "repair_attempts" in df.columns:
    st.subheader("Repair Attempts")
    # One bar per distinct repair_attempts value, height = number of rows.
    repair_counts = (
        df.groupby("repair_attempts").size().reset_index(name="count")
    )
    fig3 = px.bar(
        repair_counts,
        x="repair_attempts",
        y="count",
        title="Number of Repair Attempts per Query",
    )
    st.plotly_chart(fig3, use_container_width=True)