File size: 1,672 Bytes
570f7bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import json
import pandas as pd
import streamlit as st
import plotly.express as px
from pathlib import Path

st.set_page_config(page_title="NL2SQL Benchmark Dashboard", layout="wide")

st.title("📊 NL2SQL Copilot – Benchmark Dashboard")

# 1. Load results
result_files = list(Path("benchmarks/results").glob("*.jsonl"))
if not result_files:
    st.warning("No benchmark result files found in benchmarks/results/")
    st.stop()

file = st.selectbox("Select benchmark file", result_files)
rows = [json.loads(l) for l in open(file)]
df = pd.DataFrame(rows)

# 2. Summary metrics
st.subheader("Aggregate Metrics")
col1, col2, col3, col4 = st.columns(4)
col1.metric("Total Queries", len(df))
col2.metric("Execution Accuracy", f"{df['exec_acc'].mean()*100:.1f}%")
col3.metric("Safety Violations", f"{df['safe_fail'].mean()*100:.1f}%")
col4.metric("Average Latency (ms)", f"{df['latency_ms'].mean():.0f}")

# 3. Latency Distribution
st.subheader("Latency Distribution")
fig1 = px.histogram(df, x="latency_ms", nbins=30, title="Latency Histogram")
st.plotly_chart(fig1, use_container_width=True)

# 4. Cost vs Accuracy
st.subheader("Cost vs Execution Accuracy")
fig2 = px.scatter(df, x="cost_usd", y="exec_acc", color="provider",
                  title="Trade-off: Cost vs Accuracy", hover_data=["query"])
st.plotly_chart(fig2, use_container_width=True)

# 5. Repair Stats
if "repair_attempts" in df.columns:
    st.subheader("Repair Attempts")
    fig3 = px.bar(df.groupby("repair_attempts").size().reset_index(name="count"),
                  x="repair_attempts", y="count", title="Number of Repair Attempts per Query")
    st.plotly_chart(fig3, use_container_width=True)