Spaces:
Sleeping
Sleeping
| import json | |
| import pandas as pd | |
| import streamlit as st | |
| import plotly.express as px | |
| from pathlib import Path | |
| st.set_page_config(page_title="NL2SQL Benchmark Dashboard", layout="wide") | |
| st.title("π NL2SQL Copilot β Benchmark Dashboard") | |
| # 1. Load results | |
| result_files = list(Path("benchmarks/results").glob("*.jsonl")) | |
| if not result_files: | |
| st.warning("No benchmark result files found in benchmarks/results/") | |
| st.stop() | |
| file = st.selectbox("Select benchmark file", result_files) | |
| rows = [json.loads(line) for line in open(file)] | |
| df = pd.DataFrame(rows) | |
| # 2. Summary metrics | |
| st.subheader("Aggregate Metrics") | |
| col1, col2, col3, col4 = st.columns(4) | |
| col1.metric("Total Queries", len(df)) | |
| col2.metric("Execution Accuracy", f"{df['exec_acc'].mean() * 100:.1f}%") | |
| col3.metric("Safety Violations", f"{df['safe_fail'].mean() * 100:.1f}%") | |
| col4.metric("Average Latency (ms)", f"{df['latency_ms'].mean():.0f}") | |
| # 3. Latency Distribution | |
| st.subheader("Latency Distribution") | |
| fig1 = px.histogram(df, x="latency_ms", nbins=30, title="Latency Histogram") | |
| st.plotly_chart(fig1, use_container_width=True) | |
| # 4. Cost vs Accuracy | |
| st.subheader("Cost vs Execution Accuracy") | |
| fig2 = px.scatter( | |
| df, | |
| x="cost_usd", | |
| y="exec_acc", | |
| color="provider", | |
| title="Trade-off: Cost vs Accuracy", | |
| hover_data=["query"], | |
| ) | |
| st.plotly_chart(fig2, use_container_width=True) | |
| # 5. Repair Stats | |
| if "repair_attempts" in df.columns: | |
| st.subheader("Repair Attempts") | |
| fig3 = px.bar( | |
| df.groupby("repair_attempts").size().reset_index(name="count"), | |
| x="repair_attempts", | |
| y="count", | |
| title="Number of Repair Attempts per Query", | |
| ) | |
| st.plotly_chart(fig3, use_container_width=True) | |