import streamlit as st
from utils.visualization import create_radar_chart, create_bar_chart
def _render_chart_tabs(evals, chart_type, key_prefix, empty_msg):
    """
    Render one Streamlit tab per eval name, each containing a single chart.

    Args:
        evals (dict | None): Mapping of eval name -> metrics dict.
        chart_type (str): 'radar' for radar charts; anything else yields bar charts.
        key_prefix (str): Prefix used to build a unique Streamlit widget key per chart.
        empty_msg (str): Info message shown when `evals` is missing or empty.
    """
    if not evals:
        st.info(empty_msg)
        return
    eval_names = list(evals)
    make_chart = create_radar_chart if chart_type == 'radar' else create_bar_chart
    for tab, eval_name in zip(st.tabs(eval_names), eval_names):
        with tab:
            chart = make_chart(evals[eval_name], eval_name)
            # Unique key per chart so Streamlit does not collide duplicate elements.
            st.plotly_chart(chart, use_container_width=True, key=f"{key_prefix}_{eval_name}")


def display_eval_results(eval_results, chart_type, group_by_thread=False):
    """
    Display evaluation results with tabs and charts.

    Recognized `eval_results` shapes:
        1. {group: {"journeyEvals": {name: metrics}, "aiScriptEvals": {name: metrics}}}
        2. {group: {name: metrics}}                      (grouped AI script evals)
        3. {"journeyEvals": {name: metrics}, "aiScriptEvals": {name: metrics}}
        4. {name: metrics}                               (flat AI script evals)

    Args:
        eval_results (dict): Evaluation results in one of the shapes above.
        chart_type (str): Type of chart to create ('radar' or 'bar').
        group_by_thread (bool): Whether results are grouped by thread instead of plot.
    """
    if not eval_results:
        st.info("Select filters to view evaluation results.")
        return
    if "message" in eval_results:
        st.warning(eval_results["message"])
        return

    # Display raw data in a collapsible section
    with st.expander("View Raw Data"):
        st.json(eval_results)

    group_label = "Thread" if group_by_thread else "Plot"

    # Case 1: results grouped by thread/plot, each group carrying both eval families.
    if isinstance(eval_results, dict) and all(
        isinstance(val, dict) and "journeyEvals" in val and "aiScriptEvals" in val
        for val in eval_results.values()
    ):
        for group_name, group_data in eval_results.items():
            st.subheader(f"{group_label}: {group_name}")
            journey_tab, aiscript_tab = st.tabs(["Journey Evaluations", "AI Script Evaluations"])
            with journey_tab:
                _render_chart_tabs(
                    group_data.get("journeyEvals"), chart_type,
                    key_prefix=f"journey_{group_name}",
                    empty_msg="No journey evaluation data available for this group.",
                )
            with aiscript_tab:
                _render_chart_tabs(
                    group_data.get("aiScriptEvals"), chart_type,
                    key_prefix=f"aiscript_{group_name}",
                    empty_msg="No AI script evaluation data available for this group.",
                )
            # Separator between plots/threads
            st.markdown("---")

    # Case 3, checked BEFORE Case 2: flat results carrying both eval families.
    # BUGFIX: this shape also satisfies the Case 2 predicate (its values are
    # dicts that do not themselves contain "journeyEvals"/"aiScriptEvals"), so
    # the original elif ordering misrouted it to the grouped branch and this
    # branch was effectively unreachable.
    elif isinstance(eval_results, dict) and "journeyEvals" in eval_results and "aiScriptEvals" in eval_results:
        journey_tab, aiscript_tab = st.tabs(["Journey Evaluations", "AI Script Evaluations"])
        with journey_tab:
            _render_chart_tabs(
                eval_results["journeyEvals"], chart_type,
                key_prefix="journey",
                empty_msg="No journey evaluation data available.",
            )
        with aiscript_tab:
            _render_chart_tabs(
                eval_results["aiScriptEvals"], chart_type,
                key_prefix="aiscript",
                empty_msg="No AI script evaluation data available.",
            )

    # Case 2: AI script evals grouped by thread/plot: {group: {name: metrics}}.
    elif isinstance(eval_results, dict) and all(
        isinstance(val, dict) and not ("journeyEvals" in val or "aiScriptEvals" in val)
        for val in eval_results.values()
    ):
        for group_name, group_data in eval_results.items():
            st.subheader(f"{group_label}: {group_name}")
            _render_chart_tabs(
                group_data, chart_type,
                key_prefix=f"aiscript_{group_name}",
                empty_msg="No AI script evaluation metrics available for this group.",
            )
            # Separator between groups
            st.markdown("---")

    # Case 4: flat {name: metrics} with no grouping (group_by_plots=False).
    elif isinstance(eval_results, dict) and not ("journeyEvals" in eval_results or "aiScriptEvals" in eval_results):
        _render_chart_tabs(
            eval_results, chart_type,
            key_prefix="aiscript_direct",
            empty_msg="No AI script evaluation metrics available.",
        )

    else:
        st.error("Unrecognized result format")
def display_data_overview(runs_df, turns_df, ai_script_evals_df, journey_evals_df):
    """
    Display data overview tabs.

    Args:
        runs_df (pd.DataFrame): Runs data
        turns_df (pd.DataFrame): Turns data
        ai_script_evals_df (pd.DataFrame): AI script evaluations data
        journey_evals_df (pd.DataFrame): Journey evaluations data
    """
    st.header("Data Overview")

    # Check if any data is available
    if runs_df.empty and turns_df.empty and ai_script_evals_df.empty and journey_evals_df.empty:
        st.warning("No data available. Please check your connection to AWS Athena and ensure that the database and tables exist.")
        # Add some troubleshooting tips
        with st.expander("Troubleshooting Tips"):
            st.markdown("""
### Troubleshooting Steps:
1. **AWS SSO Authentication**: Make sure you've run `aws sso login --profile your_profile` before starting the dashboard
2. **AWS Region**: Verify that the region in your `.env` file matches the region where your Athena database is located
3. **Athena Database and Tables**: Confirm that the database name and table names in your `.env` file are correct
4. **AWS Permissions**: Ensure your AWS role has permissions to query Athena and access the S3 bucket for query results
5. **Network Connectivity**: Check that you have network connectivity to AWS services
6. **Check Logs**: Look at the application logs for more detailed error messages
""")
        return

    # One (tab title, section subheader, dataframe, empty-state message) spec
    # per tab keeps the four identical render paths in a single loop.
    sections = [
        ("Runs", "Evaluation Runs", runs_df, "No run data available."),
        ("Turns", "Turns", turns_df, "No turn data available."),
        ("AI Script Evals", "AI Script Evaluations", ai_script_evals_df,
         "No AI script evaluation data available."),
        ("Journey Evals", "Journey Evaluations", journey_evals_df,
         "No journey evaluation data available."),
    ]
    overview_tabs = st.tabs([title for title, _, _, _ in sections])
    for tab, (_, subheader, df, empty_msg) in zip(overview_tabs, sections):
        with tab:
            st.subheader(subheader)
            if df.empty:
                st.info(empty_msg)
            else:
                st.dataframe(df)
def display_documentation():
    """Render the collapsible help/documentation section of the dashboard."""
    # Static help text; bound to a name so the render call stays one line.
    doc_markdown = """
## Evaluation Dashboard Documentation
This dashboard allows you to explore and visualize evaluation data from AI runs.
### Query Types
1. **Plot Evaluation**: View metrics for a specific plot
2. **Journey Evaluation**: View metrics for a specific journey, optionally grouped by plots or threads
3. **AI Script Evaluation**: View metrics for a specific AI script, optionally grouped by plots or threads
4. **Shared Evaluations**: View metrics for evaluations that are shared across all runs
### Filters
- **Aggregation Type**: Choose how to aggregate metric scores (mean, median, etc.)
- **Filter by Last N Days**: Only include evaluations from the last N days
- **Runtime Evaluations Only**: Only include evaluations that were run during runtime (thread_id is not null)
When this option is selected, results are grouped by thread ID instead of plot
- **Chart Type**: Choose between radar charts and bar charts
### Data Overview
The Data Overview section shows the raw data in tabular format.
"""
    with st.expander("Documentation"):
        st.markdown(doc_markdown)