import streamlit as st

from utils.visualization import create_radar_chart, create_bar_chart


def _display_group_header(group_name, group_by_thread):
    """Render the subheader for one result group (a thread or a plot).

    Args:
        group_name (str): Thread ID or plot name used as the group key.
        group_by_thread (bool): True when results are grouped by thread,
            False when grouped by plot.
    """
    label = "Thread" if group_by_thread else "Plot"
    st.subheader(f"{label}: {group_name}")


def _render_chart_tabs(evals, chart_type, key_prefix, empty_message):
    """Render one Streamlit tab per eval name, each containing a metrics chart.

    Args:
        evals (dict): Mapping of eval name -> metrics. May be empty.
        chart_type (str): 'radar' for a radar chart; any other value yields
            a bar chart.
        key_prefix (str): Prefix for the `st.plotly_chart` element key; keeps
            keys unique when the same eval name appears in several groups.
        empty_message (str): Info message shown when *evals* is empty.
    """
    eval_names = list(evals.keys())
    if not eval_names:
        st.info(empty_message)
        return

    for tab, eval_name in zip(st.tabs(eval_names), eval_names):
        with tab:
            metrics = evals[eval_name]
            if chart_type == 'radar':
                chart = create_radar_chart(metrics, f"{eval_name}")
            else:
                chart = create_bar_chart(metrics, f"{eval_name}")
            # Unique key avoids Streamlit duplicate-element errors when the
            # same eval name is charted in multiple groups on one page.
            st.plotly_chart(chart, use_container_width=True,
                            key=f"{key_prefix}{eval_name}")


def display_eval_results(eval_results, chart_type, group_by_thread=False):
    """
    Display evaluation results with tabs and charts.

    Supports four result shapes, dispatched in order:
      1. Grouped by thread/plot, each group holding both ``journeyEvals``
         and ``aiScriptEvals`` sub-dicts.
      2. Grouped by thread/plot, each group mapping eval name -> metrics
         directly (AI script evals only).
      3. Flat dict with top-level ``journeyEvals`` and ``aiScriptEvals``.
      4. Flat dict mapping eval name -> metrics directly.

    Args:
        eval_results (dict): Evaluation results in one of the shapes above,
            or a dict with a "message" key to surface as a warning.
        chart_type (str): Type of chart to create ('radar' or 'bar').
        group_by_thread (bool): Whether results are grouped by thread
            instead of plot (affects section headers only).
    """
    if not eval_results:
        st.info("Select filters to view evaluation results.")
        return

    if "message" in eval_results:
        st.warning(eval_results["message"])
        return

    # Display raw data in a collapsible section
    with st.expander("View Raw Data"):
        st.json(eval_results)

    is_dict = isinstance(eval_results, dict)

    # Case 1: results grouped by thread/plot with journeyEvals and aiScriptEvals
    if is_dict and all(
        isinstance(val, dict) and "journeyEvals" in val and "aiScriptEvals" in val
        for val in eval_results.values()
    ):
        for group_name, group_data in eval_results.items():
            _display_group_header(group_name, group_by_thread)

            journey_tab, aiscript_tab = st.tabs(
                ["Journey Evaluations", "AI Script Evaluations"])
            with journey_tab:
                _render_chart_tabs(
                    group_data.get("journeyEvals") or {},
                    chart_type,
                    f"journey_{group_name}_",
                    "No journey evaluation data available for this group.")
            with aiscript_tab:
                _render_chart_tabs(
                    group_data.get("aiScriptEvals") or {},
                    chart_type,
                    f"aiscript_{group_name}_",
                    "No AI script evaluation data available for this group.")

            # Separator between plots/threads
            st.markdown("---")

    # Case 2: AI script evaluation grouped by thread/plot with nested eval
    # structure (no journeyEvals/aiScriptEvals keys inside the groups)
    elif is_dict and all(
        isinstance(val, dict)
        and not ("journeyEvals" in val or "aiScriptEvals" in val)
        for val in eval_results.values()
    ):
        for group_name, group_data in eval_results.items():
            _display_group_header(group_name, group_by_thread)
            _render_chart_tabs(
                group_data,
                chart_type,
                f"aiscript_{group_name}_",
                "No AI script evaluation metrics available for this group.")
            # Separator between groups
            st.markdown("---")

    # Case 3: flat structure with top-level journeyEvals/aiScriptEvals
    # (e.g. journey evaluation with group_by_plots=False)
    elif is_dict and "journeyEvals" in eval_results and "aiScriptEvals" in eval_results:
        journey_tab, aiscript_tab = st.tabs(
            ["Journey Evaluations", "AI Script Evaluations"])
        with journey_tab:
            _render_chart_tabs(
                eval_results["journeyEvals"] or {},
                chart_type,
                "journey_",
                "No journey evaluation data available.")
        with aiscript_tab:
            _render_chart_tabs(
                eval_results["aiScriptEvals"] or {},
                chart_type,
                "aiscript_",
                "No AI script evaluation data available.")

    # Case 4: flat AI script evaluation (eval name -> metrics directly)
    elif is_dict and not ("journeyEvals" in eval_results
                          or "aiScriptEvals" in eval_results):
        _render_chart_tabs(
            eval_results,
            chart_type,
            "aiscript_direct_",
            "No AI script evaluation metrics available.")

    else:
        st.error("Unrecognized result format")


def display_data_overview(runs_df, turns_df, ai_script_evals_df, journey_evals_df):
    """
    Display data overview tabs.

    Args:
        runs_df (pd.DataFrame): Runs data
        turns_df (pd.DataFrame): Turns data
        ai_script_evals_df (pd.DataFrame): AI script evaluations data
        journey_evals_df (pd.DataFrame): Journey evaluations data
    """
    st.header("Data Overview")

    # Check if any data is available
    if (runs_df.empty and turns_df.empty
            and ai_script_evals_df.empty and journey_evals_df.empty):
        st.warning(
            "No data available. Please check your connection to AWS Athena "
            "and ensure that the database and tables exist.")

        # Add some troubleshooting tips
        with st.expander("Troubleshooting Tips"):
            st.markdown("""
            ### Troubleshooting Steps:

            1. **AWS SSO Authentication**: Make sure you've run `aws sso login --profile your_profile` before starting the dashboard
            2. **AWS Region**: Verify that the region in your `.env` file matches the region where your Athena database is located
            3. **Athena Database and Tables**: Confirm that the database name and table names in your `.env` file are correct
            4. **AWS Permissions**: Ensure your AWS role has permissions to query Athena and access the S3 bucket for query results
            5. **Network Connectivity**: Check that you have network connectivity to AWS services
            6. **Check Logs**: Look at the application logs for more detailed error messages
            """)
        return

    # One (tab label, section title, dataframe, empty message) entry per tab;
    # the order here must match the st.tabs(...) label order below.
    sections = [
        ("Evaluation Runs", runs_df, "No run data available."),
        ("Turns", turns_df, "No turn data available."),
        ("AI Script Evaluations", ai_script_evals_df,
         "No AI script evaluation data available."),
        ("Journey Evaluations", journey_evals_df,
         "No journey evaluation data available."),
    ]

    overview_tabs = st.tabs(["Runs", "Turns", "AI Script Evals", "Journey Evals"])
    for tab, (title, df, empty_message) in zip(overview_tabs, sections):
        with tab:
            st.subheader(title)
            if df.empty:
                st.info(empty_message)
            else:
                st.dataframe(df)


def display_documentation():
    """
    Display documentation section.
    """
    with st.expander("Documentation"):
        st.markdown("""
        ## Evaluation Dashboard Documentation

        This dashboard allows you to explore and visualize evaluation data from AI runs.

        ### Query Types

        1. **Plot Evaluation**: View metrics for a specific plot
        2. **Journey Evaluation**: View metrics for a specific journey, optionally grouped by plots or threads
        3. **AI Script Evaluation**: View metrics for a specific AI script, optionally grouped by plots or threads
        4. **Shared Evaluations**: View metrics for evaluations that are shared across all runs

        ### Filters

        - **Aggregation Type**: Choose how to aggregate metric scores (mean, median, etc.)
        - **Filter by Last N Days**: Only include evaluations from the last N days
        - **Runtime Evaluations Only**: Only include evaluations that were run during runtime (thread_id is not null)
          When this option is selected, results are grouped by thread ID instead of plot
        - **Chart Type**: Choose between radar charts and bar charts

        ### Data Overview

        The Data Overview section shows the raw data in tabular format.
        """)