Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from utils.visualization import create_radar_chart, create_bar_chart | |
def _render_eval_tabs(evals, chart_type, key_prefix, empty_message):
    """
    Render one Streamlit tab per evaluation name, each holding a single chart.

    Args:
        evals (dict): Mapping of eval name -> metrics payload for the chart helpers
        chart_type (str): 'radar' for a radar chart, anything else for a bar chart
        key_prefix (str): Prefix for the plotly chart widget key; combined with the
            eval name so every chart on the page has a unique Streamlit key
        empty_message (str): Info message shown when `evals` has no entries
    """
    eval_names = list(evals.keys())
    if not eval_names:
        st.info(empty_message)
        return
    tabs = st.tabs(eval_names)
    for tab, eval_name in zip(tabs, eval_names):
        with tab:
            metrics = evals[eval_name]
            if chart_type == 'radar':
                chart = create_radar_chart(metrics, eval_name)
            else:
                chart = create_bar_chart(metrics, eval_name)
            # Unique key avoids Streamlit duplicate-element-ID errors when the
            # same eval name appears under multiple groups on one page.
            st.plotly_chart(chart, use_container_width=True, key=f"{key_prefix}_{eval_name}")


def display_eval_results(eval_results, chart_type, group_by_thread=False):
    """
    Display evaluation results with tabs and charts.

    Handles four result shapes:
      1. dict of group -> {"journeyEvals": ..., "aiScriptEvals": ...}
      2. dict of group -> flat {eval name -> metrics} (AI script evals only)
      3. flat {"journeyEvals": ..., "aiScriptEvals": ...}
      4. flat {eval name -> metrics}

    Args:
        eval_results (dict): Evaluation results
        chart_type (str): Type of chart to create ('radar' or 'bar')
        group_by_thread (bool): Whether results are grouped by thread instead of plot
    """
    if not eval_results:
        st.info("Select filters to view evaluation results.")
        return
    if "message" in eval_results:
        st.warning(eval_results["message"])
        return

    # Display raw data in a collapsible section
    with st.expander("View Raw Data"):
        st.json(eval_results)

    # Case 1: Results grouped by thread/plot with journeyEvals and aiScriptEvals
    if isinstance(eval_results, dict) and all(isinstance(val, dict) and "journeyEvals" in val and "aiScriptEvals" in val for val in eval_results.values()):
        for group_name, group_data in eval_results.items():
            # Header reflects the grouping dimension chosen by the caller
            if group_by_thread:
                st.subheader(f"Thread: {group_name}")
            else:
                st.subheader(f"Plot: {group_name}")
            journey_tab, aiscript_tab = st.tabs(["Journey Evaluations", "AI Script Evaluations"])
            with journey_tab:
                if group_data.get("journeyEvals"):
                    _render_eval_tabs(
                        group_data["journeyEvals"], chart_type,
                        f"journey_{group_name}",
                        "No journey evaluation metrics available for this group.",
                    )
                else:
                    st.info("No journey evaluation data available for this group.")
            with aiscript_tab:
                if group_data.get("aiScriptEvals"):
                    _render_eval_tabs(
                        group_data["aiScriptEvals"], chart_type,
                        f"aiscript_{group_name}",
                        "No AI script evaluation metrics available for this group.",
                    )
                else:
                    st.info("No AI script evaluation data available for this group.")
            # Separator between plots/threads
            st.markdown("---")

    # Case 2: AI Script Evaluation grouped by thread/plot with nested eval structure
    elif isinstance(eval_results, dict) and all(isinstance(val, dict) and not ("journeyEvals" in val or "aiScriptEvals" in val) for val in eval_results.values()):
        for group_name, group_data in eval_results.items():
            if group_by_thread:
                st.subheader(f"Thread: {group_name}")
            else:
                st.subheader(f"Plot: {group_name}")
            _render_eval_tabs(
                group_data, chart_type,
                f"aiscript_{group_name}",
                "No AI script evaluation metrics available for this group.",
            )
            # Separator between groups
            st.markdown("---")

    # Case 3: Journey Evaluation with group_by_plots=False or other query types with flat structure
    elif isinstance(eval_results, dict) and "journeyEvals" in eval_results and "aiScriptEvals" in eval_results:
        journey_tab, aiscript_tab = st.tabs(["Journey Evaluations", "AI Script Evaluations"])
        with journey_tab:
            if eval_results["journeyEvals"]:
                _render_eval_tabs(
                    eval_results["journeyEvals"], chart_type,
                    "journey",
                    "No journey evaluation metrics available.",
                )
            else:
                st.info("No journey evaluation data available.")
        with aiscript_tab:
            if eval_results["aiScriptEvals"]:
                _render_eval_tabs(
                    eval_results["aiScriptEvals"], chart_type,
                    "aiscript",
                    "No AI script evaluation metrics available.",
                )
            else:
                st.info("No AI script evaluation data available.")

    # Case 4: AI Script Evaluation with group_by_plots=False
    elif isinstance(eval_results, dict) and not ("journeyEvals" in eval_results or "aiScriptEvals" in eval_results):
        _render_eval_tabs(
            eval_results, chart_type,
            "aiscript_direct",
            "No AI script evaluation metrics available.",
        )

    else:
        st.error("Unrecognized result format")
def display_data_overview(runs_df, turns_df, ai_script_evals_df, journey_evals_df):
    """
    Render the Data Overview section: one tab of raw tabular data per dataset.

    Args:
        runs_df (pd.DataFrame): Runs data
        turns_df (pd.DataFrame): Turns data
        ai_script_evals_df (pd.DataFrame): AI script evaluations data
        journey_evals_df (pd.DataFrame): Journey evaluations data
    """
    st.header("Data Overview")

    all_empty = (
        runs_df.empty
        and turns_df.empty
        and ai_script_evals_df.empty
        and journey_evals_df.empty
    )
    if all_empty:
        st.warning("No data available. Please check your connection to AWS Athena and ensure that the database and tables exist.")
        # Surface common setup problems so users can self-diagnose
        with st.expander("Troubleshooting Tips"):
            st.markdown("""
            ### Troubleshooting Steps:
            1. **AWS SSO Authentication**: Make sure you've run `aws sso login --profile your_profile` before starting the dashboard
            2. **AWS Region**: Verify that the region in your `.env` file matches the region where your Athena database is located
            3. **Athena Database and Tables**: Confirm that the database name and table names in your `.env` file are correct
            4. **AWS Permissions**: Ensure your AWS role has permissions to query Athena and access the S3 bucket for query results
            5. **Network Connectivity**: Check that you have network connectivity to AWS services
            6. **Check Logs**: Look at the application logs for more detailed error messages
            """)
        return

    # (tab label, section subheader, dataframe, message shown when empty)
    sections = [
        ("Runs", "Evaluation Runs", runs_df, "No run data available."),
        ("Turns", "Turns", turns_df, "No turn data available."),
        ("AI Script Evals", "AI Script Evaluations", ai_script_evals_df, "No AI script evaluation data available."),
        ("Journey Evals", "Journey Evaluations", journey_evals_df, "No journey evaluation data available."),
    ]
    overview_tabs = st.tabs([label for label, _, _, _ in sections])
    for tab, (_, subheader, frame, empty_msg) in zip(overview_tabs, sections):
        with tab:
            st.subheader(subheader)
            if frame.empty:
                st.info(empty_msg)
            else:
                st.dataframe(frame)
def display_documentation():
    """Show the collapsible documentation panel for the dashboard."""
    doc_text = """
    ## Evaluation Dashboard Documentation
    This dashboard allows you to explore and visualize evaluation data from AI runs.
    ### Query Types
    1. **Plot Evaluation**: View metrics for a specific plot
    2. **Journey Evaluation**: View metrics for a specific journey, optionally grouped by plots or threads
    3. **AI Script Evaluation**: View metrics for a specific AI script, optionally grouped by plots or threads
    4. **Shared Evaluations**: View metrics for evaluations that are shared across all runs
    ### Filters
    - **Aggregation Type**: Choose how to aggregate metric scores (mean, median, etc.)
    - **Filter by Last N Days**: Only include evaluations from the last N days
    - **Runtime Evaluations Only**: Only include evaluations that were run during runtime (thread_id is not null)
    When this option is selected, results are grouped by thread ID instead of plot
    - **Chart Type**: Choose between radar charts and bar charts
    ### Data Overview
    The Data Overview section shows the raw data in tabular format.
    """
    with st.expander("Documentation"):
        st.markdown(doc_text)