File size: 13,756 Bytes
0847744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import streamlit as st
from utils.visualization import create_radar_chart, create_bar_chart

def _render_chart(metrics, eval_name, chart_type, key):
    """Render a single radar or bar chart for one evaluation's metrics.

    Args:
        metrics: Metric data accepted by create_radar_chart / create_bar_chart
        eval_name (str): Evaluation name, used as the chart title
        chart_type (str): 'radar' for a radar chart, anything else for a bar chart
        key (str): Unique Streamlit widget key for the chart
    """
    if chart_type == 'radar':
        chart = create_radar_chart(metrics, f"{eval_name}")
    else:
        chart = create_bar_chart(metrics, f"{eval_name}")
    # Unique key prevents Streamlit duplicate-element errors across groups
    st.plotly_chart(chart, use_container_width=True, key=key)


def _render_eval_tabs(evals, chart_type, key_prefix, empty_msg):
    """Render one tab per evaluation name, each containing its chart.

    Args:
        evals (dict): Mapping of eval name -> metrics
        chart_type (str): Chart type forwarded to _render_chart
        key_prefix (str): Prefix prepended to each eval name to form the chart key
        empty_msg (str): Info message shown when evals has no entries
    """
    eval_names = list(evals.keys())
    if not eval_names:
        st.info(empty_msg)
        return
    tabs = st.tabs(eval_names)
    for tab, eval_name in zip(tabs, eval_names):
        with tab:
            _render_chart(evals[eval_name], eval_name, chart_type,
                          f"{key_prefix}{eval_name}")


def _render_group_header(group_name, group_by_thread):
    """Render the subheader for one result group (thread or plot)."""
    if group_by_thread:
        st.subheader(f"Thread: {group_name}")
    else:
        st.subheader(f"Plot: {group_name}")


def display_eval_results(eval_results, chart_type, group_by_thread=False):
    """
    Display evaluation results with tabs and charts

    Args:
        eval_results (dict): Evaluation results
        chart_type (str): Type of chart to create ('radar' or 'bar')
        group_by_thread (bool): Whether results are grouped by thread instead of plot
    """
    if not eval_results:
        st.info("Select filters to view evaluation results.")
        return

    if "message" in eval_results:
        st.warning(eval_results["message"])
        return

    # Display raw data in a collapsible section
    with st.expander("View Raw Data"):
        st.json(eval_results)

    # Case 1: Results grouped by thread/plot with journeyEvals and aiScriptEvals
    if isinstance(eval_results, dict) and all(isinstance(val, dict) and "journeyEvals" in val and "aiScriptEvals" in val for val in eval_results.values()):
        # For each thread/plot, create a section with tabs for journey evals and ai script evals
        for group_name, group_data in eval_results.items():
            _render_group_header(group_name, group_by_thread)

            journey_tab, aiscript_tab = st.tabs(["Journey Evaluations", "AI Script Evaluations"])

            with journey_tab:
                if group_data.get("journeyEvals"):
                    _render_eval_tabs(
                        group_data["journeyEvals"], chart_type,
                        f"journey_{group_name}_",
                        "No journey evaluation metrics available for this group.")
                else:
                    st.info("No journey evaluation data available for this group.")

            with aiscript_tab:
                if group_data.get("aiScriptEvals"):
                    _render_eval_tabs(
                        group_data["aiScriptEvals"], chart_type,
                        f"aiscript_{group_name}_",
                        "No AI script evaluation metrics available for this group.")
                else:
                    st.info("No AI script evaluation data available for this group.")

            # Add a separator between plots/threads
            st.markdown("---")

    # Case 2: AI Script Evaluation grouped by thread/plot with nested eval structure
    # NOTE(review): this branch is structurally ambiguous with Case 4 — a flat
    # {eval_name: metrics_dict} result whose metrics are dicts would match here
    # too. Preserved as-is; confirm against the query layer's actual shapes.
    elif isinstance(eval_results, dict) and all(isinstance(val, dict) and not ("journeyEvals" in val or "aiScriptEvals" in val) for val in eval_results.values()):
        for group_name, group_data in eval_results.items():
            _render_group_header(group_name, group_by_thread)
            _render_eval_tabs(
                group_data, chart_type, f"aiscript_{group_name}_",
                "No AI script evaluation metrics available for this group.")
            # Add a separator between groups
            st.markdown("---")

    # Case 3: Journey Evaluation with group_by_plots=False or other query types with flat structure
    elif isinstance(eval_results, dict) and "journeyEvals" in eval_results and "aiScriptEvals" in eval_results:
        journey_tab, aiscript_tab = st.tabs(["Journey Evaluations", "AI Script Evaluations"])

        with journey_tab:
            if eval_results["journeyEvals"]:
                _render_eval_tabs(
                    eval_results["journeyEvals"], chart_type, "journey_",
                    "No journey evaluation metrics available.")
            else:
                st.info("No journey evaluation data available.")

        with aiscript_tab:
            if eval_results["aiScriptEvals"]:
                _render_eval_tabs(
                    eval_results["aiScriptEvals"], chart_type, "aiscript_",
                    "No AI script evaluation metrics available.")
            else:
                st.info("No AI script evaluation data available.")

    # Case 4: AI Script Evaluation with group_by_plots=False
    elif isinstance(eval_results, dict) and not ("journeyEvals" in eval_results or "aiScriptEvals" in eval_results):
        _render_eval_tabs(
            eval_results, chart_type, "aiscript_direct_",
            "No AI script evaluation metrics available.")

    else:
        st.error("Unrecognized result format")

def display_data_overview(runs_df, turns_df, ai_script_evals_df, journey_evals_df):
    """
    Display data overview tabs

    Args:
        runs_df (pd.DataFrame): Runs data
        turns_df (pd.DataFrame): Turns data
        ai_script_evals_df (pd.DataFrame): AI script evaluations data
        journey_evals_df (pd.DataFrame): Journey evaluations data
    """
    st.header("Data Overview")

    # (tab label, subheader, dataframe, empty-state message) — one row per dataset,
    # in tab order. Drives both the all-empty check and the tab rendering loop.
    sections = [
        ("Runs", "Evaluation Runs", runs_df, "No run data available."),
        ("Turns", "Turns", turns_df, "No turn data available."),
        ("AI Script Evals", "AI Script Evaluations", ai_script_evals_df,
         "No AI script evaluation data available."),
        ("Journey Evals", "Journey Evaluations", journey_evals_df,
         "No journey evaluation data available."),
    ]

    # Check if any data is available
    if all(df.empty for _, _, df, _ in sections):
        st.warning("No data available. Please check your connection to AWS Athena and ensure that the database and tables exist.")
        # Add some troubleshooting tips
        with st.expander("Troubleshooting Tips"):
            st.markdown("""
            ### Troubleshooting Steps:
            
            1. **AWS SSO Authentication**: Make sure you've run `aws sso login --profile your_profile` before starting the dashboard
            
            2. **AWS Region**: Verify that the region in your `.env` file matches the region where your Athena database is located
            
            3. **Athena Database and Tables**: Confirm that the database name and table names in your `.env` file are correct
            
            4. **AWS Permissions**: Ensure your AWS role has permissions to query Athena and access the S3 bucket for query results
            
            5. **Network Connectivity**: Check that you have network connectivity to AWS services
            
            6. **Check Logs**: Look at the application logs for more detailed error messages
            """)
        return

    # Display tabs with data
    overview_tabs = st.tabs([label for label, _, _, _ in sections])
    for tab, (_, subheader, df, empty_msg) in zip(overview_tabs, sections):
        with tab:
            st.subheader(subheader)
            if df.empty:
                st.info(empty_msg)
            else:
                st.dataframe(df)

def display_documentation():
    """Render the collapsible 'Documentation' section of the dashboard."""
    # Markdown body kept in a named local so the render call below stays short.
    doc_text = """
        ## Evaluation Dashboard Documentation
        
        This dashboard allows you to explore and visualize evaluation data from AI runs.
        
        ### Query Types
        
        1. **Plot Evaluation**: View metrics for a specific plot
        2. **Journey Evaluation**: View metrics for a specific journey, optionally grouped by plots or threads
        3. **AI Script Evaluation**: View metrics for a specific AI script, optionally grouped by plots or threads
        4. **Shared Evaluations**: View metrics for evaluations that are shared across all runs
        
        ### Filters
        
        - **Aggregation Type**: Choose how to aggregate metric scores (mean, median, etc.)
        - **Filter by Last N Days**: Only include evaluations from the last N days
        - **Runtime Evaluations Only**: Only include evaluations that were run during runtime (thread_id is not null)
           When this option is selected, results are grouped by thread ID instead of plot
        - **Chart Type**: Choose between radar charts and bar charts
        
        ### Data Overview
        
        The Data Overview section shows the raw data in tabular format.
        """
    with st.expander("Documentation"):
        st.markdown(doc_text)