File size: 13,756 Bytes
0847744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import streamlit as st
from utils.visualization import create_radar_chart, create_bar_chart

def _render_chart(metrics, eval_name, chart_type, key):
    """Render a single radar or bar chart for one evaluation's metrics.

    Args:
        metrics: Metric data accepted by create_radar_chart / create_bar_chart
        eval_name (str): Evaluation name, used as the chart title
        chart_type (str): 'radar' for a radar chart, anything else for a bar chart
        key (str): Unique Streamlit widget key for the chart
    """
    if chart_type == 'radar':
        chart = create_radar_chart(metrics, f"{eval_name}")
    else:
        chart = create_bar_chart(metrics, f"{eval_name}")
    # Unique key prevents Streamlit duplicate-element errors across groups
    st.plotly_chart(chart, use_container_width=True, key=key)


def _render_eval_tabs(evals, chart_type, key_prefix, empty_msg):
    """Render one tab per evaluation name, each containing its chart.

    Args:
        evals (dict): Mapping of eval name -> metrics
        chart_type (str): Chart type forwarded to _render_chart
        key_prefix (str): Prefix prepended to each eval name to form the chart key
        empty_msg (str): Info message shown when evals has no entries
    """
    eval_names = list(evals.keys())
    if not eval_names:
        st.info(empty_msg)
        return
    tabs = st.tabs(eval_names)
    for tab, eval_name in zip(tabs, eval_names):
        with tab:
            _render_chart(evals[eval_name], eval_name, chart_type,
                          f"{key_prefix}{eval_name}")


def _render_group_header(group_name, group_by_thread):
    """Render the subheader for one result group (thread or plot)."""
    if group_by_thread:
        st.subheader(f"Thread: {group_name}")
    else:
        st.subheader(f"Plot: {group_name}")


def display_eval_results(eval_results, chart_type, group_by_thread=False):
    """
    Display evaluation results with tabs and charts

    Args:
        eval_results (dict): Evaluation results
        chart_type (str): Type of chart to create ('radar' or 'bar')
        group_by_thread (bool): Whether results are grouped by thread instead of plot
    """
    if not eval_results:
        st.info("Select filters to view evaluation results.")
        return

    if "message" in eval_results:
        st.warning(eval_results["message"])
        return

    # Display raw data in a collapsible section
    with st.expander("View Raw Data"):
        st.json(eval_results)

    # Case 1: Results grouped by thread/plot with journeyEvals and aiScriptEvals
    if isinstance(eval_results, dict) and all(isinstance(val, dict) and "journeyEvals" in val and "aiScriptEvals" in val for val in eval_results.values()):
        # For each thread/plot, create a section with tabs for journey evals and ai script evals
        for group_name, group_data in eval_results.items():
            _render_group_header(group_name, group_by_thread)

            journey_tab, aiscript_tab = st.tabs(["Journey Evaluations", "AI Script Evaluations"])

            with journey_tab:
                if group_data.get("journeyEvals"):
                    _render_eval_tabs(
                        group_data["journeyEvals"], chart_type,
                        f"journey_{group_name}_",
                        "No journey evaluation metrics available for this group.")
                else:
                    st.info("No journey evaluation data available for this group.")

            with aiscript_tab:
                if group_data.get("aiScriptEvals"):
                    _render_eval_tabs(
                        group_data["aiScriptEvals"], chart_type,
                        f"aiscript_{group_name}_",
                        "No AI script evaluation metrics available for this group.")
                else:
                    st.info("No AI script evaluation data available for this group.")

            # Add a separator between plots/threads
            st.markdown("---")

    # Case 2: AI Script Evaluation grouped by thread/plot with nested eval structure
    # NOTE(review): this branch is structurally ambiguous with Case 4 — a flat
    # {eval_name: metrics_dict} result whose metrics are dicts would match here
    # too. Preserved as-is; confirm against the query layer's actual shapes.
    elif isinstance(eval_results, dict) and all(isinstance(val, dict) and not ("journeyEvals" in val or "aiScriptEvals" in val) for val in eval_results.values()):
        for group_name, group_data in eval_results.items():
            _render_group_header(group_name, group_by_thread)
            _render_eval_tabs(
                group_data, chart_type, f"aiscript_{group_name}_",
                "No AI script evaluation metrics available for this group.")
            # Add a separator between groups
            st.markdown("---")

    # Case 3: Journey Evaluation with group_by_plots=False or other query types with flat structure
    elif isinstance(eval_results, dict) and "journeyEvals" in eval_results and "aiScriptEvals" in eval_results:
        journey_tab, aiscript_tab = st.tabs(["Journey Evaluations", "AI Script Evaluations"])

        with journey_tab:
            if eval_results["journeyEvals"]:
                _render_eval_tabs(
                    eval_results["journeyEvals"], chart_type, "journey_",
                    "No journey evaluation metrics available.")
            else:
                st.info("No journey evaluation data available.")

        with aiscript_tab:
            if eval_results["aiScriptEvals"]:
                _render_eval_tabs(
                    eval_results["aiScriptEvals"], chart_type, "aiscript_",
                    "No AI script evaluation metrics available.")
            else:
                st.info("No AI script evaluation data available.")

    # Case 4: AI Script Evaluation with group_by_plots=False
    elif isinstance(eval_results, dict) and not ("journeyEvals" in eval_results or "aiScriptEvals" in eval_results):
        _render_eval_tabs(
            eval_results, chart_type, "aiscript_direct_",
            "No AI script evaluation metrics available.")

    else:
        st.error("Unrecognized result format")

def display_data_overview(runs_df, turns_df, ai_script_evals_df, journey_evals_df):
    """
    Display data overview tabs

    Args:
        runs_df (pd.DataFrame): Runs data
        turns_df (pd.DataFrame): Turns data
        ai_script_evals_df (pd.DataFrame): AI script evaluations data
        journey_evals_df (pd.DataFrame): Journey evaluations data
    """
    st.header("Data Overview")

    # (tab label, subheader, dataframe, empty-state message) — one row per dataset,
    # in tab order. Drives both the all-empty check and the tab rendering loop.
    sections = [
        ("Runs", "Evaluation Runs", runs_df, "No run data available."),
        ("Turns", "Turns", turns_df, "No turn data available."),
        ("AI Script Evals", "AI Script Evaluations", ai_script_evals_df,
         "No AI script evaluation data available."),
        ("Journey Evals", "Journey Evaluations", journey_evals_df,
         "No journey evaluation data available."),
    ]

    # Check if any data is available
    if all(df.empty for _, _, df, _ in sections):
        st.warning("No data available. Please check your connection to AWS Athena and ensure that the database and tables exist.")
        # Add some troubleshooting tips
        with st.expander("Troubleshooting Tips"):
            st.markdown("""
            ### Troubleshooting Steps:
            
            1. **AWS SSO Authentication**: Make sure you've run `aws sso login --profile your_profile` before starting the dashboard
            
            2. **AWS Region**: Verify that the region in your `.env` file matches the region where your Athena database is located
            
            3. **Athena Database and Tables**: Confirm that the database name and table names in your `.env` file are correct
            
            4. **AWS Permissions**: Ensure your AWS role has permissions to query Athena and access the S3 bucket for query results
            
            5. **Network Connectivity**: Check that you have network connectivity to AWS services
            
            6. **Check Logs**: Look at the application logs for more detailed error messages
            """)
        return

    # Display tabs with data
    overview_tabs = st.tabs([label for label, _, _, _ in sections])
    for tab, (_, subheader, df, empty_msg) in zip(overview_tabs, sections):
        with tab:
            st.subheader(subheader)
            if df.empty:
                st.info(empty_msg)
            else:
                st.dataframe(df)

def display_documentation():
    """Render the collapsible 'Documentation' section of the dashboard."""
    # Markdown body kept in a named local so the render call below stays short.
    doc_text = """
        ## Evaluation Dashboard Documentation
        
        This dashboard allows you to explore and visualize evaluation data from AI runs.
        
        ### Query Types
        
        1. **Plot Evaluation**: View metrics for a specific plot
        2. **Journey Evaluation**: View metrics for a specific journey, optionally grouped by plots or threads
        3. **AI Script Evaluation**: View metrics for a specific AI script, optionally grouped by plots or threads
        4. **Shared Evaluations**: View metrics for evaluations that are shared across all runs
        
        ### Filters
        
        - **Aggregation Type**: Choose how to aggregate metric scores (mean, median, etc.)
        - **Filter by Last N Days**: Only include evaluations from the last N days
        - **Runtime Evaluations Only**: Only include evaluations that were run during runtime (thread_id is not null)
           When this option is selected, results are grouped by thread ID instead of plot
        - **Chart Type**: Choose between radar charts and bar charts
        
        ### Data Overview
        
        The Data Overview section shows the raw data in tabular format.
        """
    with st.expander("Documentation"):
        st.markdown(doc_text)