saptyfun committed on
Commit
2e8352c
·
verified ·
1 Parent(s): 1350f34

Upload streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +617 -0
streamlit_app.py ADDED
@@ -0,0 +1,617 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simplified Hugging Face Spaces compatible Multi-Agent System Dashboard
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import tempfile
9
+ import sqlite3
10
+ from pathlib import Path
11
+ import warnings
12
+ from datetime import datetime, timedelta
13
+ import random
14
+
15
+ # Suppress warnings
16
+ warnings.filterwarnings('ignore')
17
+
18
+ # Set environment variables for Hugging Face Spaces
19
+ os.environ['STREAMLIT_SERVER_HEADLESS'] = 'true'
20
+ os.environ['STREAMLIT_SERVER_PORT'] = '7860'
21
+ os.environ['STREAMLIT_BROWSER_GATHER_USAGE_STATS'] = 'false'
22
+
23
+ # Import streamlit first and set page config
24
+ import streamlit as st
25
+
26
+ st.set_page_config(
27
+ page_title="πŸ€– Multi-Agent System Dashboard",
28
+ page_icon="πŸ€–",
29
+ layout="wide",
30
+ initial_sidebar_state="expanded"
31
+ )
32
+
33
+ # Import other required modules
34
+ import pandas as pd
35
+ import plotly.express as px
36
+ import plotly.graph_objects as go
37
+ import json
38
+ import numpy as np
39
+ from typing import Dict, List, Any
40
+
41
+ class SimpleDashboard:
42
+ def __init__(self):
43
+ # Use temp directory for database
44
+ temp_dir = tempfile.gettempdir()
45
+ self.db_path = os.path.join(temp_dir, "evaluation_logs.db")
46
+ try:
47
+ self.setup_demo_data()
48
+ except Exception as e:
49
+ st.error(f"Setup error: {str(e)}")
50
+ self.create_fallback_data()
51
+
52
+ def create_fallback_data(self):
53
+ """Create fallback data if database fails"""
54
+ st.warning("Using fallback demo data")
55
+
56
+ # Create sample data directly
57
+ agents = ["Diet Agent", "Support Agent", "Queries Agent"]
58
+ data = []
59
+
60
+ for i in range(50):
61
+ base_score = random.uniform(7.0, 9.5)
62
+ accuracy = random.uniform(7.0, 9.5)
63
+ data.append({
64
+ 'id': i,
65
+ 'session_id': f"session_{random.randint(1000, 9999)}",
66
+ 'agent_name': random.choice(agents),
67
+ 'query': f"Sample query {i}",
68
+ 'response': f"Sample response {i} with detailed information and comprehensive guidance...",
69
+ 'overall_score': base_score,
70
+ 'relevance_score': random.uniform(7.0, 9.5),
71
+ 'accuracy_score': accuracy,
72
+ 'completeness_score': random.uniform(7.0, 9.5),
73
+ 'coherence_score': random.uniform(7.0, 9.5),
74
+ 'hallucination_score': max(0, min(10, 10 - accuracy + random.uniform(-1.0, 1.0))),
75
+ 'guardrails_passed': True,
76
+ 'safety_score': random.uniform(8.0, 10.0),
77
+ 'execution_time_ms': random.uniform(500, 2000),
78
+ 'input_tokens': random.randint(20, 100),
79
+ 'output_tokens': random.randint(100, 500),
80
+ 'total_tokens': random.randint(120, 600),
81
+ 'cost_usd': random.uniform(0.001, 0.02),
82
+ 'llm_provider': random.choice(["azure", "openai", "anthropic"]),
83
+ 'model_name': 'gpt-4o',
84
+ 'timestamp': datetime.now() - timedelta(days=random.randint(0, 30))
85
+ })
86
+
87
+ self.fallback_df = pd.DataFrame(data)
88
+ self.use_fallback = True
89
+
90
+ def setup_demo_data(self):
91
+ """Setup demo database"""
92
+ self.use_fallback = False
93
+
94
+ if not os.path.exists(self.db_path):
95
+ self.create_demo_database()
96
+
97
+ def create_demo_database(self):
98
+ """Create demo database"""
99
+ conn = sqlite3.connect(self.db_path)
100
+ cursor = conn.cursor()
101
+
102
+ # Create table
103
+ cursor.execute('''
104
+ CREATE TABLE IF NOT EXISTS evaluation_logs (
105
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
106
+ session_id TEXT NOT NULL,
107
+ agent_name TEXT NOT NULL,
108
+ query TEXT NOT NULL,
109
+ response TEXT,
110
+ overall_score REAL,
111
+ relevance_score REAL,
112
+ accuracy_score REAL,
113
+ completeness_score REAL,
114
+ coherence_score REAL,
115
+ hallucination_score REAL,
116
+ guardrails_passed BOOLEAN,
117
+ safety_score REAL,
118
+ execution_time_ms REAL,
119
+ input_tokens INTEGER,
120
+ output_tokens INTEGER,
121
+ total_tokens INTEGER,
122
+ cost_usd REAL,
123
+ llm_provider TEXT,
124
+ model_name TEXT,
125
+ timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
126
+ )
127
+ ''')
128
+
129
+ # Insert demo data
130
+ agents = ["Diet Agent", "Support Agent", "Queries Agent"]
131
+
132
+ sample_queries = {
133
+ "Diet Agent": [
134
+ "What's a healthy meal plan for weight loss?",
135
+ "Can you suggest low-carb breakfast options?",
136
+ "What are the benefits of intermittent fasting?",
137
+ "How much protein should I eat daily?",
138
+ "What foods are good for heart health?"
139
+ ],
140
+ "Support Agent": [
141
+ "I'm having trouble sleeping, can you help?",
142
+ "How do I manage work stress?",
143
+ "I feel overwhelmed with my tasks",
144
+ "Can you help me organize my schedule?",
145
+ "How to improve my productivity?"
146
+ ],
147
+ "Queries Agent": [
148
+ "What are the latest developments in AI?",
149
+ "How does blockchain technology work?",
150
+ "What is quantum computing?",
151
+ "Explain machine learning algorithms",
152
+ "What are the benefits of cloud computing?"
153
+ ]
154
+ }
155
+
156
+ for i in range(100):
157
+ session_id = f"session_{random.randint(1000, 9999)}"
158
+ agent = random.choice(agents)
159
+ query = random.choice(sample_queries[agent])
160
+
161
+ # Generate comprehensive response
162
+ response_templates = {
163
+ "Diet Agent": [
164
+ "Thank you for your question about nutrition and dietary guidance. I'd be happy to help you develop a healthier relationship with food and create sustainable eating habits.",
165
+ "I understand you're looking for dietary advice, and I'm here to provide evidence-based nutritional guidance tailored to your specific needs and goals."
166
+ ],
167
+ "Support Agent": [
168
+ "I appreciate you reaching out for support. It takes courage to ask for help, and I'm here to provide you with practical strategies and emotional guidance.",
169
+ "Thank you for sharing your concerns with me. I understand this can be challenging, and I want to help you work through this step by step with compassion and understanding."
170
+ ],
171
+ "Queries Agent": [
172
+ "Excellent question! This is a fascinating topic that involves cutting-edge technology and has significant implications for our future. Let me provide you with a comprehensive overview.",
173
+ "Thank you for this thought-provoking question. This subject encompasses multiple disciplines and recent innovations. I'll break this down into key concepts and practical applications."
174
+ ]
175
+ }
176
+
177
+ base_response = random.choice(response_templates[agent])
178
+
179
+ # Add detailed information
180
+ if agent == "Diet Agent":
181
+ details = "**Key Nutritional Recommendations:**\n\n1. **Whole Foods Focus**: Prioritize unprocessed foods like fresh fruits, vegetables, whole grains, lean proteins, and healthy fats.\n\n2. **Portion Control**: Use the plate method - fill half your plate with non-starchy vegetables, one quarter with lean protein, and one quarter with complex carbohydrates.\n\n3. **Hydration**: Aim for 8-10 glasses of water daily to support metabolism and overall health."
182
+ elif agent == "Support Agent":
183
+ details = "**Comprehensive Support Strategy:**\n\n**Immediate Coping Techniques:**\n1. **Deep Breathing**: Practice the 4-7-8 technique - inhale for 4 counts, hold for 7, exhale for 8.\n\n2. **Grounding Exercises**: Use the 5-4-3-2-1 method - identify 5 things you can see, 4 you can touch, 3 you can hear, 2 you can smell, and 1 you can taste.\n\n**Long-term Strategies:**\n- Establish a consistent daily routine\n- Practice mindfulness meditation for 10-15 minutes daily"
184
+ else: # Queries Agent
185
+ details = "**Technical Deep Dive:**\n\n**Fundamental Concepts:**\nThis technology represents a convergence of multiple disciplines including computer science, mathematics, engineering, and domain-specific expertise.\n\n**Current Implementation:**\n1. **Healthcare**: AI-powered diagnostic tools and personalized treatment plans\n2. **Finance**: Algorithmic trading and fraud detection\n3. **Transportation**: Autonomous vehicles and traffic optimization"
186
+
187
+ response = f"{base_response}\n\n{details}"
188
+
189
+ # Generate realistic scores
190
+ base_score = random.uniform(7.0, 9.5)
191
+ relevance_score = max(0, min(10, base_score + random.uniform(-0.3, 0.3)))
192
+ accuracy_score = max(0, min(10, base_score + random.uniform(-0.4, 0.2)))
193
+ completeness_score = max(0, min(10, base_score + random.uniform(-0.5, 0.3)))
194
+ coherence_score = max(0, min(10, base_score + random.uniform(-0.2, 0.4)))
195
+ hallucination_score = max(0, min(10, 10 - accuracy_score + random.uniform(-1.0, 1.0)))
196
+
197
+ # Generate token consumption
198
+ response_length = len(response)
199
+ input_tokens = int(len(query.split()) * 1.3)
200
+ output_tokens = int(response_length / 4)
201
+ total_tokens = input_tokens + output_tokens
202
+
203
+ # Calculate cost
204
+ llm_provider = random.choice(["azure", "openai", "anthropic"])
205
+ cost_per_1k = {"azure": 0.03, "openai": 0.03, "anthropic": 0.025}
206
+ cost_usd = (total_tokens / 1000) * cost_per_1k[llm_provider]
207
+
208
+ timestamp = datetime.now() - timedelta(days=random.randint(0, 30))
209
+
210
+ cursor.execute('''
211
+ INSERT INTO evaluation_logs (
212
+ session_id, agent_name, query, response, overall_score,
213
+ relevance_score, accuracy_score, completeness_score, coherence_score,
214
+ hallucination_score, guardrails_passed, safety_score, execution_time_ms,
215
+ input_tokens, output_tokens, total_tokens, cost_usd, llm_provider, model_name, timestamp
216
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
217
+ ''', (
218
+ session_id, agent, query, response, base_score,
219
+ relevance_score, accuracy_score, completeness_score, coherence_score,
220
+ hallucination_score, random.choice([True, True, True, False]), # 75% pass rate
221
+ random.uniform(8.0, 10.0), random.uniform(500, 2000),
222
+ input_tokens, output_tokens, total_tokens, round(cost_usd, 4),
223
+ llm_provider, "gpt-4o", timestamp.isoformat()
224
+ ))
225
+
226
+ conn.commit()
227
+ conn.close()
228
+
229
+ def load_data(self):
230
+ """Load data"""
231
+ if self.use_fallback:
232
+ return self.fallback_df
233
+
234
+ try:
235
+ conn = sqlite3.connect(self.db_path)
236
+ df = pd.read_sql_query("SELECT * FROM evaluation_logs ORDER BY timestamp DESC", conn)
237
+ conn.close()
238
+
239
+ if not df.empty:
240
+ df['timestamp'] = pd.to_datetime(df['timestamp'])
241
+
242
+ return df
243
+ except Exception as e:
244
+ st.error(f"Data loading error: {str(e)}")
245
+ return pd.DataFrame()
246
+
247
+ def show_overview(self, df):
248
+ """Show overview tab"""
249
+ st.header("πŸ“ˆ Executive Summary")
250
+
251
+ if df.empty:
252
+ st.warning("No data available")
253
+ return
254
+
255
+ # Key metrics
256
+ col1, col2, col3, col4 = st.columns(4)
257
+
258
+ with col1:
259
+ st.metric("Total Evaluations", len(df))
260
+
261
+ with col2:
262
+ avg_score = df['overall_score'].mean()
263
+ st.metric("Average Score", f"{avg_score:.2f}/10")
264
+
265
+ with col3:
266
+ safety_rate = (df['guardrails_passed'].sum() / len(df)) * 100
267
+ st.metric("Safety Rate", f"{safety_rate:.1f}%")
268
+
269
+ with col4:
270
+ avg_time = df['execution_time_ms'].mean() / 1000
271
+ st.metric("Avg Response Time", f"{avg_time:.2f}s")
272
+
273
+ # Charts
274
+ col1, col2 = st.columns(2)
275
+
276
+ with col1:
277
+ st.subheader("πŸ“Š Performance by Agent")
278
+ agent_scores = df.groupby('agent_name')['overall_score'].mean().reset_index()
279
+ fig = px.bar(
280
+ agent_scores,
281
+ x='agent_name',
282
+ y='overall_score',
283
+ title="Average Score by Agent",
284
+ color='overall_score',
285
+ color_continuous_scale='viridis'
286
+ )
287
+ st.plotly_chart(fig, use_container_width=True)
288
+
289
+ with col2:
290
+ st.subheader("πŸ“ˆ Score Distribution")
291
+ fig = px.histogram(
292
+ df,
293
+ x='overall_score',
294
+ nbins=20,
295
+ title="Score Distribution",
296
+ color_discrete_sequence=['#1f77b4']
297
+ )
298
+ st.plotly_chart(fig, use_container_width=True)
299
+
300
+ def show_agent_performance(self, df):
301
+ """Show agent performance tab"""
302
+ st.header("πŸ€– Agent Performance Analysis")
303
+
304
+ if df.empty:
305
+ st.warning("No data available")
306
+ return
307
+
308
+ # Agent selector
309
+ agents = df['agent_name'].unique()
310
+ selected_agent = st.selectbox("Select Agent", ["All Agents"] + list(agents))
311
+
312
+ # Filter data
313
+ if selected_agent != "All Agents":
314
+ filtered_df = df[df['agent_name'] == selected_agent]
315
+ else:
316
+ filtered_df = df
317
+
318
+ # Performance metrics
319
+ col1, col2 = st.columns(2)
320
+
321
+ with col1:
322
+ st.subheader("🎯 Score Breakdown")
323
+ score_cols = ['relevance_score', 'accuracy_score', 'completeness_score', 'coherence_score']
324
+ available_scores = [col for col in score_cols if col in filtered_df.columns]
325
+
326
+ if available_scores:
327
+ avg_scores = filtered_df[available_scores].mean()
328
+ fig = px.bar(
329
+ x=avg_scores.index,
330
+ y=avg_scores.values,
331
+ title=f"Average Scores - {selected_agent}",
332
+ labels={'x': 'Metric', 'y': 'Score'}
333
+ )
334
+ st.plotly_chart(fig, use_container_width=True)
335
+
336
+ with col2:
337
+ st.subheader("⏱️ Response Time Analysis")
338
+ fig = px.box(
339
+ filtered_df,
340
+ x='agent_name',
341
+ y='execution_time_ms',
342
+ title="Response Time Distribution"
343
+ )
344
+ st.plotly_chart(fig, use_container_width=True)
345
+
346
+ # Recent evaluations table
347
+ st.subheader("πŸ“‹ Recent Evaluations")
348
+ display_cols = ['agent_name', 'query', 'overall_score', 'execution_time_ms', 'timestamp']
349
+ available_cols = [col for col in display_cols if col in filtered_df.columns]
350
+
351
+ if available_cols:
352
+ recent_data = filtered_df[available_cols].head(20)
353
+ st.dataframe(recent_data, use_container_width=True)
354
+
355
+ def show_response_analysis(self, df):
356
+ """Show response analysis tab"""
357
+ st.header("πŸ“ Response Analysis & Tracing")
358
+
359
+ if df.empty:
360
+ st.warning("No data available")
361
+ return
362
+
363
+ # Response metrics
364
+ col1, col2, col3 = st.columns(3)
365
+
366
+ with col1:
367
+ if 'response' in df.columns:
368
+ avg_length = df['response'].str.len().mean()
369
+ st.metric("Avg Response Length", f"{avg_length:.0f} chars")
370
+ else:
371
+ st.metric("Avg Response Length", "N/A")
372
+
373
+ with col2:
374
+ if 'response' in df.columns:
375
+ avg_words = df['response'].str.split().str.len().mean()
376
+ st.metric("Avg Word Count", f"{avg_words:.0f} words")
377
+ else:
378
+ st.metric("Avg Word Count", "N/A")
379
+
380
+ with col3:
381
+ response_rate = (df['response'].notna().sum() / len(df)) * 100
382
+ st.metric("Response Rate", f"{response_rate:.1f}%")
383
+
384
+ # Search functionality
385
+ st.subheader("πŸ” Search Responses")
386
+ search_term = st.text_input("Search in responses:", placeholder="Enter keywords...")
387
+
388
+ if search_term and 'response' in df.columns:
389
+ mask = df['response'].str.contains(search_term, case=False, na=False)
390
+ search_results = df[mask]
391
+ else:
392
+ search_results = df.head(10)
393
+
394
+ # Display results
395
+ if not search_results.empty:
396
+ st.write(f"**Showing {len(search_results)} results**")
397
+
398
+ for idx, row in search_results.iterrows():
399
+ with st.expander(f"πŸ€– {row['agent_name']} - Score: {row['overall_score']:.1f}"):
400
+ col1, col2 = st.columns([2, 1])
401
+
402
+ with col1:
403
+ st.write("**Query:**")
404
+ st.write(row['query'])
405
+
406
+ if 'response' in row and pd.notna(row['response']):
407
+ st.write("**Response:**")
408
+ st.write(row['response'])
409
+
410
+ with col2:
411
+ st.write("**Metrics:**")
412
+ st.write(f"Overall Score: {row['overall_score']:.1f}/10")
413
+ if 'execution_time_ms' in row:
414
+ st.write(f"Response Time: {row['execution_time_ms']:.0f}ms")
415
+ if 'timestamp' in row:
416
+ st.write(f"Timestamp: {row['timestamp']}")
417
+
418
+ def show_workflow_visualization(self, df):
419
+ """Show workflow visualization tab"""
420
+ st.header("πŸ”„ Workflow Visualization")
421
+
422
+ if df.empty:
423
+ st.warning("No data available for workflow visualization.")
424
+ return
425
+
426
+ # Session selection
427
+ sessions = df['session_id'].unique()
428
+ selected_session = st.selectbox("Select Session", sessions, key="workflow_session")
429
+
430
+ # Filter data for selected session
431
+ session_data = df[df['session_id'] == selected_session]
432
+
433
+ if session_data.empty:
434
+ st.warning("No data found for selected session.")
435
+ return
436
+
437
+ # Session metrics overview
438
+ st.subheader("πŸ“ˆ Session Metrics Overview")
439
+
440
+ col1, col2, col3, col4 = st.columns(4)
441
+
442
+ with col1:
443
+ avg_score = session_data['overall_score'].mean()
444
+ st.metric("Avg Overall Score", f"{avg_score:.2f}/10")
445
+
446
+ with col2:
447
+ avg_latency = session_data['execution_time_ms'].mean()
448
+ st.metric("Avg Response Time", f"{avg_latency:.0f}ms")
449
+
450
+ with col3:
451
+ if 'hallucination_score' in session_data.columns:
452
+ avg_hallucination = session_data['hallucination_score'].mean()
453
+ st.metric("Avg Hallucination", f"{avg_hallucination:.2f}/10")
454
+ else:
455
+ st.metric("Avg Hallucination", "N/A")
456
+
457
+ with col4:
458
+ if 'total_tokens' in session_data.columns:
459
+ total_tokens = session_data['total_tokens'].sum()
460
+ total_cost = session_data['cost_usd'].sum() if 'cost_usd' in session_data.columns else 0
461
+ st.metric("Total Cost", f"${total_cost:.4f}", f"{total_tokens:,} tokens")
462
+ else:
463
+ st.metric("Total Cost", "N/A")
464
+
465
+ # Workflow steps
466
+ st.subheader("πŸ” Workflow Steps")
467
+
468
+ for idx, (_, row) in enumerate(session_data.iterrows()):
469
+ with st.expander(f"Step {idx + 1}: {row['agent_name']} - Score: {row['overall_score']:.2f}/10"):
470
+
471
+ col1, col2 = st.columns([1, 1])
472
+
473
+ with col1:
474
+ st.markdown("**Query:**")
475
+ st.write(row['query'])
476
+
477
+ # Performance metrics chart
478
+ st.markdown("**Performance Metrics:**")
479
+ metrics_data = {
480
+ 'Overall': row['overall_score'],
481
+ 'Relevance': row.get('relevance_score', 0),
482
+ 'Accuracy': row.get('accuracy_score', 0),
483
+ 'Completeness': row.get('completeness_score', 0),
484
+ 'Coherence': row.get('coherence_score', 0)
485
+ }
486
+
487
+ if 'hallucination_score' in row:
488
+ metrics_data['Hallucination'] = row['hallucination_score']
489
+
490
+ fig = px.bar(
491
+ x=list(metrics_data.keys()),
492
+ y=list(metrics_data.values()),
493
+ title="Score Breakdown",
494
+ labels={'x': 'Metric', 'y': 'Score (0-10)'}
495
+ )
496
+ fig.update_layout(height=300, showlegend=False)
497
+ st.plotly_chart(fig, use_container_width=True)
498
+
499
+ with col2:
500
+ st.markdown("**Response:**")
501
+ if pd.notna(row['response']):
502
+ st.write(row['response'])
503
+ else:
504
+ st.write("No response available")
505
+
506
+ # Resource consumption
507
+ st.markdown("**Resource Consumption:**")
508
+
509
+ if 'input_tokens' in row and pd.notna(row['input_tokens']):
510
+ token_col1, token_col2 = st.columns(2)
511
+ with token_col1:
512
+ st.metric("Input Tokens", f"{int(row['input_tokens']):,}")
513
+ st.metric("Output Tokens", f"{int(row.get('output_tokens', 0)):,}")
514
+
515
+ with token_col2:
516
+ st.metric("Total Tokens", f"{int(row.get('total_tokens', 0)):,}")
517
+ st.metric("Cost", f"${row.get('cost_usd', 0):.4f}")
518
+
519
+ # Execution details
520
+ st.markdown("**Execution Details:**")
521
+ st.write(f"⏱️ **Execution Time:** {row['execution_time_ms']:.0f}ms")
522
+ if 'llm_provider' in row:
523
+ st.write(f"πŸ€– **LLM Provider:** {row['llm_provider']}")
524
+ if 'model_name' in row:
525
+ st.write(f"🧠 **Model:** {row['model_name']}")
526
+ st.write(f"πŸ›‘οΈ **Safety Passed:** {'βœ…' if row['guardrails_passed'] else '❌'}")
527
+
528
+ # Session summary
529
+ st.subheader("πŸ“‹ Session Summary")
530
+
531
+ summary_col1, summary_col2, summary_col3 = st.columns(3)
532
+
533
+ with summary_col1:
534
+ st.markdown("**Quality Metrics:**")
535
+ st.write(f"β€’ Average Overall Score: {session_data['overall_score'].mean():.2f}/10")
536
+ best_step = session_data.loc[session_data['overall_score'].idxmax()]
537
+ st.write(f"β€’ Best Performing Step: {best_step['agent_name']}")
538
+ st.write(f"β€’ Consistency (Std Dev): {session_data['overall_score'].std():.2f}")
539
+
540
+ with summary_col2:
541
+ st.markdown("**Performance Metrics:**")
542
+ st.write(f"β€’ Total Execution Time: {session_data['execution_time_ms'].sum():.0f}ms")
543
+ st.write(f"β€’ Average Response Time: {session_data['execution_time_ms'].mean():.0f}ms")
544
+ st.write(f"β€’ Fastest Step: {session_data['execution_time_ms'].min():.0f}ms")
545
+
546
+ with summary_col3:
547
+ st.markdown("**Resource Usage:**")
548
+ if 'total_tokens' in session_data.columns:
549
+ st.write(f"β€’ Total Tokens Used: {session_data['total_tokens'].sum():,}")
550
+ if 'cost_usd' in session_data.columns:
551
+ st.write(f"β€’ Total Cost: ${session_data['cost_usd'].sum():.4f}")
552
+ st.write(f"β€’ Avg Cost per Query: ${session_data['cost_usd'].mean():.4f}")
553
+ else:
554
+ st.write("β€’ Token data not available")
555
+
556
+ # Export functionality
557
+ st.subheader("πŸ“€ Export Workflow Data")
558
+
559
+ if st.button("Export Session Data to CSV", key="export_workflow"):
560
+ csv_data = session_data.to_csv(index=False)
561
+ st.download_button(
562
+ label="Download CSV",
563
+ data=csv_data,
564
+ file_name=f"workflow_session_{selected_session}.csv",
565
+ mime="text/csv"
566
+ )
567
+
568
+ def run(self):
569
+ """Run the dashboard"""
570
+ st.title("πŸ€– Multi-Agent System Dashboard")
571
+ st.markdown("---")
572
+
573
+ st.info("πŸŽ‰ **Welcome!** This dashboard showcases evaluation metrics for Diet, Support, and Queries agents.")
574
+
575
+ # Load data
576
+ df = self.load_data()
577
+
578
+ # Create tabs
579
+ tab1, tab2, tab3, tab4 = st.tabs([
580
+ "πŸ“ˆ Overview",
581
+ "πŸ€– Agent Performance",
582
+ "πŸ“ Response Analysis",
583
+ "πŸ”„ Workflow Visualization"
584
+ ])
585
+
586
+ with tab1:
587
+ self.show_overview(df)
588
+
589
+ with tab2:
590
+ self.show_agent_performance(df)
591
+
592
+ with tab3:
593
+ self.show_response_analysis(df)
594
+
595
+ with tab4:
596
+ self.show_workflow_visualization(df)
597
+
598
+ # Footer
599
+ st.markdown("---")
600
+ st.markdown("πŸš€ **Multi-Agent System Dashboard** | Built with Streamlit & Plotly")
601
+
602
+ # Run the dashboard
603
+ try:
604
+ dashboard = SimpleDashboard()
605
+ dashboard.run()
606
+ except Exception as e:
607
+ st.error(f"Application Error: {str(e)}")
608
+ st.info("Please refresh the page.")
609
+
610
+ with st.expander("Debug Information"):
611
+ st.code(f"""
612
+ Error: {str(e)}
613
+ Type: {type(e).__name__}
614
+ Python: {sys.version}
615
+ Working Dir: {os.getcwd()}
616
+ Temp Dir: {tempfile.gettempdir()}
617
+ """)