saptyfun committed on
Commit
1cc7ca7
Β·
verified Β·
1 Parent(s): ea8cbdd

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +689 -34
src/streamlit_app.py CHANGED
@@ -1,40 +1,695 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
  """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
  """
15
 
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
 
 
 
 
 
 
22
 
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
 
 
 
 
 
25
 
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
1
#!/usr/bin/env python3
"""
Multi-Agent System Dashboard - Hugging Face Spaces Demo
"""

# Standard library / third-party imports used across the dashboard.
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sqlite3
from datetime import datetime, timedelta
import json
import numpy as np
from typing import Dict, List, Any, Optional
import os
from pathlib import Path

# Set page config first
# NOTE: Streamlit requires set_page_config() to be the first st.* call
# in the script; moving it below any other Streamlit call raises an error.
st.set_page_config(
    page_title="πŸ€– Multi-Agent System Dashboard",
    page_icon="πŸ€–",
    layout="wide",
    initial_sidebar_state="expanded"
)
26
 
27
class HuggingFaceDashboard:
    """Streamlit dashboard over a SQLite demo database of agent evaluations.

    On construction it guarantees a populated `evaluation_logs.db` exists,
    then `run()` renders filters and three analytics tabs.
    """

    def __init__(self):
        # SQLite file that backs every read in this demo.
        self.db_path = "evaluation_logs.db"
        # Seed (or re-seed) demo data so the dashboard is never empty.
        self.setup_demo_data()
32
def setup_demo_data(self):
    """Ensure the demo database exists and holds enough rows to be useful.

    A database with fewer than 50 evaluation rows (or one that cannot be
    read at all) is treated as stale/corrupt demo data and rebuilt.
    """
    if not os.path.exists(self.db_path):
        self.create_demo_database()
        return

    # Check if database has data
    try:
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM evaluation_logs")
            count = cursor.fetchone()[0]
        finally:
            # Always release the connection, even if the query raises
            # (the original leaked the handle on error).
            conn.close()

        # If database is empty or has very little data, recreate it
        if count < 50:
            os.remove(self.db_path)
            self.create_demo_database()
    except (sqlite3.Error, OSError):
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit): any DB or filesystem error
        # means the file is unusable -- drop it and start fresh.
        if os.path.exists(self.db_path):
            os.remove(self.db_path)
        self.create_demo_database()
54
+
55
def create_demo_database(self):
    """Create a demo database with sample data.

    Builds the two tables the dashboard reads (`evaluation_logs`,
    `workflow_traces`) and fills `evaluation_logs` via
    `insert_demo_data`.  `workflow_traces` is created but left empty.
    """
    conn = sqlite3.connect(self.db_path)
    cursor = conn.cursor()

    # Create evaluation_logs table
    # One row per judged agent response; score columns are on a 0-10 scale.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS evaluation_logs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            session_id TEXT NOT NULL,
            agent_name TEXT NOT NULL,
            query TEXT NOT NULL,
            response TEXT,
            overall_score REAL,
            relevance_score REAL,
            accuracy_score REAL,
            completeness_score REAL,
            coherence_score REAL,
            guardrails_passed BOOLEAN,
            safety_score REAL,
            execution_time_ms REAL,
            error_occurred BOOLEAN DEFAULT FALSE,
            llm_provider TEXT,
            model_name TEXT,
            judge_reasoning TEXT,
            guardrails_failures TEXT DEFAULT '[]',
            timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    ''')

    # Create workflow_traces table
    # Per-step execution trace of a session's workflow.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS workflow_traces (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            session_id TEXT NOT NULL,
            step_name TEXT NOT NULL,
            agent_name TEXT,
            step_type TEXT,
            input_data TEXT,
            output_data TEXT,
            execution_time_ms REAL,
            error_occurred BOOLEAN DEFAULT FALSE,
            error_details TEXT,
            timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    ''')

    # Insert demo data (commits happen here, not in insert_demo_data).
    self.insert_demo_data(cursor)

    conn.commit()
    conn.close()
107
+
108
def insert_demo_data(self, cursor):
    """Populate `evaluation_logs` with 300 synthetic evaluation rows.

    Rows are spread across three agents with agent-specific score and
    latency profiles and skewed over a 30-day window so every chart has
    data.  Writes through *cursor*; the caller is responsible for commit.

    Fix vs. original: removed the dead `safety_pass_rate = 0.95` constant,
    which contradicted the actual 2% simulated failure rate below.
    """
    import random
    from datetime import datetime, timedelta

    agents = ["Diet Agent", "Support Agent", "Queries Agent"]

    # Comprehensive sample queries for each agent
    sample_queries = {
        "Diet Agent": [
            "What's a healthy meal plan for weight loss?",
            "Can you suggest low-carb breakfast options?",
            "What are the benefits of intermittent fasting?",
            "How much protein should I eat daily?",
            "What foods are good for heart health?",
            "Can you create a vegetarian meal plan?",
            "What snacks are good for diabetics?",
            "How to meal prep for the week?",
            "What are superfoods I should include?",
            "How to calculate my daily calorie needs?",
            "What's the Mediterranean diet about?",
            "Are supplements necessary for nutrition?",
            "How to eat healthy on a budget?",
            "What foods help with inflammation?",
            "Can you suggest post-workout meals?",
            "What's a balanced breakfast for energy?",
            "How to reduce sugar in my diet?",
            "What are healthy cooking methods?",
            "Can you help with portion control?",
            "What foods boost metabolism?"
        ],
        "Support Agent": [
            "I'm having trouble sleeping, can you help?",
            "How do I manage work stress?",
            "I feel overwhelmed with my tasks",
            "Can you help me organize my schedule?",
            "I'm having difficulty focusing",
            "How to improve my productivity?",
            "I need help with time management",
            "How to deal with anxiety?",
            "Can you suggest relaxation techniques?",
            "I'm feeling burned out at work",
            "How to maintain work-life balance?",
            "I need motivation to exercise",
            "How to build better habits?",
            "I'm struggling with procrastination",
            "Can you help me set goals?",
            "How to handle difficult conversations?",
            "I need help with decision making",
            "How to boost my confidence?",
            "Can you help me manage emotions?",
            "What are good stress relief activities?"
        ],
        "Queries Agent": [
            "What are the latest developments in AI?",
            "How does blockchain technology work?",
            "What is quantum computing?",
            "Explain machine learning algorithms",
            "What are the benefits of cloud computing?",
            "How does renewable energy work?",
            "What is the future of electric vehicles?",
            "Explain cryptocurrency and Bitcoin",
            "What is cybersecurity and why is it important?",
            "How do neural networks function?",
            "What are the applications of IoT?",
            "Explain data science and analytics",
            "What is edge computing?",
            "How does 5G technology work?",
            "What are the trends in biotechnology?",
            "How does virtual reality work?",
            "What is artificial general intelligence?",
            "Explain the metaverse concept",
            "What are smart contracts?",
            "How does automation impact jobs?"
        ]
    }

    # Generate comprehensive demo data
    total_evaluations = 300  # enough rows for meaningful charts

    for i in range(total_evaluations):
        agent = random.choice(agents)
        query = random.choice(sample_queries[agent])

        # 30% of queries get a phrasing variation for realism.
        if random.random() < 0.3:
            variations = [
                f"Can you please {query.lower()}",
                f"I need help with: {query.lower()}",
                f"Could you explain {query.lower()}",
                f"What's your advice on {query.lower()}"
            ]
            query = random.choice(variations)

        # Generate realistic scores with agent-specific tendencies.
        if agent == "Diet Agent":
            base_score = random.uniform(7.5, 9.2)   # Diet agent performs well
        elif agent == "Support Agent":
            base_score = random.uniform(7.8, 9.5)   # Support agent is consistent
        else:  # Queries Agent
            base_score = random.uniform(6.8, 8.8)   # More variable for complex queries

        # Skew timestamps so recent days are denser than older ones.
        if i < 50:        # Recent data (last 3 days)
            days_ago = random.randint(0, 2)
        elif i < 150:     # Medium recent (last 2 weeks)
            days_ago = random.randint(3, 14)
        else:             # Historical (last 30 days)
            days_ago = random.randint(15, 29)

        hours_ago = random.randint(0, 23)
        minutes_ago = random.randint(0, 59)
        timestamp = datetime.now() - timedelta(days=days_ago, hours=hours_ago, minutes=minutes_ago)

        # Generate a realistic response (templates interpolate the query).
        response_templates = {
            "Diet Agent": [
                f"Based on your query about {query[:30]}..., I recommend focusing on balanced nutrition with emphasis on whole foods, proper portion sizes, and regular meal timing.",
                f"For your question regarding {query[:30]}..., here's a comprehensive approach that considers your nutritional needs and health goals.",
                f"Addressing your concern about {query[:30]}..., let me provide evidence-based dietary guidance tailored to your situation."
            ],
            "Support Agent": [
                f"I understand you're dealing with {query[:30]}... This is a common challenge, and I'm here to help you work through it step by step.",
                f"Thank you for sharing your concern about {query[:30]}... Let's explore some practical strategies that can make a real difference.",
                f"Your question about {query[:30]}... resonates with many people. Here are some effective approaches you can try."
            ],
            "Queries Agent": [
                f"Great question about {query[:30]}... This is a complex topic that involves several key concepts and recent developments.",
                f"To answer your query about {query[:30]}..., let me break this down into the fundamental principles and current applications.",
                f"Your question regarding {query[:30]}... touches on important technological and societal implications. Here's a comprehensive overview."
            ]
        }

        response = random.choice(response_templates[agent])

        # Sub-scores correlate with base_score (small noise), clamped to [0, 10].
        relevance_score = max(0, min(10, base_score + random.uniform(-0.3, 0.3)))
        accuracy_score = max(0, min(10, base_score + random.uniform(-0.4, 0.2)))
        completeness_score = max(0, min(10, base_score + random.uniform(-0.5, 0.3)))
        coherence_score = max(0, min(10, base_score + random.uniform(-0.2, 0.4)))

        # Simulate guardrail outcomes: 2% of rows fail safety checks.
        if random.random() < 0.02:
            guardrails_passed = False
            safety_score = random.uniform(3.0, 6.0)
            guardrails_failures = '["content_safety", "inappropriate_advice"]'
        else:
            guardrails_passed = True
            safety_score = random.uniform(8.5, 10.0)
            guardrails_failures = "[]"

        # Realistic execution times (with some variation).
        if agent == "Diet Agent":
            execution_time = random.uniform(1500, 4000)   # Moderate complexity
        elif agent == "Support Agent":
            execution_time = random.uniform(2000, 5000)   # More thoughtful responses
        else:  # Queries Agent
            execution_time = random.uniform(2500, 6000)   # Complex information retrieval

        eval_data = (
            f"demo_session_{i // 4 + 1}",   # session_id (4 queries per session)
            agent,                          # agent_name
            query,                          # query
            response,                       # response
            base_score,                     # overall_score
            relevance_score,                # relevance_score
            accuracy_score,                 # accuracy_score
            completeness_score,             # completeness_score
            coherence_score,                # coherence_score
            guardrails_passed,              # guardrails_passed
            safety_score,                   # safety_score
            execution_time,                 # execution_time_ms
            False,                          # error_occurred
            "azure",                        # llm_provider
            "gpt-4o",                       # model_name
            f"Comprehensive evaluation for {agent}: The response demonstrates good understanding of the query with appropriate depth and accuracy. Score breakdown reflects the quality across multiple dimensions.",  # judge_reasoning
            guardrails_failures,            # guardrails_failures
            timestamp.isoformat()           # timestamp
        )

        cursor.execute('''
            INSERT INTO evaluation_logs (
                session_id, agent_name, query, response, overall_score,
                relevance_score, accuracy_score, completeness_score, coherence_score,
                guardrails_passed, safety_score, execution_time_ms, error_occurred,
                llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', eval_data)
297
+
298
def safe_column_access(self, df: pd.DataFrame, column: str, default_value=None):
    """Fetch *column* from *df*, falling back to a Series of *default_value*.

    Keeps dashboard metrics working when an expected column is absent
    from the loaded data instead of raising a KeyError.
    """
    try:
        if column not in df.columns:
            # Same length and index as df so downstream arithmetic aligns.
            return pd.Series([default_value] * len(df), index=df.index)
        return df[column]
    except Exception:
        # Defensive catch-all: df may not behave like a DataFrame at all.
        return pd.Series([default_value] * len(df) if len(df) > 0 else [])
307
+
308
def load_data(self, date_filter: tuple = None, agent_filter: List[str] = None, session_filter: str = None) -> Dict[str, pd.DataFrame]:
    """Load evaluation and trace rows from SQLite, applying optional filters.

    Args:
        date_filter: (start_date, end_date) pair of `datetime.date`-like
            objects; both days are inclusive.  Incomplete tuples (e.g. the
            1-tuple `st.date_input` yields mid-selection) are ignored.
        agent_filter: list of agent names to keep.
        session_filter: substring match against session_id.

    Returns:
        {'evaluations': DataFrame, 'traces': DataFrame}; both empty on error.
    """
    try:
        conn = sqlite3.connect(self.db_path)

        # Base queries
        eval_query = "SELECT * FROM evaluation_logs"
        trace_query = "SELECT * FROM workflow_traces"

        # Apply filters
        conditions = []
        params = []

        # Fix vs. original: BETWEEN with a bare end-date string excluded
        # every row on the final day (stored timestamps carry a time
        # component, so '2024-01-15T10:00' > '2024-01-15').  Use a
        # half-open range [start, end + 1 day) instead, and skip
        # incomplete date tuples rather than crashing on date_filter[1].
        if date_filter and len(date_filter) == 2:
            conditions.append("timestamp >= ? AND timestamp < ?")
            params.extend([
                date_filter[0].strftime('%Y-%m-%d'),
                (date_filter[1] + timedelta(days=1)).strftime('%Y-%m-%d'),
            ])

        if agent_filter:
            placeholders = ','.join(['?' for _ in agent_filter])
            conditions.append(f"agent_name IN ({placeholders})")
            params.extend(agent_filter)

        if session_filter:
            conditions.append("session_id LIKE ?")
            params.append(f"%{session_filter}%")

        if conditions:
            where_clause = " WHERE " + " AND ".join(conditions)
            eval_query += where_clause
            trace_query += where_clause

        # Load data
        evaluations = pd.read_sql_query(eval_query, conn, params=params)
        traces = pd.read_sql_query(trace_query, conn, params=params)

        conn.close()

        # Convert timestamp columns for .dt accessors downstream.
        if not evaluations.empty:
            evaluations['timestamp'] = pd.to_datetime(evaluations['timestamp'])
        if not traces.empty:
            traces['timestamp'] = pd.to_datetime(traces['timestamp'])

        return {
            'evaluations': evaluations,
            'traces': traces
        }

    except Exception as e:
        st.error(f"Error loading data: {str(e)}")
        return {'evaluations': pd.DataFrame(), 'traces': pd.DataFrame()}
358
+
359
def create_sidebar_filters(self, data: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
    """Render sidebar filter widgets and return the selected values.

    Keys in the returned dict: 'date_range', 'agents', 'session',
    'score_range', 'safety_only'.  Date/agent widgets are only shown
    when evaluation data exists, so those keys may be absent.
    """
    st.sidebar.header("πŸ” Filters")

    filters = {}

    # Date range filter
    # NOTE(review): st.date_input can return a single date or 1-tuple
    # while the user is mid-selection -- callers must handle that.
    if not data['evaluations'].empty:
        min_date = data['evaluations']['timestamp'].min().date()
        max_date = data['evaluations']['timestamp'].max().date()

        filters['date_range'] = st.sidebar.date_input(
            "πŸ“… Date Range",
            value=(min_date, max_date),
            min_value=min_date,
            max_value=max_date
        )

    # Agent filter (defaults to all agents present in the data)
    if not data['evaluations'].empty:
        agents = data['evaluations']['agent_name'].unique().tolist()
        filters['agents'] = st.sidebar.multiselect(
            "πŸ€– Agents",
            options=agents,
            default=agents
        )

    # Session filter (substring match, applied as SQL LIKE downstream)
    filters['session'] = st.sidebar.text_input(
        "πŸ” Session ID (partial match)",
        placeholder="Enter session ID..."
    )

    # Score range filter (inclusive bounds on overall_score)
    filters['score_range'] = st.sidebar.slider(
        "πŸ“Š Score Range",
        min_value=0.0,
        max_value=10.0,
        value=(0.0, 10.0),
        step=0.1
    )

    # Safety filter
    filters['safety_only'] = st.sidebar.checkbox(
        "πŸ›‘οΈ Show only safe responses",
        value=False
    )

    return filters
408
+
409
def show_executive_summary(self, data: Dict[str, pd.DataFrame]):
    """Render the Executive Summary tab: five headline metrics plus a
    2x2 grid of daily-trend charts (score, latency, safety, distribution)."""
    st.header("πŸ“ˆ Executive Summary")

    if data['evaluations'].empty:
        st.warning("No evaluation data available")
        return

    df = data['evaluations']

    # Key metrics
    col1, col2, col3, col4, col5 = st.columns(5)

    with col1:
        total_evals = len(df)
        st.metric("Total Evaluations", f"{total_evals:,}")

    with col2:
        # safe_column_access tolerates a missing column (falls back to 0).
        avg_score = self.safe_column_access(df, 'overall_score', 0).mean()
        st.metric("Average Score", f"{avg_score:.2f}/10")

    with col3:
        # len(df) > 0 is guaranteed by the empty-check above.
        safety_rate = (self.safe_column_access(df, 'guardrails_passed', True).sum() / len(df)) * 100
        st.metric("Safety Pass Rate", f"{safety_rate:.1f}%")

    with col4:
        # Stored in milliseconds; displayed in seconds.
        avg_time = self.safe_column_access(df, 'execution_time_ms', 0).mean() / 1000
        st.metric("Avg Response Time", f"{avg_time:.2f}s")

    with col5:
        unique_sessions = df['session_id'].nunique()
        st.metric("Unique Sessions", f"{unique_sessions:,}")

    # Performance trends
    st.subheader("πŸ“Š Performance Trends")

    # Daily aggregates: mean score, mean latency, % of rows passing guardrails.
    # After reset_index the grouping key keeps the column name 'timestamp'
    # (now holding dates).
    df_daily = df.groupby(df['timestamp'].dt.date).agg({
        'overall_score': 'mean',
        'execution_time_ms': 'mean',
        'guardrails_passed': lambda x: (x.sum() / len(x)) * 100
    }).reset_index()

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Daily Average Score', 'Daily Response Time', 'Daily Safety Rate', 'Score Distribution'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )

    # Score trend
    fig.add_trace(
        go.Scatter(x=df_daily['timestamp'], y=df_daily['overall_score'],
                   mode='lines+markers', name='Score', line=dict(color='#1f77b4')),
        row=1, col=1
    )

    # Response time trend (ms -> s)
    fig.add_trace(
        go.Scatter(x=df_daily['timestamp'], y=df_daily['execution_time_ms']/1000,
                   mode='lines+markers', name='Response Time', line=dict(color='#ff7f0e')),
        row=1, col=2
    )

    # Safety rate trend
    fig.add_trace(
        go.Scatter(x=df_daily['timestamp'], y=df_daily['guardrails_passed'],
                   mode='lines+markers', name='Safety Rate', line=dict(color='#2ca02c')),
        row=2, col=1
    )

    # Score distribution (histogram over all rows, not daily means)
    fig.add_trace(
        go.Histogram(x=self.safe_column_access(df, 'overall_score', 0),
                     nbinsx=20, name='Score Distribution', marker_color='#d62728'),
        row=2, col=2
    )

    fig.update_layout(height=600, showlegend=False, title_text="Performance Analytics")
    st.plotly_chart(fig, use_container_width=True)
489
+
490
def show_agent_performance(self, data: Dict[str, pd.DataFrame]):
    """Render the Agent Performance tab: score/latency bar charts per
    agent plus a radar chart over the four sub-score dimensions."""
    st.header("πŸ€– Agent Performance Analysis")

    if data['evaluations'].empty:
        st.warning("No evaluation data available")
        return

    df = data['evaluations']

    # Agent comparison
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("πŸ“Š Agent Score Comparison")
        agent_scores = df.groupby('agent_name').agg({
            'overall_score': ['mean', 'std', 'count'],
            'relevance_score': 'mean',
            'accuracy_score': 'mean',
            'completeness_score': 'mean',
            'coherence_score': 'mean'
        }).round(2)

        # Flatten the MultiIndex columns produced by the multi-agg
        # (e.g. ('overall_score', 'mean') -> 'overall_score_mean').
        agent_scores.columns = ['_'.join(col).strip() for col in agent_scores.columns]

        fig = px.bar(
            x=agent_scores.index,
            y=agent_scores['overall_score_mean'],
            error_y=agent_scores['overall_score_std'],
            title="Average Score by Agent",
            labels={'x': 'Agent', 'y': 'Average Score'}
        )
        fig.update_layout(showlegend=False)
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        st.subheader("⚑ Response Time Analysis")
        agent_times = df.groupby('agent_name')['execution_time_ms'].agg(['mean', 'std']).reset_index()
        agent_times['mean'] = agent_times['mean'] / 1000  # Convert to seconds
        agent_times['std'] = agent_times['std'] / 1000

        fig = px.bar(
            agent_times,
            x='agent_name',
            y='mean',
            error_y='std',
            title="Average Response Time by Agent",
            labels={'agent_name': 'Agent', 'mean': 'Response Time (seconds)'}
        )
        st.plotly_chart(fig, use_container_width=True)

    # Detailed score breakdown
    st.subheader("🎯 Detailed Score Breakdown")

    # Only plot sub-score columns actually present in the data.
    score_columns = ['relevance_score', 'accuracy_score', 'completeness_score', 'coherence_score']
    available_scores = [col for col in score_columns if col in df.columns]

    if available_scores:
        agent_detailed = df.groupby('agent_name')[available_scores].mean().reset_index()

        fig = go.Figure()

        # One radar trace per agent over the available sub-scores.
        for agent in agent_detailed['agent_name'].unique():
            agent_data = agent_detailed[agent_detailed['agent_name'] == agent]
            fig.add_trace(go.Scatterpolar(
                r=[agent_data[col].iloc[0] for col in available_scores],
                theta=[col.replace('_score', '').title() for col in available_scores],
                fill='toself',
                name=agent
            ))

        fig.update_layout(
            polar=dict(
                radialaxis=dict(visible=True, range=[0, 10])
            ),
            showlegend=True,
            title="Agent Performance Radar Chart"
        )
        st.plotly_chart(fig, use_container_width=True)
570
+
571
def show_safety_analysis(self, data: Dict[str, pd.DataFrame]):
    """Render the Safety Analysis tab: aggregate guardrail metrics,
    per-agent pass rates, and the daily safety-rate trend."""
    st.header("πŸ›‘οΈ Safety & Guardrails Analysis")

    if data['evaluations'].empty:
        st.warning("No evaluation data available")
        return

    df = data['evaluations']

    # Safety metrics
    col1, col2, col3 = st.columns(3)

    with col1:
        total_checks = len(df)
        passed_checks = self.safe_column_access(df, 'guardrails_passed', True).sum()
        # Guarded division kept for safety, though df is non-empty here.
        safety_rate = (passed_checks / total_checks) * 100 if total_checks > 0 else 0

        st.metric("Overall Safety Rate", f"{safety_rate:.1f}%", f"{passed_checks}/{total_checks}")

    with col2:
        avg_safety_score = self.safe_column_access(df, 'safety_score', 10).mean()
        st.metric("Average Safety Score", f"{avg_safety_score:.2f}/10")

    with col3:
        failed_checks = total_checks - passed_checks
        st.metric("Failed Checks", f"{failed_checks:,}")

    # Safety by agent
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("πŸ€– Safety Rate by Agent")
        safety_by_agent = df.groupby('agent_name').agg({
            'guardrails_passed': lambda x: (x.sum() / len(x)) * 100
        }).reset_index()

        fig = px.bar(
            safety_by_agent,
            x='agent_name',
            y='guardrails_passed',
            title="Safety Pass Rate by Agent",
            labels={'agent_name': 'Agent', 'guardrails_passed': 'Safety Rate (%)'},
            color='guardrails_passed',
            color_continuous_scale='RdYlGn'
        )
        fig.update_layout(showlegend=False)
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        st.subheader("πŸ“… Safety Trends Over Time")
        # Daily pass-rate percentage.
        df_daily_safety = df.groupby(df['timestamp'].dt.date).agg({
            'guardrails_passed': lambda x: (x.sum() / len(x)) * 100
        }).reset_index()

        fig = px.line(
            df_daily_safety,
            x='timestamp',
            y='guardrails_passed',
            title="Daily Safety Rate Trend",
            labels={'timestamp': 'Date', 'guardrails_passed': 'Safety Rate (%)'}
        )
        # Reference line at the 95% safety target.
        fig.add_hline(y=95, line_dash="dash", line_color="red",
                      annotation_text="95% Target")
        st.plotly_chart(fig, use_container_width=True)
636
+
637
def run(self):
    """Render the full dashboard: title, sidebar filters, three tabs, footer.

    Loads unfiltered data first so the sidebar can derive its options,
    then reloads with the chosen filters applied.
    """
    st.title("πŸ€– Multi-Agent System Dashboard - Demo")
    st.markdown("---")

    # Demo info
    st.info("πŸŽ‰ **Welcome to the Multi-Agent System Dashboard Demo!** This showcases a comprehensive evaluation system with LLM judge scoring, safety guardrails, and advanced analytics for Diet, Support, and Queries agents.")

    # Load initial data (unfiltered) to populate the sidebar widgets.
    initial_data = self.load_data()

    # Create filters
    filters = self.create_sidebar_filters(initial_data)

    # Fix vs. original: st.date_input returns a single date or a 1-tuple
    # while the user is mid-selection; forwarding that to load_data
    # crashed on date_filter[1].  Only pass a complete (start, end) pair.
    date_range = filters.get('date_range')
    if not (isinstance(date_range, (tuple, list)) and len(date_range) == 2):
        date_range = None

    # Apply filters and reload data
    filtered_data = self.load_data(
        date_filter=date_range,
        agent_filter=filters.get('agents'),
        session_filter=filters.get('session')
    )

    # Apply the remaining filters in pandas (not expressible in the SQL above).
    if not filtered_data['evaluations'].empty:
        df = filtered_data['evaluations']

        # Score range filter (inclusive bounds)
        if 'score_range' in filters:
            score_min, score_max = filters['score_range']
            df = df[(df['overall_score'] >= score_min) & (df['overall_score'] <= score_max)]

        # Safety filter
        if filters.get('safety_only', False):
            df = df[df['guardrails_passed'] == True]

        filtered_data['evaluations'] = df

    # Create tabs
    tab1, tab2, tab3 = st.tabs([
        "πŸ“ˆ Executive Summary",
        "πŸ€– Agent Performance",
        "πŸ›‘οΈ Safety Analysis"
    ])

    with tab1:
        self.show_executive_summary(filtered_data)

    with tab2:
        self.show_agent_performance(filtered_data)

    with tab3:
        self.show_safety_analysis(filtered_data)

    # Footer
    st.markdown("---")
    st.markdown("πŸš€ **Multi-Agent System Dashboard** | Built with Streamlit & Plotly | Demo hosted on Hugging Face Spaces")
692
 
693
# Entry point: build the dashboard (which seeds demo data) and render it.
if __name__ == "__main__":
    dashboard = HuggingFaceDashboard()
    dashboard.run()