Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- src/app.py +334 -6
src/app.py
CHANGED
|
@@ -93,7 +93,7 @@ class HuggingFaceDashboard:
|
|
| 93 |
)
|
| 94 |
''')
|
| 95 |
|
| 96 |
-
# Create workflow_traces table
|
| 97 |
cursor.execute('''
|
| 98 |
CREATE TABLE IF NOT EXISTS workflow_traces (
|
| 99 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
@@ -103,6 +103,9 @@ class HuggingFaceDashboard:
|
|
| 103 |
step_type TEXT,
|
| 104 |
input_data TEXT,
|
| 105 |
output_data TEXT,
|
|
|
|
|
|
|
|
|
|
| 106 |
execution_time_ms REAL,
|
| 107 |
error_occurred BOOLEAN DEFAULT FALSE,
|
| 108 |
error_details TEXT,
|
|
@@ -110,6 +113,29 @@ class HuggingFaceDashboard:
|
|
| 110 |
)
|
| 111 |
''')
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
# Insert demo data
|
| 114 |
self.insert_demo_data(cursor)
|
| 115 |
|
|
@@ -249,7 +275,33 @@ class HuggingFaceDashboard:
|
|
| 249 |
]
|
| 250 |
}
|
| 251 |
|
| 252 |
-
response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
# Generate correlated scores (realistic relationships)
|
| 255 |
relevance_score = max(0, min(10, base_score + random.uniform(-0.3, 0.3)))
|
|
@@ -305,6 +357,85 @@ class HuggingFaceDashboard:
|
|
| 305 |
llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
|
| 306 |
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 307 |
''', eval_data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
|
| 309 |
def safe_column_access(self, df: pd.DataFrame, column: str, default_value=None):
|
| 310 |
"""Safely access DataFrame columns"""
|
|
@@ -324,6 +455,7 @@ class HuggingFaceDashboard:
|
|
| 324 |
# Base queries
|
| 325 |
eval_query = "SELECT * FROM evaluation_logs"
|
| 326 |
trace_query = "SELECT * FROM workflow_traces"
|
|
|
|
| 327 |
|
| 328 |
# Apply filters
|
| 329 |
conditions = []
|
|
@@ -345,11 +477,18 @@ class HuggingFaceDashboard:
|
|
| 345 |
if conditions:
|
| 346 |
eval_query += " WHERE " + " AND ".join(conditions)
|
| 347 |
trace_query += " WHERE " + " AND ".join(conditions)
|
|
|
|
| 348 |
|
| 349 |
# Load data
|
| 350 |
evaluations = pd.read_sql_query(eval_query, conn, params=params)
|
| 351 |
traces = pd.read_sql_query(trace_query, conn, params=params)
|
| 352 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
conn.close()
|
| 354 |
|
| 355 |
# Convert timestamp columns
|
|
@@ -357,15 +496,18 @@ class HuggingFaceDashboard:
|
|
| 357 |
evaluations['timestamp'] = pd.to_datetime(evaluations['timestamp'])
|
| 358 |
if not traces.empty:
|
| 359 |
traces['timestamp'] = pd.to_datetime(traces['timestamp'])
|
|
|
|
|
|
|
| 360 |
|
| 361 |
return {
|
| 362 |
'evaluations': evaluations,
|
| 363 |
-
'traces': traces
|
|
|
|
| 364 |
}
|
| 365 |
|
| 366 |
except Exception as e:
|
| 367 |
st.error(f"Error loading data: {str(e)}")
|
| 368 |
-
return {'evaluations': pd.DataFrame(), 'traces': pd.DataFrame()}
|
| 369 |
|
| 370 |
def create_sidebar_filters(self, data: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
|
| 371 |
"""Create sidebar filters"""
|
|
@@ -645,6 +787,188 @@ class HuggingFaceDashboard:
|
|
| 645 |
annotation_text="95% Target")
|
| 646 |
st.plotly_chart(fig, use_container_width=True)
|
| 647 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 648 |
def run(self):
|
| 649 |
"""Run the dashboard"""
|
| 650 |
st.title("π€ Multi-Agent System Dashboard - Demo")
|
|
@@ -682,10 +1006,11 @@ class HuggingFaceDashboard:
|
|
| 682 |
filtered_data['evaluations'] = df
|
| 683 |
|
| 684 |
# Create tabs
|
| 685 |
-
tab1, tab2, tab3 = st.tabs([
|
| 686 |
"π Executive Summary",
|
| 687 |
"π€ Agent Performance",
|
| 688 |
-
"π‘οΈ Safety Analysis"
|
|
|
|
| 689 |
])
|
| 690 |
|
| 691 |
with tab1:
|
|
@@ -697,6 +1022,9 @@ class HuggingFaceDashboard:
|
|
| 697 |
with tab3:
|
| 698 |
self.show_safety_analysis(filtered_data)
|
| 699 |
|
|
|
|
|
|
|
|
|
|
| 700 |
# Footer
|
| 701 |
st.markdown("---")
|
| 702 |
st.markdown("π **Multi-Agent System Dashboard** | Built with Streamlit & Plotly | Demo hosted on Hugging Face Spaces")
|
|
|
|
| 93 |
)
|
| 94 |
''')
|
| 95 |
|
| 96 |
+
# Create workflow_traces table with enhanced response tracking
|
| 97 |
cursor.execute('''
|
| 98 |
CREATE TABLE IF NOT EXISTS workflow_traces (
|
| 99 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
|
|
| 103 |
step_type TEXT,
|
| 104 |
input_data TEXT,
|
| 105 |
output_data TEXT,
|
| 106 |
+
response_metadata TEXT,
|
| 107 |
+
token_count INTEGER,
|
| 108 |
+
response_length INTEGER,
|
| 109 |
execution_time_ms REAL,
|
| 110 |
error_occurred BOOLEAN DEFAULT FALSE,
|
| 111 |
error_details TEXT,
|
|
|
|
| 113 |
)
|
| 114 |
''')
|
| 115 |
|
| 116 |
+
# Create response_analysis table for detailed response tracking
|
| 117 |
+
cursor.execute('''
|
| 118 |
+
CREATE TABLE IF NOT EXISTS response_analysis (
|
| 119 |
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 120 |
+
evaluation_id INTEGER,
|
| 121 |
+
session_id TEXT NOT NULL,
|
| 122 |
+
agent_name TEXT NOT NULL,
|
| 123 |
+
response_text TEXT NOT NULL,
|
| 124 |
+
response_length INTEGER,
|
| 125 |
+
word_count INTEGER,
|
| 126 |
+
sentence_count INTEGER,
|
| 127 |
+
readability_score REAL,
|
| 128 |
+
sentiment_score REAL,
|
| 129 |
+
key_topics TEXT,
|
| 130 |
+
response_type TEXT,
|
| 131 |
+
contains_code BOOLEAN DEFAULT FALSE,
|
| 132 |
+
contains_links BOOLEAN DEFAULT FALSE,
|
| 133 |
+
language_detected TEXT DEFAULT 'en',
|
| 134 |
+
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
|
| 135 |
+
FOREIGN KEY (evaluation_id) REFERENCES evaluation_logs (id)
|
| 136 |
+
)
|
| 137 |
+
''')
|
| 138 |
+
|
| 139 |
# Insert demo data
|
| 140 |
self.insert_demo_data(cursor)
|
| 141 |
|
|
|
|
| 275 |
]
|
| 276 |
}
|
| 277 |
|
| 278 |
+
# Generate more detailed response based on agent type
|
| 279 |
+
base_response = random.choice(response_templates[agent])
|
| 280 |
+
|
| 281 |
+
# Add specific details based on agent type
|
| 282 |
+
if agent == "Diet Agent":
|
| 283 |
+
details = [
|
| 284 |
+
"Key recommendations: 1) Focus on whole foods, 2) Control portions, 3) Stay hydrated",
|
| 285 |
+
"Nutritional guidelines: Aim for 50% vegetables, 25% lean protein, 25% complex carbs",
|
| 286 |
+
"Meal timing: Consider eating every 3-4 hours to maintain stable blood sugar",
|
| 287 |
+
"Sample foods: Quinoa, salmon, leafy greens, berries, nuts, and legumes"
|
| 288 |
+
]
|
| 289 |
+
elif agent == "Support Agent":
|
| 290 |
+
details = [
|
| 291 |
+
"Action steps: 1) Identify triggers, 2) Develop coping strategies, 3) Practice regularly",
|
| 292 |
+
"Techniques to try: Deep breathing, progressive muscle relaxation, mindfulness meditation",
|
| 293 |
+
"Timeline: Start with 5-10 minutes daily, gradually increase as comfort grows",
|
| 294 |
+
"Resources: Consider apps like Headspace, Calm, or consulting a professional"
|
| 295 |
+
]
|
| 296 |
+
else: # Queries Agent
|
| 297 |
+
details = [
|
| 298 |
+
"Technical overview: This involves complex algorithms and data processing methods",
|
| 299 |
+
"Current applications: Used in healthcare, finance, transportation, and entertainment",
|
| 300 |
+
"Future implications: Expected to revolutionize how we work and interact with technology",
|
| 301 |
+
"Key considerations: Privacy, security, ethical implications, and regulatory frameworks"
|
| 302 |
+
]
|
| 303 |
+
|
| 304 |
+
response = f"{base_response}\n\n{random.choice(details)}"
|
| 305 |
|
| 306 |
# Generate correlated scores (realistic relationships)
|
| 307 |
relevance_score = max(0, min(10, base_score + random.uniform(-0.3, 0.3)))
|
|
|
|
| 357 |
llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
|
| 358 |
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 359 |
''', eval_data)
|
| 360 |
+
|
| 361 |
+
# Get the evaluation ID for response analysis
|
| 362 |
+
evaluation_id = cursor.lastrowid
|
| 363 |
+
|
| 364 |
+
# Insert detailed response analysis
|
| 365 |
+
self.insert_response_analysis(cursor, evaluation_id, session_id, agent, response, timestamp)
|
| 366 |
+
|
| 367 |
+
def insert_response_analysis(self, cursor, evaluation_id, session_id, agent_name, response_text, timestamp):
    """Compute simple text metrics for a response and store them in ``response_analysis``.

    The metrics are lightweight heuristics: length and word/sentence counts,
    a sentence-length based readability score (0-10), a keyword based
    sentiment score (0-10, 5 is neutral), coarse topic tags, a response type
    and flags for code-like or link-like content.

    Args:
        cursor: Cursor exposing ``execute(sql, params)`` with ``?`` placeholders.
        evaluation_id: Value stored in the ``evaluation_id`` column.
        session_id: Value stored in the ``session_id`` column.
        agent_name: Value stored in the ``agent_name`` column.
        response_text: Response body to analyse (a string).
        timestamp: Object with an ``isoformat()`` method (e.g. ``datetime``),
            or an already formatted value, which is stored via ``str()``.
    """
    import re

    text_lower = response_text.lower()

    def has_term(term):
        # Match keywords at a word start so that 'ai' no longer fires on
        # "maintain" or 'issue' on "tissue". Very short keywords must match a
        # whole word ('is' vs "island"); longer ones act as stems so that
        # inflections still count ('recommend' -> "recommendations").
        tail = r'\b' if len(term) <= 3 else ''
        return re.search(r'\b' + re.escape(term) + tail, text_lower) is not None

    def has_any(terms):
        return any(has_term(term) for term in terms)

    # Basic size metrics.
    response_length = len(response_text)
    word_count = len(response_text.split())
    # Count non-empty segments instead of "splits minus one", so a final
    # sentence without closing punctuation is still counted.
    sentence_count = sum(
        1 for segment in re.split(r'[.!?]+', response_text) if segment.strip()
    )

    # Readability: 10 at <= 15 words per sentence, dropping 0.1 per extra word.
    if sentence_count > 0 and word_count > 0:
        avg_sentence_length = word_count / sentence_count
        readability_score = max(0, min(10, 10 - (avg_sentence_length - 15) * 0.1))
    else:
        readability_score = 5.0

    # Sentiment: share of positive vs. negative keywords mapped onto 0-10.
    positive_words = ['good', 'great', 'excellent', 'helpful', 'recommend', 'beneficial', 'effective', 'important', 'valuable', 'useful']
    negative_words = ['bad', 'poor', 'difficult', 'problem', 'issue', 'concern', 'warning', 'avoid', 'risk', 'danger']
    positive_count = sum(1 for word in positive_words if has_term(word))
    negative_count = sum(1 for word in negative_words if has_term(word))
    matched = positive_count + negative_count
    if matched > 0:
        sentiment_score = (positive_count - negative_count) / matched * 5 + 5
    else:
        sentiment_score = 5.0  # Neutral

    # Topic tags, emitted in this fixed order.
    topic_rules = (
        ('nutrition', ('diet', 'food', 'nutrition')),
        ('fitness', ('exercise', 'workout', 'fitness')),
        ('mental_health', ('stress', 'anxiety', 'mental')),
        ('technology', ('technology', 'ai', 'algorithm')),
        ('health', ('health', 'medical')),
    )
    keywords = [topic for topic, terms in topic_rules if has_any(terms)]
    key_topics = ','.join(keywords) if keywords else 'general'

    # Response type: first matching rule wins.
    if '?' in response_text:
        response_type = 'question'
    elif has_any(('recommend', 'suggest', 'try', 'consider')):
        response_type = 'recommendation'
    elif has_any(('explain', 'definition', 'means', 'is')):
        response_type = 'explanation'
    else:
        response_type = 'general'

    # Code / link heuristics; the TLD alternatives must end at a word boundary.
    contains_code = bool(re.search(r'```|`.*`|\bcode\b|\bfunction\b|\bclass\b', response_text))
    contains_links = bool(re.search(r'http[s]?://|www\.|\.(?:com|org)\b', response_text))

    # Accept both datetime-like objects and pre-formatted timestamps.
    if hasattr(timestamp, 'isoformat'):
        timestamp_value = timestamp.isoformat()
    else:
        timestamp_value = str(timestamp)

    cursor.execute('''
        INSERT INTO response_analysis (
            evaluation_id, session_id, agent_name, response_text, response_length,
            word_count, sentence_count, readability_score, sentiment_score,
            key_topics, response_type, contains_code, contains_links,
            language_detected, timestamp
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', (
        evaluation_id, session_id, agent_name, response_text, response_length,
        word_count, sentence_count, readability_score, sentiment_score,
        key_topics, response_type, contains_code, contains_links,
        'en', timestamp_value
    ))
|
| 439 |
|
| 440 |
def safe_column_access(self, df: pd.DataFrame, column: str, default_value=None):
|
| 441 |
"""Safely access DataFrame columns"""
|
|
|
|
| 455 |
# Base queries
|
| 456 |
eval_query = "SELECT * FROM evaluation_logs"
|
| 457 |
trace_query = "SELECT * FROM workflow_traces"
|
| 458 |
+
response_analysis_query = "SELECT * FROM response_analysis"
|
| 459 |
|
| 460 |
# Apply filters
|
| 461 |
conditions = []
|
|
|
|
| 477 |
if conditions:
|
| 478 |
eval_query += " WHERE " + " AND ".join(conditions)
|
| 479 |
trace_query += " WHERE " + " AND ".join(conditions)
|
| 480 |
+
response_analysis_query += " WHERE " + " AND ".join(conditions)
|
| 481 |
|
| 482 |
# Load data
|
| 483 |
evaluations = pd.read_sql_query(eval_query, conn, params=params)
|
| 484 |
traces = pd.read_sql_query(trace_query, conn, params=params)
|
| 485 |
|
| 486 |
+
# Load response analysis data (handle if table doesn't exist yet)
|
| 487 |
+
try:
|
| 488 |
+
response_analysis = pd.read_sql_query(response_analysis_query, conn, params=params)
|
| 489 |
+
except Exception:
|
| 490 |
+
response_analysis = pd.DataFrame()
|
| 491 |
+
|
| 492 |
conn.close()
|
| 493 |
|
| 494 |
# Convert timestamp columns
|
|
|
|
| 496 |
evaluations['timestamp'] = pd.to_datetime(evaluations['timestamp'])
|
| 497 |
if not traces.empty:
|
| 498 |
traces['timestamp'] = pd.to_datetime(traces['timestamp'])
|
| 499 |
+
if not response_analysis.empty:
|
| 500 |
+
response_analysis['timestamp'] = pd.to_datetime(response_analysis['timestamp'])
|
| 501 |
|
| 502 |
return {
|
| 503 |
'evaluations': evaluations,
|
| 504 |
+
'traces': traces,
|
| 505 |
+
'response_analysis': response_analysis
|
| 506 |
}
|
| 507 |
|
| 508 |
except Exception as e:
|
| 509 |
st.error(f"Error loading data: {str(e)}")
|
| 510 |
+
return {'evaluations': pd.DataFrame(), 'traces': pd.DataFrame(), 'response_analysis': pd.DataFrame()}
|
| 511 |
|
| 512 |
def create_sidebar_filters(self, data: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
|
| 513 |
"""Create sidebar filters"""
|
|
|
|
| 787 |
annotation_text="95% Target")
|
| 788 |
st.plotly_chart(fig, use_container_width=True)
|
| 789 |
|
| 790 |
+
def show_response_analysis(self, data: Dict[str, pd.DataFrame]):
    """Render the response analysis tab: summary metrics, charts, search and export.

    Args:
        data: Mapping of DataFrames. ``data['evaluations']`` is required;
            ``data['response_analysis']`` is optional and, when missing or
            empty, the analysis metrics show "N/A" and the charts are skipped.
    """
    st.header("π Response Analysis & Tracing")

    if data['evaluations'].empty:
        st.warning("No evaluation data available")
        return

    df_eval = data['evaluations']
    df_analysis = data.get('response_analysis', pd.DataFrame())
    has_analysis = not df_analysis.empty

    # --- Overview metrics -------------------------------------------------
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        avg_response_length = df_eval['response'].str.len().mean() if 'response' in df_eval.columns else 0
        st.metric("Avg Response Length", f"{avg_response_length:.0f} chars")

    with col2:
        if has_analysis:
            avg_word_count = df_analysis['word_count'].mean()
            st.metric("Avg Word Count", f"{avg_word_count:.0f} words")
        else:
            st.metric("Avg Word Count", "N/A")

    with col3:
        if has_analysis:
            avg_readability = df_analysis['readability_score'].mean()
            st.metric("Avg Readability", f"{avg_readability:.1f}/10")
        else:
            st.metric("Avg Readability", "N/A")

    with col4:
        if has_analysis:
            avg_sentiment = df_analysis['sentiment_score'].mean()
            st.metric("Avg Sentiment", f"{avg_sentiment:.1f}/10")
        else:
            st.metric("Avg Sentiment", "N/A")

    # --- Charts (only when analysis rows exist) ----------------------------
    if has_analysis:
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("π Response Length Distribution")
            fig = px.histogram(
                df_analysis,
                x='response_length',
                nbins=20,
                title="Response Length Distribution",
                labels={'response_length': 'Response Length (characters)', 'count': 'Frequency'}
            )
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            st.subheader("π Readability vs Sentiment")
            fig = px.scatter(
                df_analysis,
                x='readability_score',
                y='sentiment_score',
                color='agent_name',
                title="Readability vs Sentiment by Agent",
                labels={'readability_score': 'Readability Score', 'sentiment_score': 'Sentiment Score'}
            )
            st.plotly_chart(fig, use_container_width=True)

        col1, col2 = st.columns(2)

        with col1:
            st.subheader("π·οΈ Response Types")
            response_types = df_analysis['response_type'].value_counts()
            fig = px.pie(
                values=response_types.values,
                names=response_types.index,
                title="Distribution of Response Types"
            )
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            st.subheader("π Key Topics")
            # key_topics holds comma-separated tags; flatten them before counting.
            all_topics = [
                topic
                for topics in df_analysis['key_topics'].dropna()
                for topic in topics.split(',')
            ]

            if all_topics:
                topic_counts = pd.Series(all_topics).value_counts().head(10)
                fig = px.bar(
                    x=topic_counts.values,
                    y=topic_counts.index,
                    orientation='h',
                    title="Top 10 Key Topics",
                    labels={'x': 'Frequency', 'y': 'Topics'}
                )
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.info("No topic data available")

    # --- Response tracing ---------------------------------------------------
    st.subheader("π Response Tracing")

    search_term = st.text_input("π Search in responses:", placeholder="Enter keywords to search...")
    search_term = search_term.strip() if search_term else ""

    if search_term:
        # regex=False: the input is user-typed keywords, not a pattern, so
        # characters such as "(" or "+" must not be parsed as regex syntax.
        mask = df_eval['response'].str.contains(search_term, case=False, na=False, regex=False)
        filtered_responses = df_eval[mask]
    else:
        filtered_responses = df_eval.head(10)  # Show first 10 by default

    if not filtered_responses.empty:
        st.write(f"**Found {len(filtered_responses)} responses**")

        # Optional per-dimension scores, shown only when the column exists.
        optional_scores = (
            ('Relevance', 'relevance_score'),
            ('Accuracy', 'accuracy_score'),
            ('Completeness', 'completeness_score'),
            ('Coherence', 'coherence_score'),
        )

        for _, row in filtered_responses.iterrows():
            with st.expander(f"π€ {row['agent_name']} - Session: {row['session_id'][:8]}... - Score: {row['overall_score']:.1f}"):
                col1, col2 = st.columns([2, 1])

                with col1:
                    st.write("**Query:**")
                    st.write(row['query'])

                    st.write("**Response:**")
                    st.write(row['response'])

                with col2:
                    st.write("**Evaluation Scores:**")
                    st.write(f"Overall: {row['overall_score']:.1f}/10")
                    for label, column in optional_scores:
                        if column in row:
                            st.write(f"{label}: {row[column]:.1f}/10")

                    st.write("**Metadata:**")
                    st.write(f"Timestamp: {row['timestamp']}")
                    st.write(f"Response Time: {row['execution_time_ms']:.0f}ms")
                    st.write(f"Safety: {'✅ Passed' if row['guardrails_passed'] else '❌ Failed'}")

                    # Attach the stored analysis for this evaluation, if any.
                    if has_analysis:
                        analysis_row = df_analysis[df_analysis['evaluation_id'] == row['id']]
                        if not analysis_row.empty:
                            analysis = analysis_row.iloc[0]
                            st.write("**Response Analysis:**")
                            st.write(f"Length: {analysis['response_length']} chars")
                            st.write(f"Words: {analysis['word_count']}")
                            st.write(f"Readability: {analysis['readability_score']:.1f}/10")
                            st.write(f"Sentiment: {analysis['sentiment_score']:.1f}/10")
                            st.write(f"Type: {analysis['response_type']}")
                            st.write(f"Topics: {analysis['key_topics']}")
    else:
        st.info("No responses found matching your search criteria.")

    # --- Export ---------------------------------------------------------------
    st.subheader("π€ Export Response Data")
    col1, col2 = st.columns(2)

    with col1:
        if st.button("π Export Evaluation Data"):
            csv = df_eval.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name="evaluation_responses.csv",
                mime="text/csv"
            )

    with col2:
        if has_analysis and st.button("π Export Analysis Data"):
            csv = df_analysis.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name="response_analysis.csv",
                mime="text/csv"
            )
|
| 971 |
+
|
| 972 |
def run(self):
|
| 973 |
"""Run the dashboard"""
|
| 974 |
st.title("π€ Multi-Agent System Dashboard - Demo")
|
|
|
|
| 1006 |
filtered_data['evaluations'] = df
|
| 1007 |
|
| 1008 |
# Create tabs
|
| 1009 |
+
tab1, tab2, tab3, tab4 = st.tabs([
|
| 1010 |
"π Executive Summary",
|
| 1011 |
"π€ Agent Performance",
|
| 1012 |
+
"π‘οΈ Safety Analysis",
|
| 1013 |
+
"π Response Analysis"
|
| 1014 |
])
|
| 1015 |
|
| 1016 |
with tab1:
|
|
|
|
| 1022 |
with tab3:
|
| 1023 |
self.show_safety_analysis(filtered_data)
|
| 1024 |
|
| 1025 |
+
with tab4:
|
| 1026 |
+
self.show_response_analysis(filtered_data)
|
| 1027 |
+
|
| 1028 |
# Footer
|
| 1029 |
st.markdown("---")
|
| 1030 |
st.markdown("π **Multi-Agent System Dashboard** | Built with Streamlit & Plotly | Demo hosted on Hugging Face Spaces")
|