Spaces:
Sleeping
Sleeping
Upload app.py
Browse files- src/app.py +740 -30
src/app.py
CHANGED
|
@@ -81,9 +81,14 @@ class HuggingFaceDashboard:
|
|
| 81 |
accuracy_score REAL,
|
| 82 |
completeness_score REAL,
|
| 83 |
coherence_score REAL,
|
|
|
|
| 84 |
guardrails_passed BOOLEAN,
|
| 85 |
safety_score REAL,
|
| 86 |
execution_time_ms REAL,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
error_occurred BOOLEAN DEFAULT FALSE,
|
| 88 |
llm_provider TEXT,
|
| 89 |
model_name TEXT,
|
|
@@ -259,19 +264,19 @@ class HuggingFaceDashboard:
|
|
| 259 |
# Generate realistic response
|
| 260 |
response_templates = {
|
| 261 |
"Diet Agent": [
|
| 262 |
-
f"
|
| 263 |
-
f"
|
| 264 |
-
f"
|
| 265 |
],
|
| 266 |
"Support Agent": [
|
| 267 |
-
f"I
|
| 268 |
-
f"Thank you for sharing your
|
| 269 |
-
f"
|
| 270 |
],
|
| 271 |
"Queries Agent": [
|
| 272 |
-
f"
|
| 273 |
-
f"
|
| 274 |
-
f"
|
| 275 |
]
|
| 276 |
}
|
| 277 |
|
|
@@ -281,26 +286,30 @@ class HuggingFaceDashboard:
|
|
| 281 |
# Add specific details based on agent type
|
| 282 |
if agent == "Diet Agent":
|
| 283 |
details = [
|
| 284 |
-
"Key
|
| 285 |
-
|
| 286 |
-
"Meal
|
| 287 |
-
|
|
|
|
| 288 |
]
|
| 289 |
elif agent == "Support Agent":
|
| 290 |
details = [
|
| 291 |
-
"
|
| 292 |
-
|
| 293 |
-
"
|
| 294 |
-
|
|
|
|
| 295 |
]
|
| 296 |
else: # Queries Agent
|
| 297 |
details = [
|
| 298 |
-
"Technical
|
| 299 |
-
|
| 300 |
-
"
|
| 301 |
-
|
|
|
|
| 302 |
]
|
| 303 |
|
|
|
|
| 304 |
response = f"{base_response}\n\n{random.choice(details)}"
|
| 305 |
|
| 306 |
# Generate correlated scores (realistic relationships)
|
|
@@ -309,6 +318,23 @@ class HuggingFaceDashboard:
|
|
| 309 |
completeness_score = max(0, min(10, base_score + random.uniform(-0.5, 0.3)))
|
| 310 |
coherence_score = max(0, min(10, base_score + random.uniform(-0.2, 0.4)))
|
| 311 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
# Realistic safety scenarios
|
| 313 |
safety_pass_rate = 0.95 # 95% pass rate
|
| 314 |
if random.random() < 0.02: # 2% chance of safety issues
|
|
@@ -338,11 +364,16 @@ class HuggingFaceDashboard:
|
|
| 338 |
accuracy_score, # accuracy_score
|
| 339 |
completeness_score, # completeness_score
|
| 340 |
coherence_score, # coherence_score
|
|
|
|
| 341 |
guardrails_passed, # guardrails_passed
|
| 342 |
safety_score, # safety_score
|
| 343 |
execution_time, # execution_time_ms
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
False, # error_occurred
|
| 345 |
-
|
| 346 |
"gpt-4o", # model_name
|
| 347 |
f"Comprehensive evaluation for {agent}: The response demonstrates good understanding of the query with appropriate depth and accuracy. Score breakdown reflects the quality across multiple dimensions.", # judge_reasoning
|
| 348 |
guardrails_failures, # guardrails_failures
|
|
@@ -353,16 +384,17 @@ class HuggingFaceDashboard:
|
|
| 353 |
INSERT INTO evaluation_logs (
|
| 354 |
session_id, agent_name, query, response, overall_score,
|
| 355 |
relevance_score, accuracy_score, completeness_score, coherence_score,
|
| 356 |
-
guardrails_passed, safety_score, execution_time_ms,
|
|
|
|
| 357 |
llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
|
| 358 |
-
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 359 |
''', eval_data)
|
| 360 |
|
| 361 |
# Get the evaluation ID for response analysis
|
| 362 |
evaluation_id = cursor.lastrowid
|
| 363 |
|
| 364 |
# Insert detailed response analysis
|
| 365 |
-
self.insert_response_analysis(cursor, evaluation_id,
|
| 366 |
|
| 367 |
def insert_response_analysis(self, cursor, evaluation_id, session_id, agent_name, response_text, timestamp):
|
| 368 |
"""Insert detailed response analysis data"""
|
|
@@ -557,6 +589,47 @@ class HuggingFaceDashboard:
|
|
| 557 |
value=False
|
| 558 |
)
|
| 559 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 560 |
return filters
|
| 561 |
|
| 562 |
def show_executive_summary(self, data: Dict[str, pd.DataFrame]):
|
|
@@ -593,10 +666,31 @@ class HuggingFaceDashboard:
|
|
| 593 |
st.metric("Unique Sessions", f"{unique_sessions:,}")
|
| 594 |
|
| 595 |
# Performance trends
|
| 596 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 597 |
|
| 598 |
# Daily performance trend
|
| 599 |
-
df_daily =
|
| 600 |
'overall_score': 'mean',
|
| 601 |
'execution_time_ms': 'mean',
|
| 602 |
'guardrails_passed': lambda x: (x.sum() / len(x)) * 100
|
|
@@ -969,6 +1063,553 @@ class HuggingFaceDashboard:
|
|
| 969 |
mime="text/csv"
|
| 970 |
)
|
| 971 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 972 |
def run(self):
|
| 973 |
"""Run the dashboard"""
|
| 974 |
st.title("π€ Multi-Agent System Dashboard - Demo")
|
|
@@ -1003,14 +1644,33 @@ class HuggingFaceDashboard:
|
|
| 1003 |
if filters.get('safety_only', False):
|
| 1004 |
df = df[df['guardrails_passed'] == True]
|
| 1005 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1006 |
filtered_data['evaluations'] = df
|
| 1007 |
|
| 1008 |
# Create tabs
|
| 1009 |
-
tab1, tab2, tab3, tab4 = st.tabs([
|
| 1010 |
"π Executive Summary",
|
| 1011 |
"π€ Agent Performance",
|
| 1012 |
"π‘οΈ Safety Analysis",
|
| 1013 |
-
"π Response Analysis"
|
|
|
|
|
|
|
| 1014 |
])
|
| 1015 |
|
| 1016 |
with tab1:
|
|
@@ -1025,9 +1685,59 @@ class HuggingFaceDashboard:
|
|
| 1025 |
with tab4:
|
| 1026 |
self.show_response_analysis(filtered_data)
|
| 1027 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1028 |
# Footer
|
| 1029 |
st.markdown("---")
|
| 1030 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1031 |
|
| 1032 |
if __name__ == "__main__":
|
| 1033 |
dashboard = HuggingFaceDashboard()
|
|
|
|
| 81 |
accuracy_score REAL,
|
| 82 |
completeness_score REAL,
|
| 83 |
coherence_score REAL,
|
| 84 |
+
hallucination_score REAL,
|
| 85 |
guardrails_passed BOOLEAN,
|
| 86 |
safety_score REAL,
|
| 87 |
execution_time_ms REAL,
|
| 88 |
+
input_tokens INTEGER,
|
| 89 |
+
output_tokens INTEGER,
|
| 90 |
+
total_tokens INTEGER,
|
| 91 |
+
cost_usd REAL,
|
| 92 |
error_occurred BOOLEAN DEFAULT FALSE,
|
| 93 |
llm_provider TEXT,
|
| 94 |
model_name TEXT,
|
|
|
|
| 264 |
# Generate realistic response
|
| 265 |
response_templates = {
|
| 266 |
"Diet Agent": [
|
| 267 |
+
f"Thank you for your question about nutrition and dietary guidance. I'd be happy to help you develop a healthier relationship with food and create sustainable eating habits.",
|
| 268 |
+
f"I understand you're looking for dietary advice, and I'm here to provide evidence-based nutritional guidance tailored to your specific needs and goals.",
|
| 269 |
+
f"Great question about nutrition! Let me share some comprehensive dietary recommendations that can help you achieve better health outcomes."
|
| 270 |
],
|
| 271 |
"Support Agent": [
|
| 272 |
+
f"I appreciate you reaching out for support. It takes courage to ask for help, and I'm here to provide you with practical strategies and emotional guidance.",
|
| 273 |
+
f"Thank you for sharing your concerns with me. I understand this can be challenging, and I want to help you work through this step by step with compassion and understanding.",
|
| 274 |
+
f"I'm glad you've come to me for support. Your feelings are valid, and together we can explore effective coping strategies and build resilience."
|
| 275 |
],
|
| 276 |
"Queries Agent": [
|
| 277 |
+
f"Excellent question! This is a fascinating topic that involves cutting-edge technology and has significant implications for our future. Let me provide you with a comprehensive overview.",
|
| 278 |
+
f"Thank you for this thought-provoking question. This subject encompasses multiple disciplines and recent innovations. I'll break this down into key concepts and practical applications.",
|
| 279 |
+
f"Great inquiry! This is an evolving field with exciting developments. Let me explain the fundamental principles and explore the current state of research and implementation."
|
| 280 |
]
|
| 281 |
}
|
| 282 |
|
|
|
|
| 286 |
# Add specific details based on agent type
|
| 287 |
if agent == "Diet Agent":
|
| 288 |
details = [
|
| 289 |
+
"**Key Nutritional Recommendations:**\n\n1. **Whole Foods Focus**: Prioritize unprocessed foods like fresh fruits, vegetables, whole grains, lean proteins, and healthy fats. These provide essential nutrients and fiber while avoiding added sugars and preservatives.\n\n2. **Portion Control**: Use the plate method - fill half your plate with non-starchy vegetables, one quarter with lean protein, and one quarter with complex carbohydrates.\n\n3. **Hydration**: Aim for 8-10 glasses of water daily. Proper hydration supports metabolism, digestion, and overall health.\n\n4. **Meal Timing**: Eat regular meals every 3-4 hours to maintain stable blood sugar levels and prevent overeating.\n\n**Sample Daily Meal Plan:**\n- Breakfast: Greek yogurt with berries and nuts\n- Lunch: Quinoa salad with grilled chicken and vegetables\n- Dinner: Baked salmon with roasted sweet potatoes and broccoli\n- Snacks: Apple with almond butter, or handful of mixed nuts",
|
| 290 |
+
|
| 291 |
+
"**Evidence-Based Dietary Guidelines:**\n\n1. **Macronutrient Balance**: Aim for 45-65% carbohydrates (focus on complex carbs), 20-35% healthy fats, and 10-35% protein based on your activity level.\n\n2. **Micronutrient Density**: Choose foods rich in vitamins, minerals, and antioxidants. Include colorful fruits and vegetables to ensure variety.\n\n3. **Fiber Intake**: Target 25-35 grams daily through whole grains, legumes, fruits, and vegetables to support digestive health.\n\n4. **Healthy Fats**: Include omega-3 fatty acids from fish, walnuts, and flaxseeds, while limiting saturated and trans fats.\n\n**Practical Implementation Tips:**\n- Meal prep on weekends to ensure healthy options are available\n- Read nutrition labels to make informed choices\n- Practice mindful eating by eating slowly and paying attention to hunger cues\n- Keep a food diary to track patterns and identify areas for improvement",
|
| 292 |
+
|
| 293 |
+
"**Personalized Nutrition Approach:**\n\nEvery individual has unique nutritional needs based on age, gender, activity level, health conditions, and personal preferences. Here's how to customize your approach:\n\n1. **Assessment**: Consider your current health status, goals (weight management, energy levels, disease prevention), and any dietary restrictions.\n\n2. **Gradual Changes**: Implement changes slowly to ensure sustainability. Start with one or two modifications per week.\n\n3. **Professional Guidance**: Consider consulting with a registered dietitian for personalized meal planning, especially if you have specific health conditions.\n\n4. **Regular Monitoring**: Track your progress through energy levels, sleep quality, and how you feel overall, not just weight.\n\n**Common Nutritional Myths Debunked:**\n- Carbs aren't inherently bad - choose complex carbohydrates over simple sugars\n- Fat doesn't make you fat - healthy fats are essential for hormone production and nutrient absorption\n- Skipping meals doesn't help with weight loss and can lead to overeating later"
|
| 294 |
]
|
| 295 |
elif agent == "Support Agent":
|
| 296 |
details = [
|
| 297 |
+
"**Comprehensive Support Strategy:**\n\n**Immediate Coping Techniques:**\n1. **Deep Breathing**: Practice the 4-7-8 technique - inhale for 4 counts, hold for 7, exhale for 8. This activates your parasympathetic nervous system.\n\n2. **Grounding Exercises**: Use the 5-4-3-2-1 method - identify 5 things you can see, 4 you can touch, 3 you can hear, 2 you can smell, and 1 you can taste.\n\n3. **Progressive Muscle Relaxation**: Tense and release each muscle group from toes to head, holding tension for 5 seconds before releasing.\n\n**Long-term Strategies:**\n- Establish a consistent daily routine to provide structure and predictability\n- Practice mindfulness meditation for 10-15 minutes daily\n- Maintain a journal to process emotions and identify patterns\n- Build a support network of trusted friends, family, or support groups\n\n**Professional Resources:**\nConsider reaching out to mental health professionals if you're experiencing persistent difficulties. Many offer telehealth options for convenience.",
|
| 298 |
+
|
| 299 |
+
"**Building Emotional Resilience:**\n\n**Understanding Your Emotions:**\nEmotions are natural responses to life events. Learning to recognize, understand, and manage them is a skill that can be developed with practice.\n\n**Practical Steps:**\n1. **Emotion Identification**: Use an emotion wheel or journal to name specific feelings rather than general terms like 'bad' or 'stressed.'\n\n2. **Trigger Awareness**: Notice what situations, people, or thoughts tend to trigger difficult emotions.\n\n3. **Response vs. Reaction**: Create a pause between feeling and action. Ask yourself: 'What would be most helpful right now?'\n\n4. **Self-Compassion**: Treat yourself with the same kindness you'd offer a good friend facing similar challenges.\n\n**Daily Practices:**\n- Morning intention setting (5 minutes)\n- Midday check-in with your emotional state\n- Evening reflection on what went well and what you learned\n- Regular physical activity to support mental health\n\n**Crisis Resources:**\nIf you're experiencing thoughts of self-harm, please reach out immediately to a crisis hotline, emergency services, or trusted healthcare provider.",
|
| 300 |
+
|
| 301 |
+
"**Stress Management and Well-being:**\n\n**Understanding Stress:**\nStress is a normal part of life, but chronic stress can impact your physical and mental health. Learning effective management techniques is crucial for long-term well-being.\n\n**Evidence-Based Techniques:**\n1. **Cognitive Restructuring**: Challenge negative thought patterns by asking: 'Is this thought realistic? What evidence supports or contradicts it? What would I tell a friend in this situation?'\n\n2. **Time Management**: Use techniques like the Pomodoro method, prioritization matrices, and saying no to non-essential commitments.\n\n3. **Physical Self-Care**: Regular exercise, adequate sleep (7-9 hours), and proper nutrition form the foundation of stress resilience.\n\n4. **Social Connection**: Maintain relationships with supportive people. Even brief positive interactions can improve mood and reduce stress.\n\n**Creating Your Personal Toolkit:**\n- Identify 3-5 coping strategies that work best for you\n- Practice them regularly, not just during stressful times\n- Adjust and refine your approach based on what's most effective\n- Remember that seeking help is a sign of strength, not weakness"
|
| 302 |
]
|
| 303 |
else: # Queries Agent
|
| 304 |
details = [
|
| 305 |
+
"**Technical Deep Dive:**\n\n**Fundamental Concepts:**\nThis technology represents a convergence of multiple disciplines including computer science, mathematics, engineering, and domain-specific expertise. The underlying principles involve complex algorithms, data structures, and computational methods.\n\n**Current Implementation:**\n1. **Healthcare**: AI-powered diagnostic tools, personalized treatment plans, drug discovery acceleration, and robotic surgery assistance.\n\n2. **Finance**: Algorithmic trading, fraud detection, risk assessment, and automated customer service through chatbots.\n\n3. **Transportation**: Autonomous vehicles, traffic optimization, predictive maintenance, and route planning algorithms.\n\n4. **Entertainment**: Recommendation systems, content generation, virtual reality experiences, and interactive gaming.\n\n**Technical Architecture:**\n- Data processing pipelines that handle massive datasets in real-time\n- Machine learning models trained on diverse, high-quality datasets\n- Cloud infrastructure enabling scalable deployment and accessibility\n- APIs and interfaces that allow integration with existing systems\n\n**Performance Metrics:**\nSuccess is measured through accuracy rates, processing speed, user engagement, cost efficiency, and real-world impact on problem-solving.",
|
| 306 |
+
|
| 307 |
+
"**Industry Applications and Impact:**\n\n**Current Market Landscape:**\nThe technology sector is experiencing rapid transformation with significant investments in research and development. Major companies are competing to develop more efficient, ethical, and accessible solutions.\n\n**Real-World Applications:**\n1. **Smart Cities**: IoT sensors, traffic management, energy optimization, and public safety systems working together to improve urban living.\n\n2. **Environmental Monitoring**: Satellite imagery analysis, climate modeling, pollution tracking, and renewable energy optimization.\n\n3. **Education**: Personalized learning platforms, automated grading systems, virtual tutors, and accessibility tools for diverse learners.\n\n4. **Manufacturing**: Predictive maintenance, quality control, supply chain optimization, and human-robot collaboration.\n\n**Economic Impact:**\n- Job creation in new fields while transforming traditional roles\n- Increased productivity and efficiency across industries\n- New business models and revenue streams\n- Global competitiveness and innovation drivers\n\n**Challenges and Solutions:**\n- Addressing ethical concerns through responsible development practices\n- Ensuring data privacy and security through robust frameworks\n- Managing the digital divide through inclusive design and accessibility",
|
| 308 |
+
|
| 309 |
+
"**Future Implications and Trends:**\n\n**Emerging Developments:**\nThe field is evolving rapidly with breakthrough research in quantum computing, neuromorphic chips, and advanced algorithms that promise to solve previously intractable problems.\n\n**Next 5-10 Years:**\n1. **Integration**: Seamless integration across platforms and devices, creating more intuitive user experiences.\n\n2. **Personalization**: Highly customized solutions that adapt to individual preferences and needs in real-time.\n\n3. **Sustainability**: Green technology initiatives focusing on energy efficiency and environmental responsibility.\n\n4. **Accessibility**: Universal design principles ensuring technology benefits all users regardless of abilities or circumstances.\n\n**Societal Considerations:**\n- Regulatory frameworks evolving to balance innovation with consumer protection\n- Educational systems adapting to prepare workforce for technological changes\n- International cooperation on standards and ethical guidelines\n- Public discourse on the role of technology in society\n\n**Preparing for the Future:**\n- Continuous learning and skill development\n- Critical thinking about technology's role in daily life\n- Participation in discussions about technology policy and ethics\n- Understanding both opportunities and risks associated with technological advancement"
|
| 310 |
]
|
| 311 |
|
| 312 |
+
# Create a more comprehensive response
|
| 313 |
response = f"{base_response}\n\n{random.choice(details)}"
|
| 314 |
|
| 315 |
# Generate correlated scores (realistic relationships)
|
|
|
|
| 318 |
completeness_score = max(0, min(10, base_score + random.uniform(-0.5, 0.3)))
|
| 319 |
coherence_score = max(0, min(10, base_score + random.uniform(-0.2, 0.4)))
|
| 320 |
|
| 321 |
+
# Generate hallucination score (inverse relationship with accuracy)
|
| 322 |
+
hallucination_score = max(0, min(10, 10 - accuracy_score + random.uniform(-1.0, 1.0)))
|
| 323 |
+
|
| 324 |
+
# Generate token consumption based on response length and agent type
|
| 325 |
+
response_length = len(response)
|
| 326 |
+
input_tokens = len(query.split()) * 1.3 # Rough estimate
|
| 327 |
+
output_tokens = response_length / 4 # Rough estimate (4 chars per token)
|
| 328 |
+
total_tokens = int(input_tokens + output_tokens)
|
| 329 |
+
|
| 330 |
+
# Calculate cost (rough estimates per 1K tokens)
|
| 331 |
+
cost_per_1k_tokens = {
|
| 332 |
+
"azure": 0.03, # GPT-4
|
| 333 |
+
"openai": 0.03,
|
| 334 |
+
"anthropic": 0.025
|
| 335 |
+
}
|
| 336 |
+
cost_usd = (total_tokens / 1000) * cost_per_1k_tokens.get(llm_provider, 0.03)
|
| 337 |
+
|
| 338 |
# Realistic safety scenarios
|
| 339 |
safety_pass_rate = 0.95 # 95% pass rate
|
| 340 |
if random.random() < 0.02: # 2% chance of safety issues
|
|
|
|
| 364 |
accuracy_score, # accuracy_score
|
| 365 |
completeness_score, # completeness_score
|
| 366 |
coherence_score, # coherence_score
|
| 367 |
+
hallucination_score, # hallucination_score
|
| 368 |
guardrails_passed, # guardrails_passed
|
| 369 |
safety_score, # safety_score
|
| 370 |
execution_time, # execution_time_ms
|
| 371 |
+
int(input_tokens), # input_tokens
|
| 372 |
+
int(output_tokens), # output_tokens
|
| 373 |
+
total_tokens, # total_tokens
|
| 374 |
+
round(cost_usd, 4), # cost_usd
|
| 375 |
False, # error_occurred
|
| 376 |
+
llm_provider, # llm_provider
|
| 377 |
"gpt-4o", # model_name
|
| 378 |
f"Comprehensive evaluation for {agent}: The response demonstrates good understanding of the query with appropriate depth and accuracy. Score breakdown reflects the quality across multiple dimensions.", # judge_reasoning
|
| 379 |
guardrails_failures, # guardrails_failures
|
|
|
|
| 384 |
INSERT INTO evaluation_logs (
|
| 385 |
session_id, agent_name, query, response, overall_score,
|
| 386 |
relevance_score, accuracy_score, completeness_score, coherence_score,
|
| 387 |
+
hallucination_score, guardrails_passed, safety_score, execution_time_ms,
|
| 388 |
+
input_tokens, output_tokens, total_tokens, cost_usd, error_occurred,
|
| 389 |
llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
|
| 390 |
+
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 391 |
''', eval_data)
|
| 392 |
|
| 393 |
# Get the evaluation ID for response analysis
|
| 394 |
evaluation_id = cursor.lastrowid
|
| 395 |
|
| 396 |
# Insert detailed response analysis
|
| 397 |
+
self.insert_response_analysis(cursor, evaluation_id, eval_data[0], agent, response, timestamp)
|
| 398 |
|
| 399 |
def insert_response_analysis(self, cursor, evaluation_id, session_id, agent_name, response_text, timestamp):
|
| 400 |
"""Insert detailed response analysis data"""
|
|
|
|
| 589 |
value=False
|
| 590 |
)
|
| 591 |
|
| 592 |
+
# Advanced filters
|
| 593 |
+
st.sidebar.markdown("### π¬ Advanced Filters")
|
| 594 |
+
|
| 595 |
+
# Performance tier filter
|
| 596 |
+
filters['performance_tier'] = st.sidebar.selectbox(
|
| 597 |
+
"π Performance Tier",
|
| 598 |
+
options=["All", "Excellent (8.5+)", "Good (7.0-8.5)", "Needs Improvement (<7.0)"],
|
| 599 |
+
index=0
|
| 600 |
+
)
|
| 601 |
+
|
| 602 |
+
# Response time filter
|
| 603 |
+
if not data['evaluations'].empty:
|
| 604 |
+
max_time = data['evaluations']['execution_time_ms'].max()
|
| 605 |
+
filters['max_response_time'] = st.sidebar.slider(
|
| 606 |
+
"β±οΈ Max Response Time (ms)",
|
| 607 |
+
min_value=0,
|
| 608 |
+
max_value=int(max_time),
|
| 609 |
+
value=int(max_time),
|
| 610 |
+
step=100
|
| 611 |
+
)
|
| 612 |
+
|
| 613 |
+
# Model/Provider filter
|
| 614 |
+
if not data['evaluations'].empty and 'llm_provider' in data['evaluations'].columns:
|
| 615 |
+
providers = data['evaluations']['llm_provider'].unique().tolist()
|
| 616 |
+
filters['providers'] = st.sidebar.multiselect(
|
| 617 |
+
"π€ LLM Providers",
|
| 618 |
+
options=providers,
|
| 619 |
+
default=providers
|
| 620 |
+
)
|
| 621 |
+
|
| 622 |
+
# Auto-refresh option
|
| 623 |
+
filters['auto_refresh'] = st.sidebar.checkbox(
|
| 624 |
+
"π Auto-refresh (30s)",
|
| 625 |
+
value=False,
|
| 626 |
+
help="Automatically refresh data every 30 seconds"
|
| 627 |
+
)
|
| 628 |
+
|
| 629 |
+
if filters.get('auto_refresh', False):
|
| 630 |
+
st.sidebar.success("π Auto-refresh enabled")
|
| 631 |
+
# Add auto-refresh logic here if needed
|
| 632 |
+
|
| 633 |
return filters
|
| 634 |
|
| 635 |
def show_executive_summary(self, data: Dict[str, pd.DataFrame]):
|
|
|
|
| 666 |
st.metric("Unique Sessions", f"{unique_sessions:,}")
|
| 667 |
|
| 668 |
# Performance trends
|
| 669 |
+
col1, col2 = st.columns([3, 1])
|
| 670 |
+
|
| 671 |
+
with col1:
|
| 672 |
+
st.subheader("π Performance Trends")
|
| 673 |
+
|
| 674 |
+
with col2:
|
| 675 |
+
trend_period = st.selectbox(
|
| 676 |
+
"π
Period",
|
| 677 |
+
options=["7 days", "30 days", "All time"],
|
| 678 |
+
index=1,
|
| 679 |
+
key="trend_period"
|
| 680 |
+
)
|
| 681 |
+
|
| 682 |
+
# Filter data based on selected period
|
| 683 |
+
if trend_period == "7 days":
|
| 684 |
+
cutoff_date = datetime.now() - timedelta(days=7)
|
| 685 |
+
trend_df = df[df['timestamp'] >= cutoff_date]
|
| 686 |
+
elif trend_period == "30 days":
|
| 687 |
+
cutoff_date = datetime.now() - timedelta(days=30)
|
| 688 |
+
trend_df = df[df['timestamp'] >= cutoff_date]
|
| 689 |
+
else:
|
| 690 |
+
trend_df = df
|
| 691 |
|
| 692 |
# Daily performance trend
|
| 693 |
+
df_daily = trend_df.groupby(trend_df['timestamp'].dt.date).agg({
|
| 694 |
'overall_score': 'mean',
|
| 695 |
'execution_time_ms': 'mean',
|
| 696 |
'guardrails_passed': lambda x: (x.sum() / len(x)) * 100
|
|
|
|
| 1063 |
mime="text/csv"
|
| 1064 |
)
|
| 1065 |
|
| 1066 |
+
    def show_advanced_analytics(self, data: Dict[str, pd.DataFrame]):
        """Show advanced analytics and insights.

        Renders four sections from the 'evaluations' DataFrame:
        score trends + metric correlations, agent benchmarking,
        rule-based insights/recommendations, and IQR anomaly detection,
        plus a simulated "real-time" metric update behind a button.

        Args:
            data: dict of DataFrames; requires key 'evaluations' and
                optionally 'response_analysis'.
        """
        st.header("π¬ Advanced Analytics & AI Insights")

        # Bail out early: every section below assumes evaluation rows exist.
        if data['evaluations'].empty:
            st.warning("No evaluation data available")
            return

        df_eval = data['evaluations']
        # response_analysis is optional; downstream helpers tolerate an empty frame
        df_analysis = data.get('response_analysis', pd.DataFrame())

        # ---- Performance trends and predictions -------------------------
        st.subheader("π Performance Trends & Predictions")

        col1, col2 = st.columns(2)

        with col1:
            st.write("**π Score Trends Over Time**")
            # Daily performance trend with moving average.
            # NOTE(review): assumes 'timestamp' is datetime64 — .dt.date
            # would raise otherwise; confirm upstream parsing.
            df_daily = df_eval.groupby(df_eval['timestamp'].dt.date).agg({
                'overall_score': ['mean', 'count'],
                'execution_time_ms': 'mean'
            }).reset_index()

            # Flatten the MultiIndex columns produced by the dict/list agg
            df_daily.columns = ['date', 'avg_score', 'count', 'avg_time']

            # 7-day moving average; min_periods=1 so the first days still plot
            df_daily['score_ma'] = df_daily['avg_score'].rolling(window=7, min_periods=1).mean()

            fig = go.Figure()
            # Raw daily points, deliberately faint so the trend line dominates
            fig.add_trace(go.Scatter(
                x=df_daily['date'],
                y=df_daily['avg_score'],
                mode='lines+markers',
                name='Daily Score',
                line=dict(color='lightblue', width=1),
                opacity=0.7
            ))
            fig.add_trace(go.Scatter(
                x=df_daily['date'],
                y=df_daily['score_ma'],
                mode='lines',
                name='7-Day Moving Average',
                line=dict(color='red', width=3)
            ))
            fig.update_layout(
                title="Score Trends with Moving Average",
                xaxis_title="Date",
                yaxis_title="Score",
                height=400
            )
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            st.write("**β‘ Performance Correlation Matrix**")
            # Correlation analysis across whichever score columns exist
            score_cols = ['overall_score', 'relevance_score', 'accuracy_score',
                          'completeness_score', 'coherence_score', 'execution_time_ms']
            available_cols = [col for col in score_cols if col in df_eval.columns]

            # A heat map needs at least 3 metrics to be informative
            if len(available_cols) > 2:
                corr_matrix = df_eval[available_cols].corr()

                fig = px.imshow(
                    corr_matrix,
                    title="Performance Metrics Correlation",
                    color_continuous_scale='RdBu',
                    aspect="auto"
                )
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.info("Need more metrics for correlation analysis")

        # ---- Agent comparison and benchmarking --------------------------
        st.subheader("π Agent Benchmarking & Competition")

        col1, col2, col3 = st.columns(3)

        with col1:
            st.write("**π₯ Agent Leaderboard**")
            leaderboard = df_eval.groupby('agent_name').agg({
                'overall_score': ['mean', 'std', 'count'],
                'execution_time_ms': 'mean'
            }).round(2)

            leaderboard.columns = ['Avg Score', 'Score StdDev', 'Total Evals', 'Avg Time (ms)']
            # Efficiency = quality points per second of response time
            leaderboard['Efficiency'] = (leaderboard['Avg Score'] / (leaderboard['Avg Time (ms)'] / 1000)).round(2)
            leaderboard = leaderboard.sort_values('Avg Score', ascending=False)

            # Add rank and medals; slicing keeps this safe when <3 agents exist
            leaderboard['Rank'] = range(1, len(leaderboard) + 1)
            medals = ['π₯', 'π₯', 'π₯'] + ['π '] * (len(leaderboard) - 3)
            leaderboard['Medal'] = medals[:len(leaderboard)]

            st.dataframe(leaderboard[['Medal', 'Rank', 'Avg Score', 'Efficiency', 'Total Evals']], use_container_width=True)

        with col2:
            st.write("**π Performance Distribution**")
            # Box plot shows per-agent score spread, not just the mean
            fig = px.box(
                df_eval,
                x='agent_name',
                y='overall_score',
                title="Score Distribution by Agent",
                color='agent_name'
            )
            fig.update_layout(height=300, showlegend=False)
            st.plotly_chart(fig, use_container_width=True)

        with col3:
            st.write("**β‘ Speed vs Quality**")
            agent_perf = df_eval.groupby('agent_name').agg({
                'overall_score': 'mean',
                'execution_time_ms': 'mean'
            }).reset_index()

            fig = px.scatter(
                agent_perf,
                x='execution_time_ms',
                y='overall_score',
                size='overall_score',
                color='agent_name',
                title="Speed vs Quality Trade-off",
                labels={'execution_time_ms': 'Response Time (ms)', 'overall_score': 'Quality Score'}
            )
            fig.update_layout(height=300)
            st.plotly_chart(fig, use_container_width=True)

        # ---- AI-powered insights and recommendations --------------------
        st.subheader("π€ AI-Powered Insights & Recommendations")

        # Rule-based text generation (see generate_ai_insights)
        insights = self.generate_ai_insights(df_eval, df_analysis)

        col1, col2 = st.columns(2)

        with col1:
            st.write("**π‘ Key Insights**")
            for insight in insights['insights']:
                st.info(f"π {insight}")

        with col2:
            st.write("**π Recommendations**")
            for rec in insights['recommendations']:
                st.success(f"π‘ {rec}")

        # ---- Performance anomaly detection ------------------------------
        st.subheader("π Anomaly Detection")

        # IQR-based outlier scan (see detect_anomalies)
        anomalies = self.detect_anomalies(df_eval)
        if anomalies:
            st.warning(f"β οΈ Detected {len(anomalies)} potential anomalies:")
            for anomaly in anomalies:
                st.write(f"β’ {anomaly}")
        else:
            st.success("β No performance anomalies detected")

        # ---- Real-time monitoring simulation ----------------------------
        st.subheader("π‘ Real-time Monitoring Simulation")

        if st.button("π Simulate Real-time Update"):
            # Demo only: values are random, not read from the database
            latest_data = self.simulate_realtime_data()

            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Latest Score", f"{latest_data['score']:.2f}", f"{latest_data['score_delta']:+.2f}")
            with col2:
                st.metric("Response Time", f"{latest_data['time']:.0f}ms", f"{latest_data['time_delta']:+.0f}ms")
            with col3:
                st.metric("Safety Status", "β Passed" if latest_data['safe'] else "β Failed")

            st.success("π Dashboard updated with latest data!")
|
| 1239 |
+
|
| 1240 |
+
def generate_ai_insights(self, df_eval, df_analysis):
|
| 1241 |
+
"""Generate AI-powered insights from the data"""
|
| 1242 |
+
insights = []
|
| 1243 |
+
recommendations = []
|
| 1244 |
+
|
| 1245 |
+
# Performance insights
|
| 1246 |
+
best_agent = df_eval.groupby('agent_name')['overall_score'].mean().idxmax()
|
| 1247 |
+
worst_agent = df_eval.groupby('agent_name')['overall_score'].mean().idxmin()
|
| 1248 |
+
|
| 1249 |
+
avg_score = df_eval['overall_score'].mean()
|
| 1250 |
+
score_trend = df_eval.groupby(df_eval['timestamp'].dt.date)['overall_score'].mean()
|
| 1251 |
+
|
| 1252 |
+
if len(score_trend) > 1:
|
| 1253 |
+
recent_trend = score_trend.iloc[-3:].mean() - score_trend.iloc[:3].mean()
|
| 1254 |
+
if recent_trend > 0.5:
|
| 1255 |
+
insights.append(f"Performance is improving! Recent scores are {recent_trend:.1f} points higher than earlier.")
|
| 1256 |
+
elif recent_trend < -0.5:
|
| 1257 |
+
insights.append(f"Performance decline detected. Recent scores are {abs(recent_trend):.1f} points lower.")
|
| 1258 |
+
|
| 1259 |
+
# Agent insights
|
| 1260 |
+
insights.append(f"{best_agent} is the top performer with highest average scores.")
|
| 1261 |
+
insights.append(f"Overall system performance: {avg_score:.1f}/10 - {'Excellent' if avg_score > 8.5 else 'Good' if avg_score > 7.5 else 'Needs Improvement'}")
|
| 1262 |
+
|
| 1263 |
+
# Response time insights
|
| 1264 |
+
avg_time = df_eval['execution_time_ms'].mean()
|
| 1265 |
+
if avg_time > 2000:
|
| 1266 |
+
insights.append(f"Response times are high (avg: {avg_time:.0f}ms). Consider optimization.")
|
| 1267 |
+
|
| 1268 |
+
# Safety insights
|
| 1269 |
+
safety_rate = (df_eval['guardrails_passed'].sum() / len(df_eval)) * 100
|
| 1270 |
+
if safety_rate < 95:
|
| 1271 |
+
insights.append(f"Safety pass rate is {safety_rate:.1f}% - below recommended 95% threshold.")
|
| 1272 |
+
|
| 1273 |
+
# Recommendations
|
| 1274 |
+
if worst_agent != best_agent:
|
| 1275 |
+
recommendations.append(f"Consider retraining {worst_agent} using patterns from {best_agent}")
|
| 1276 |
+
|
| 1277 |
+
if avg_time > 1500:
|
| 1278 |
+
recommendations.append("Implement caching or optimize model inference to reduce response times")
|
| 1279 |
+
|
| 1280 |
+
recommendations.append("Schedule regular performance reviews every 2 weeks")
|
| 1281 |
+
recommendations.append("Set up automated alerts for scores below 7.0 or response times above 3 seconds")
|
| 1282 |
+
|
| 1283 |
+
if not df_analysis.empty:
|
| 1284 |
+
avg_readability = df_analysis['readability_score'].mean()
|
| 1285 |
+
if avg_readability < 6:
|
| 1286 |
+
recommendations.append("Improve response readability - consider simpler language and shorter sentences")
|
| 1287 |
+
|
| 1288 |
+
return {'insights': insights, 'recommendations': recommendations}
|
| 1289 |
+
|
| 1290 |
+
def detect_anomalies(self, df_eval):
|
| 1291 |
+
"""Detect performance anomalies"""
|
| 1292 |
+
anomalies = []
|
| 1293 |
+
|
| 1294 |
+
# Score anomalies (using IQR method)
|
| 1295 |
+
Q1 = df_eval['overall_score'].quantile(0.25)
|
| 1296 |
+
Q3 = df_eval['overall_score'].quantile(0.75)
|
| 1297 |
+
IQR = Q3 - Q1
|
| 1298 |
+
lower_bound = Q1 - 1.5 * IQR
|
| 1299 |
+
upper_bound = Q3 + 1.5 * IQR
|
| 1300 |
+
|
| 1301 |
+
score_anomalies = df_eval[(df_eval['overall_score'] < lower_bound) | (df_eval['overall_score'] > upper_bound)]
|
| 1302 |
+
if len(score_anomalies) > 0:
|
| 1303 |
+
anomalies.append(f"{len(score_anomalies)} evaluations with unusual scores detected")
|
| 1304 |
+
|
| 1305 |
+
# Response time anomalies
|
| 1306 |
+
time_Q1 = df_eval['execution_time_ms'].quantile(0.25)
|
| 1307 |
+
time_Q3 = df_eval['execution_time_ms'].quantile(0.75)
|
| 1308 |
+
time_IQR = time_Q3 - time_Q1
|
| 1309 |
+
time_upper = time_Q3 + 1.5 * time_IQR
|
| 1310 |
+
|
| 1311 |
+
time_anomalies = df_eval[df_eval['execution_time_ms'] > time_upper]
|
| 1312 |
+
if len(time_anomalies) > 0:
|
| 1313 |
+
anomalies.append(f"{len(time_anomalies)} evaluations with unusually long response times")
|
| 1314 |
+
|
| 1315 |
+
# Safety anomalies
|
| 1316 |
+
safety_failures = df_eval[df_eval['guardrails_passed'] == False]
|
| 1317 |
+
if len(safety_failures) > len(df_eval) * 0.1: # More than 10% failures
|
| 1318 |
+
anomalies.append(f"High safety failure rate: {len(safety_failures)} failures out of {len(df_eval)} evaluations")
|
| 1319 |
+
|
| 1320 |
+
return anomalies
|
| 1321 |
+
|
| 1322 |
+
def simulate_realtime_data(self):
|
| 1323 |
+
"""Simulate real-time data update"""
|
| 1324 |
+
import random
|
| 1325 |
+
|
| 1326 |
+
return {
|
| 1327 |
+
'score': random.uniform(7.0, 9.5),
|
| 1328 |
+
'score_delta': random.uniform(-0.5, 0.5),
|
| 1329 |
+
'time': random.uniform(500, 2000),
|
| 1330 |
+
'time_delta': random.uniform(-200, 200),
|
| 1331 |
+
'safe': random.choice([True, True, True, False]) # 75% safe
|
| 1332 |
+
}
|
| 1333 |
+
|
| 1334 |
+
    def show_workflow_visualization(self, data: Dict[str, pd.DataFrame]):
        """Show workflow visualization with queries, scores, latency, hallucination, and token consumption.

        Lets the user pick a session (and optionally an agent), then renders:
        a Mermaid flow diagram, headline metrics, per-step expanders with
        query/response detail, comparative radar and token charts, a session
        summary, and a CSV export button.

        Args:
            data: dict of DataFrames; requires key 'evaluations'.
        """
        st.header("π Workflow Visualization")

        df_eval = data['evaluations']
        if df_eval.empty:
            st.warning("No evaluation data available for workflow visualization.")
            return

        # ---- Workflow selection -----------------------------------------
        col1, col2 = st.columns([1, 1])

        with col1:
            sessions = df_eval['session_id'].unique()
            selected_session = st.selectbox("Select Session", sessions, key="workflow_session")

        with col2:
            agents = df_eval['agent_name'].unique()
            selected_agent = st.selectbox("Select Agent (Optional)", ['All'] + list(agents), key="workflow_agent")

        # Filter down to the chosen session (and agent, if one was picked)
        session_data = df_eval[df_eval['session_id'] == selected_session]
        if selected_agent != 'All':
            session_data = session_data[session_data['agent_name'] == selected_agent]

        if session_data.empty:
            st.warning("No data found for selected filters.")
            return

        # ---- Workflow diagram -------------------------------------------
        st.subheader("π Workflow Flow Diagram")

        # Generate Mermaid diagram source (see create_workflow_diagram)
        mermaid_diagram = self.create_workflow_diagram(session_data)

        # Display the diagram using markdown (since create_diagram might not
        # be available). NOTE(review): st.markdown does not render Mermaid;
        # this shows the diagram source in a code fence — confirm intended.
        st.markdown("```mermaid\n" + mermaid_diagram + "\n```")

        # ---- Workflow metrics overview ----------------------------------
        st.subheader("π Session Metrics Overview")

        col1, col2, col3, col4 = st.columns(4)

        with col1:
            avg_score = session_data['overall_score'].mean()
            # Delta is measured against a 7.5 "good" baseline
            st.metric("Avg Overall Score", f"{avg_score:.2f}/10",
                      delta=f"{avg_score - 7.5:.2f}" if avg_score > 7.5 else f"{avg_score - 7.5:.2f}")

        with col2:
            avg_latency = session_data['execution_time_ms'].mean()
            # Delta is measured against a 3000ms latency budget
            st.metric("Avg Response Time", f"{avg_latency:.0f}ms",
                      delta=f"{avg_latency - 3000:.0f}ms" if avg_latency < 3000 else f"+{avg_latency - 3000:.0f}ms")

        with col3:
            # hallucination_score is optional in older schemas; default to 0
            avg_hallucination = session_data['hallucination_score'].mean() if 'hallucination_score' in session_data.columns else 0
            st.metric("Avg Hallucination", f"{avg_hallucination:.2f}/10",
                      delta=f"{5.0 - avg_hallucination:.2f}" if avg_hallucination < 5.0 else f"-{avg_hallucination - 5.0:.2f}")

        with col4:
            # Token/cost columns are also optional; degrade gracefully
            total_tokens = session_data['total_tokens'].sum() if 'total_tokens' in session_data.columns else 0
            total_cost = session_data['cost_usd'].sum() if 'cost_usd' in session_data.columns else 0
            st.metric("Total Cost", f"${total_cost:.4f}", f"{total_tokens:,} tokens")

        # ---- Detailed workflow steps ------------------------------------
        st.subheader("π Detailed Workflow Steps")

        # NOTE(review): idx is the DataFrame *label*, not a 0-based counter,
        # so after filtering "Step {idx + 1}" may not be sequential — confirm
        # whether enumerate() was intended here.
        for idx, row in session_data.iterrows():
            with st.expander(f"Step {idx + 1}: {row['agent_name']} - Score: {row['overall_score']:.2f}/10"):

                # Query and Response side by side
                col1, col2 = st.columns([1, 1])

                with col1:
                    st.markdown("**Query:**")
                    st.write(row['query'])

                    # Performance metrics; row.get tolerates missing columns
                    st.markdown("**Performance Metrics:**")
                    metrics_data = {
                        'Overall Score': row['overall_score'],
                        'Relevance': row['relevance_score'],
                        'Accuracy': row['accuracy_score'],
                        'Completeness': row['completeness_score'],
                        'Coherence': row['coherence_score'],
                        'Hallucination': row.get('hallucination_score', 0),
                        'Safety': row['safety_score']
                    }

                    # Create a bar chart for scores.
                    # NOTE(review): this import is redundant — 'go' is already
                    # used at module scope elsewhere in this method.
                    import plotly.graph_objects as go
                    fig = go.Figure(data=[
                        go.Bar(x=list(metrics_data.keys()), y=list(metrics_data.values()),
                               marker_color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2'])
                    ])
                    fig.update_layout(
                        title="Score Breakdown",
                        yaxis_title="Score (0-10)",
                        height=300,
                        showlegend=False
                    )
                    st.plotly_chart(fig, use_container_width=True)

                with col2:
                    st.markdown("**Response:**")
                    st.write(row['response'])

                    # Token and cost information
                    st.markdown("**Resource Consumption:**")

                    token_col1, token_col2 = st.columns(2)
                    with token_col1:
                        input_tokens = row.get('input_tokens', 0)
                        output_tokens = row.get('output_tokens', 0)
                        st.metric("Input Tokens", f"{input_tokens:,}")
                        st.metric("Output Tokens", f"{output_tokens:,}")

                    with token_col2:
                        total_tokens = row.get('total_tokens', 0)
                        cost = row.get('cost_usd', 0)
                        st.metric("Total Tokens", f"{total_tokens:,}")
                        st.metric("Cost", f"${cost:.4f}")

                # Execution details (full width below the two columns)
                st.markdown("**Execution Details:**")
                exec_time = row['execution_time_ms']
                llm_provider = row.get('llm_provider', 'Unknown')
                model_name = row.get('model_name', 'Unknown')

                st.write(f"β±οΈ **Execution Time:** {exec_time:.0f}ms")
                st.write(f"π€ **LLM Provider:** {llm_provider}")
                st.write(f"π§ **Model:** {model_name}")
                st.write(f"π‘οΈ **Safety Passed:** {'β ' if row['guardrails_passed'] else 'β'}")

        # ---- Comparative analysis ---------------------------------------
        st.subheader("π Comparative Analysis")

        col1, col2 = st.columns(2)

        with col1:
            # Radar chart: one polygon per workflow step
            fig = go.Figure()

            score_columns = ['overall_score', 'relevance_score', 'accuracy_score', 'completeness_score', 'coherence_score']
            if 'hallucination_score' in session_data.columns:
                score_columns.append('hallucination_score')

            for i, (idx, row) in enumerate(session_data.iterrows()):
                fig.add_trace(go.Scatterpolar(
                    r=[row[col] for col in score_columns],
                    theta=[col.replace('_score', '').title() for col in score_columns],
                    fill='toself',
                    name=f"{row['agent_name']} - Step {i+1}"
                ))

            fig.update_layout(
                polar=dict(
                    radialaxis=dict(
                        visible=True,
                        range=[0, 10]
                    )),
                showlegend=True,
                title="Score Comparison Radar Chart"
            )
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            # Dual-axis chart: token bars plus a latency line on a second axis
            if 'total_tokens' in session_data.columns:
                fig = go.Figure()

                steps = [f"Step {i+1}" for i in range(len(session_data))]
                fig.add_trace(go.Bar(
                    x=steps,
                    y=session_data['total_tokens'],
                    name='Total Tokens',
                    marker_color='lightblue'
                ))

                fig.add_trace(go.Scatter(
                    x=steps,
                    y=session_data['execution_time_ms'],
                    yaxis='y2',
                    name='Response Time (ms)',
                    line=dict(color='red', width=2),
                    mode='lines+markers'
                ))

                fig.update_layout(
                    title="Token Consumption vs Response Time",
                    xaxis_title="Workflow Steps",
                    yaxis_title="Total Tokens",
                    yaxis2=dict(
                        title="Response Time (ms)",
                        overlaying='y',
                        side='right'
                    ),
                    height=400
                )
                st.plotly_chart(fig, use_container_width=True)

        # ---- Session summary --------------------------------------------
        st.subheader("π Session Summary")

        summary_col1, summary_col2, summary_col3 = st.columns(3)

        with summary_col1:
            st.markdown("**Quality Metrics:**")
            st.write(f"β’ Average Overall Score: {session_data['overall_score'].mean():.2f}/10")
            st.write(f"β’ Best Performing Step: {session_data.loc[session_data['overall_score'].idxmax(), 'agent_name']}")
            st.write(f"β’ Consistency (Std Dev): {session_data['overall_score'].std():.2f}")

        with summary_col2:
            st.markdown("**Performance Metrics:**")
            st.write(f"β’ Total Execution Time: {session_data['execution_time_ms'].sum():.0f}ms")
            st.write(f"β’ Average Response Time: {session_data['execution_time_ms'].mean():.0f}ms")
            st.write(f"β’ Fastest Step: {session_data['execution_time_ms'].min():.0f}ms")

        with summary_col3:
            st.markdown("**Resource Usage:**")
            if 'total_tokens' in session_data.columns:
                st.write(f"β’ Total Tokens Used: {session_data['total_tokens'].sum():,}")
                st.write(f"β’ Total Cost: ${session_data['cost_usd'].sum():.4f}")
                st.write(f"β’ Avg Cost per Query: ${session_data['cost_usd'].mean():.4f}")
            else:
                st.write("β’ Token data not available")

        # ---- Export functionality ---------------------------------------
        st.subheader("π€ Export Workflow Data")

        if st.button("Export Session Data to CSV", key="export_workflow"):
            csv_data = session_data.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv_data,
                file_name=f"workflow_session_{selected_session}.csv",
                mime="text/csv"
            )
|
| 1572 |
+
|
| 1573 |
+
def create_workflow_diagram(self, session_data):
|
| 1574 |
+
"""Create a Mermaid workflow diagram"""
|
| 1575 |
+
diagram = "graph TD\n"
|
| 1576 |
+
diagram += " Start([Session Start])\n"
|
| 1577 |
+
|
| 1578 |
+
for i, (idx, row) in enumerate(session_data.iterrows()):
|
| 1579 |
+
step_id = f"Step{i+1}"
|
| 1580 |
+
agent_name = row['agent_name'].replace(' ', '_')
|
| 1581 |
+
score = row['overall_score']
|
| 1582 |
+
exec_time = row['execution_time_ms']
|
| 1583 |
+
|
| 1584 |
+
# Color based on score
|
| 1585 |
+
if score >= 8.5:
|
| 1586 |
+
color = "fill:#90EE90" # Light green
|
| 1587 |
+
elif score >= 7.0:
|
| 1588 |
+
color = "fill:#FFE4B5" # Light orange
|
| 1589 |
+
else:
|
| 1590 |
+
color = "fill:#FFB6C1" # Light pink
|
| 1591 |
+
|
| 1592 |
+
diagram += f" {step_id}[\"{agent_name}<br/>Score: {score:.1f}/10<br/>Time: {exec_time:.0f}ms\"]\n"
|
| 1593 |
+
diagram += f" {step_id} --> {step_id}_result{{Result}}\n"
|
| 1594 |
+
|
| 1595 |
+
if i == 0:
|
| 1596 |
+
diagram += f" Start --> {step_id}\n"
|
| 1597 |
+
else:
|
| 1598 |
+
prev_step = f"Step{i}"
|
| 1599 |
+
diagram += f" {prev_step}_result --> {step_id}\n"
|
| 1600 |
+
|
| 1601 |
+
# Add styling
|
| 1602 |
+
diagram += f" class {step_id} stepClass;\n"
|
| 1603 |
+
|
| 1604 |
+
# Add end node
|
| 1605 |
+
last_step = f"Step{len(session_data)}"
|
| 1606 |
+
diagram += f" {last_step}_result --> End([Session End])\n"
|
| 1607 |
+
|
| 1608 |
+
# Add class definitions
|
| 1609 |
+
diagram += " classDef stepClass fill:#e1f5fe,stroke:#01579b,stroke-width:2px;\n"
|
| 1610 |
+
|
| 1611 |
+
return diagram
|
| 1612 |
+
|
| 1613 |
def run(self):
|
| 1614 |
"""Run the dashboard"""
|
| 1615 |
st.title("π€ Multi-Agent System Dashboard - Demo")
|
|
|
|
| 1644 |
if filters.get('safety_only', False):
|
| 1645 |
df = df[df['guardrails_passed'] == True]
|
| 1646 |
|
| 1647 |
+
# Performance tier filter
|
| 1648 |
+
if filters.get('performance_tier') != "All":
|
| 1649 |
+
if filters['performance_tier'] == "Excellent (8.5+)":
|
| 1650 |
+
df = df[df['overall_score'] >= 8.5]
|
| 1651 |
+
elif filters['performance_tier'] == "Good (7.0-8.5)":
|
| 1652 |
+
df = df[(df['overall_score'] >= 7.0) & (df['overall_score'] < 8.5)]
|
| 1653 |
+
elif filters['performance_tier'] == "Needs Improvement (<7.0)":
|
| 1654 |
+
df = df[df['overall_score'] < 7.0]
|
| 1655 |
+
|
| 1656 |
+
# Response time filter
|
| 1657 |
+
if 'max_response_time' in filters:
|
| 1658 |
+
df = df[df['execution_time_ms'] <= filters['max_response_time']]
|
| 1659 |
+
|
| 1660 |
+
# Provider filter
|
| 1661 |
+
if 'providers' in filters and filters['providers']:
|
| 1662 |
+
df = df[df['llm_provider'].isin(filters['providers'])]
|
| 1663 |
+
|
| 1664 |
filtered_data['evaluations'] = df
|
| 1665 |
|
| 1666 |
# Create tabs
|
| 1667 |
+
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
|
| 1668 |
"π Executive Summary",
|
| 1669 |
"π€ Agent Performance",
|
| 1670 |
"π‘οΈ Safety Analysis",
|
| 1671 |
+
"π Response Analysis",
|
| 1672 |
+
"π¬ Advanced Analytics",
|
| 1673 |
+
"π Workflow Visualization"
|
| 1674 |
])
|
| 1675 |
|
| 1676 |
with tab1:
|
|
|
|
| 1685 |
with tab4:
|
| 1686 |
self.show_response_analysis(filtered_data)
|
| 1687 |
|
| 1688 |
+
with tab5:
|
| 1689 |
+
self.show_advanced_analytics(filtered_data)
|
| 1690 |
+
|
| 1691 |
+
with tab6:
|
| 1692 |
+
self.show_workflow_visualization(filtered_data)
|
| 1693 |
+
|
| 1694 |
+
# Quick actions sidebar
|
| 1695 |
+
st.sidebar.markdown("---")
|
| 1696 |
+
st.sidebar.markdown("### β‘ Quick Actions")
|
| 1697 |
+
|
| 1698 |
+
if st.sidebar.button("π Generate Report"):
|
| 1699 |
+
st.sidebar.success("π Report generated!")
|
| 1700 |
+
# Could generate PDF report here
|
| 1701 |
+
|
| 1702 |
+
if st.sidebar.button("π Refresh Data"):
|
| 1703 |
+
st.sidebar.success("π Data refreshed!")
|
| 1704 |
+
st.experimental_rerun()
|
| 1705 |
+
|
| 1706 |
+
if st.sidebar.button("π§ Send Alert"):
|
| 1707 |
+
st.sidebar.success("π§ Alert sent to team!")
|
| 1708 |
+
|
| 1709 |
+
# Data summary in sidebar
|
| 1710 |
+
if not filtered_data['evaluations'].empty:
|
| 1711 |
+
st.sidebar.markdown("### π Current Session")
|
| 1712 |
+
st.sidebar.metric("Filtered Records", len(filtered_data['evaluations']))
|
| 1713 |
+
st.sidebar.metric("Avg Score", f"{filtered_data['evaluations']['overall_score'].mean():.2f}")
|
| 1714 |
+
st.sidebar.metric("Success Rate", f"{(filtered_data['evaluations']['guardrails_passed'].sum() / len(filtered_data['evaluations']) * 100):.1f}%")
|
| 1715 |
+
|
| 1716 |
# Footer
|
| 1717 |
st.markdown("---")
|
| 1718 |
+
col1, col2, col3 = st.columns(3)
|
| 1719 |
+
|
| 1720 |
+
with col1:
|
| 1721 |
+
st.markdown("π **Multi-Agent System Dashboard**")
|
| 1722 |
+
|
| 1723 |
+
with col2:
|
| 1724 |
+
st.markdown("Built with Streamlit & Plotly")
|
| 1725 |
+
|
| 1726 |
+
with col3:
|
| 1727 |
+
if st.button("βΉοΈ About"):
|
| 1728 |
+
st.info("""
|
| 1729 |
+
**Multi-Agent System Dashboard v2.0**
|
| 1730 |
+
|
| 1731 |
+
Features:
|
| 1732 |
+
- π Real-time monitoring
|
| 1733 |
+
- π€ AI-powered insights
|
| 1734 |
+
- π Advanced analytics
|
| 1735 |
+
- π Response tracing
|
| 1736 |
+
- π‘οΈ Safety monitoring
|
| 1737 |
+
- π Performance benchmarking
|
| 1738 |
+
|
| 1739 |
+
Built for production-grade multi-agent systems.
|
| 1740 |
+
""")
|
| 1741 |
|
| 1742 |
if __name__ == "__main__":
|
| 1743 |
dashboard = HuggingFaceDashboard()
|