Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Multi-Agent System Dashboard - Hugging Face Spaces Demo | |
| """ | |
| import streamlit as st | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| import sqlite3 | |
| from datetime import datetime, timedelta | |
| import json | |
| import numpy as np | |
| from typing import Dict, List, Any, Optional | |
| import os | |
| from pathlib import Path | |
# Set page config first: this call is deliberately kept ahead of every other
# Streamlit call in this file.
# NOTE(review): the icon glyphs in page_title/page_icon look mis-encoded in
# this copy of the file - confirm the intended emoji before changing them.
st.set_page_config(
    page_title="π€ Multi-Agent System Dashboard",
    page_icon="π€",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Fix for Hugging Face Spaces permissions: the dashboard writes a SQLite file,
# so the process needs a writable working directory.
import tempfile
import os

if not os.access('.', os.W_OK):
    # If current directory is not writable, use temp directory.
    # `temp_dir` stays defined as a module-level name only on this branch.
    temp_dir = tempfile.gettempdir()
    os.chdir(temp_dir)
| class HuggingFaceDashboard: | |
| def __init__(self): | |
| # Use temp directory for database in Hugging Face Spaces | |
| import tempfile | |
| temp_dir = tempfile.gettempdir() | |
| self.db_path = os.path.join(temp_dir, "evaluation_logs.db") | |
| self.setup_demo_data() | |
| def setup_demo_data(self): | |
| """Setup demo data if database doesn't exist or is empty""" | |
| if not os.path.exists(self.db_path): | |
| self.create_demo_database() | |
| else: | |
| # Check if database has data | |
| try: | |
| conn = sqlite3.connect(self.db_path) | |
| cursor = conn.cursor() | |
| cursor.execute("SELECT COUNT(*) FROM evaluation_logs") | |
| count = cursor.fetchone()[0] | |
| conn.close() | |
| # If database is empty or has very little data, recreate it | |
| if count < 50: | |
| os.remove(self.db_path) | |
| self.create_demo_database() | |
| except: | |
| # If there's any error reading the database, recreate it | |
| if os.path.exists(self.db_path): | |
| os.remove(self.db_path) | |
| self.create_demo_database() | |
| def create_demo_database(self): | |
| """Create a demo database with sample data""" | |
| conn = sqlite3.connect(self.db_path) | |
| cursor = conn.cursor() | |
| # Create evaluation_logs table | |
| cursor.execute(''' | |
| CREATE TABLE IF NOT EXISTS evaluation_logs ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| session_id TEXT NOT NULL, | |
| agent_name TEXT NOT NULL, | |
| query TEXT NOT NULL, | |
| response TEXT, | |
| overall_score REAL, | |
| relevance_score REAL, | |
| accuracy_score REAL, | |
| completeness_score REAL, | |
| coherence_score REAL, | |
| hallucination_score REAL, | |
| guardrails_passed BOOLEAN, | |
| safety_score REAL, | |
| execution_time_ms REAL, | |
| input_tokens INTEGER, | |
| output_tokens INTEGER, | |
| total_tokens INTEGER, | |
| cost_usd REAL, | |
| error_occurred BOOLEAN DEFAULT FALSE, | |
| llm_provider TEXT, | |
| model_name TEXT, | |
| judge_reasoning TEXT, | |
| guardrails_failures TEXT DEFAULT '[]', | |
| timestamp DATETIME DEFAULT CURRENT_TIMESTAMP | |
| ) | |
| ''') | |
| # Create workflow_traces table with enhanced response tracking | |
| cursor.execute(''' | |
| CREATE TABLE IF NOT EXISTS workflow_traces ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| session_id TEXT NOT NULL, | |
| step_name TEXT NOT NULL, | |
| agent_name TEXT, | |
| step_type TEXT, | |
| input_data TEXT, | |
| output_data TEXT, | |
| response_metadata TEXT, | |
| token_count INTEGER, | |
| response_length INTEGER, | |
| execution_time_ms REAL, | |
| error_occurred BOOLEAN DEFAULT FALSE, | |
| error_details TEXT, | |
| timestamp DATETIME DEFAULT CURRENT_TIMESTAMP | |
| ) | |
| ''') | |
| # Create response_analysis table for detailed response tracking | |
| cursor.execute(''' | |
| CREATE TABLE IF NOT EXISTS response_analysis ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| evaluation_id INTEGER, | |
| session_id TEXT NOT NULL, | |
| agent_name TEXT NOT NULL, | |
| response_text TEXT NOT NULL, | |
| response_length INTEGER, | |
| word_count INTEGER, | |
| sentence_count INTEGER, | |
| readability_score REAL, | |
| sentiment_score REAL, | |
| key_topics TEXT, | |
| response_type TEXT, | |
| contains_code BOOLEAN DEFAULT FALSE, | |
| contains_links BOOLEAN DEFAULT FALSE, | |
| language_detected TEXT DEFAULT 'en', | |
| timestamp DATETIME DEFAULT CURRENT_TIMESTAMP, | |
| FOREIGN KEY (evaluation_id) REFERENCES evaluation_logs (id) | |
| ) | |
| ''') | |
| # Insert demo data | |
| self.insert_demo_data(cursor) | |
| conn.commit() | |
| conn.close() | |
| def insert_demo_data(self, cursor): | |
| """Insert comprehensive demo data""" | |
| import random | |
| from datetime import datetime, timedelta | |
| agents = ["Diet Agent", "Support Agent", "Queries Agent"] | |
| # Comprehensive sample queries for each agent | |
| sample_queries = { | |
| "Diet Agent": [ | |
| "What's a healthy meal plan for weight loss?", | |
| "Can you suggest low-carb breakfast options?", | |
| "What are the benefits of intermittent fasting?", | |
| "How much protein should I eat daily?", | |
| "What foods are good for heart health?", | |
| "Can you create a vegetarian meal plan?", | |
| "What snacks are good for diabetics?", | |
| "How to meal prep for the week?", | |
| "What are superfoods I should include?", | |
| "How to calculate my daily calorie needs?", | |
| "What's the Mediterranean diet about?", | |
| "Are supplements necessary for nutrition?", | |
| "How to eat healthy on a budget?", | |
| "What foods help with inflammation?", | |
| "Can you suggest post-workout meals?", | |
| "What's a balanced breakfast for energy?", | |
| "How to reduce sugar in my diet?", | |
| "What are healthy cooking methods?", | |
| "Can you help with portion control?", | |
| "What foods boost metabolism?" | |
| ], | |
| "Support Agent": [ | |
| "I'm having trouble sleeping, can you help?", | |
| "How do I manage work stress?", | |
| "I feel overwhelmed with my tasks", | |
| "Can you help me organize my schedule?", | |
| "I'm having difficulty focusing", | |
| "How to improve my productivity?", | |
| "I need help with time management", | |
| "How to deal with anxiety?", | |
| "Can you suggest relaxation techniques?", | |
| "I'm feeling burned out at work", | |
| "How to maintain work-life balance?", | |
| "I need motivation to exercise", | |
| "How to build better habits?", | |
| "I'm struggling with procrastination", | |
| "Can you help me set goals?", | |
| "How to handle difficult conversations?", | |
| "I need help with decision making", | |
| "How to boost my confidence?", | |
| "Can you help me manage emotions?", | |
| "What are good stress relief activities?" | |
| ], | |
| "Queries Agent": [ | |
| "What are the latest developments in AI?", | |
| "How does blockchain technology work?", | |
| "What is quantum computing?", | |
| "Explain machine learning algorithms", | |
| "What are the benefits of cloud computing?", | |
| "How does renewable energy work?", | |
| "What is the future of electric vehicles?", | |
| "Explain cryptocurrency and Bitcoin", | |
| "What is cybersecurity and why is it important?", | |
| "How do neural networks function?", | |
| "What are the applications of IoT?", | |
| "Explain data science and analytics", | |
| "What is edge computing?", | |
| "How does 5G technology work?", | |
| "What are the trends in biotechnology?", | |
| "How does virtual reality work?", | |
| "What is artificial general intelligence?", | |
| "Explain the metaverse concept", | |
| "What are smart contracts?", | |
| "How does automation impact jobs?" | |
| ] | |
| } | |
| # Generate comprehensive demo data | |
| total_evaluations = 300 # Increased for better demo | |
| for i in range(total_evaluations): | |
| agent = random.choice(agents) | |
| query = random.choice(sample_queries[agent]) | |
| # Add query variations for realism | |
| if random.random() < 0.3: # 30% chance to modify query | |
| variations = [ | |
| f"Can you please {query.lower()}", | |
| f"I need help with: {query.lower()}", | |
| f"Could you explain {query.lower()}", | |
| f"What's your advice on {query.lower()}" | |
| ] | |
| query = random.choice(variations) | |
| # Generate realistic scores with agent-specific tendencies | |
| if agent == "Diet Agent": | |
| base_score = random.uniform(7.5, 9.2) # Diet agent performs well | |
| elif agent == "Support Agent": | |
| base_score = random.uniform(7.8, 9.5) # Support agent is consistent | |
| else: # Queries Agent | |
| base_score = random.uniform(6.8, 8.8) # More variable for complex queries | |
| # Create realistic timestamp distribution | |
| if i < 50: # Recent data (last 3 days) | |
| days_ago = random.randint(0, 2) | |
| elif i < 150: # Medium recent (last 2 weeks) | |
| days_ago = random.randint(3, 14) | |
| else: # Historical (last 30 days) | |
| days_ago = random.randint(15, 29) | |
| hours_ago = random.randint(0, 23) | |
| minutes_ago = random.randint(0, 59) | |
| timestamp = datetime.now() - timedelta(days=days_ago, hours=hours_ago, minutes=minutes_ago) | |
| # Generate realistic response | |
| response_templates = { | |
| "Diet Agent": [ | |
| f"Thank you for your question about nutrition and dietary guidance. I'd be happy to help you develop a healthier relationship with food and create sustainable eating habits.", | |
| f"I understand you're looking for dietary advice, and I'm here to provide evidence-based nutritional guidance tailored to your specific needs and goals.", | |
| f"Great question about nutrition! Let me share some comprehensive dietary recommendations that can help you achieve better health outcomes." | |
| ], | |
| "Support Agent": [ | |
| f"I appreciate you reaching out for support. It takes courage to ask for help, and I'm here to provide you with practical strategies and emotional guidance.", | |
| f"Thank you for sharing your concerns with me. I understand this can be challenging, and I want to help you work through this step by step with compassion and understanding.", | |
| f"I'm glad you've come to me for support. Your feelings are valid, and together we can explore effective coping strategies and build resilience." | |
| ], | |
| "Queries Agent": [ | |
| f"Excellent question! This is a fascinating topic that involves cutting-edge technology and has significant implications for our future. Let me provide you with a comprehensive overview.", | |
| f"Thank you for this thought-provoking question. This subject encompasses multiple disciplines and recent innovations. I'll break this down into key concepts and practical applications.", | |
| f"Great inquiry! This is an evolving field with exciting developments. Let me explain the fundamental principles and explore the current state of research and implementation." | |
| ] | |
| } | |
| # Generate more detailed response based on agent type | |
| base_response = random.choice(response_templates[agent]) | |
| # Add specific details based on agent type | |
| if agent == "Diet Agent": | |
| details = [ | |
| "**Key Nutritional Recommendations:**\n\n1. **Whole Foods Focus**: Prioritize unprocessed foods like fresh fruits, vegetables, whole grains, lean proteins, and healthy fats. These provide essential nutrients and fiber while avoiding added sugars and preservatives.\n\n2. **Portion Control**: Use the plate method - fill half your plate with non-starchy vegetables, one quarter with lean protein, and one quarter with complex carbohydrates.\n\n3. **Hydration**: Aim for 8-10 glasses of water daily. Proper hydration supports metabolism, digestion, and overall health.\n\n4. **Meal Timing**: Eat regular meals every 3-4 hours to maintain stable blood sugar levels and prevent overeating.\n\n**Sample Daily Meal Plan:**\n- Breakfast: Greek yogurt with berries and nuts\n- Lunch: Quinoa salad with grilled chicken and vegetables\n- Dinner: Baked salmon with roasted sweet potatoes and broccoli\n- Snacks: Apple with almond butter, or handful of mixed nuts", | |
| "**Evidence-Based Dietary Guidelines:**\n\n1. **Macronutrient Balance**: Aim for 45-65% carbohydrates (focus on complex carbs), 20-35% healthy fats, and 10-35% protein based on your activity level.\n\n2. **Micronutrient Density**: Choose foods rich in vitamins, minerals, and antioxidants. Include colorful fruits and vegetables to ensure variety.\n\n3. **Fiber Intake**: Target 25-35 grams daily through whole grains, legumes, fruits, and vegetables to support digestive health.\n\n4. **Healthy Fats**: Include omega-3 fatty acids from fish, walnuts, and flaxseeds, while limiting saturated and trans fats.\n\n**Practical Implementation Tips:**\n- Meal prep on weekends to ensure healthy options are available\n- Read nutrition labels to make informed choices\n- Practice mindful eating by eating slowly and paying attention to hunger cues\n- Keep a food diary to track patterns and identify areas for improvement", | |
| "**Personalized Nutrition Approach:**\n\nEvery individual has unique nutritional needs based on age, gender, activity level, health conditions, and personal preferences. Here's how to customize your approach:\n\n1. **Assessment**: Consider your current health status, goals (weight management, energy levels, disease prevention), and any dietary restrictions.\n\n2. **Gradual Changes**: Implement changes slowly to ensure sustainability. Start with one or two modifications per week.\n\n3. **Professional Guidance**: Consider consulting with a registered dietitian for personalized meal planning, especially if you have specific health conditions.\n\n4. **Regular Monitoring**: Track your progress through energy levels, sleep quality, and how you feel overall, not just weight.\n\n**Common Nutritional Myths Debunked:**\n- Carbs aren't inherently bad - choose complex carbohydrates over simple sugars\n- Fat doesn't make you fat - healthy fats are essential for hormone production and nutrient absorption\n- Skipping meals doesn't help with weight loss and can lead to overeating later" | |
| ] | |
| elif agent == "Support Agent": | |
| details = [ | |
| "**Comprehensive Support Strategy:**\n\n**Immediate Coping Techniques:**\n1. **Deep Breathing**: Practice the 4-7-8 technique - inhale for 4 counts, hold for 7, exhale for 8. This activates your parasympathetic nervous system.\n\n2. **Grounding Exercises**: Use the 5-4-3-2-1 method - identify 5 things you can see, 4 you can touch, 3 you can hear, 2 you can smell, and 1 you can taste.\n\n3. **Progressive Muscle Relaxation**: Tense and release each muscle group from toes to head, holding tension for 5 seconds before releasing.\n\n**Long-term Strategies:**\n- Establish a consistent daily routine to provide structure and predictability\n- Practice mindfulness meditation for 10-15 minutes daily\n- Maintain a journal to process emotions and identify patterns\n- Build a support network of trusted friends, family, or support groups\n\n**Professional Resources:**\nConsider reaching out to mental health professionals if you're experiencing persistent difficulties. Many offer telehealth options for convenience.", | |
| "**Building Emotional Resilience:**\n\n**Understanding Your Emotions:**\nEmotions are natural responses to life events. Learning to recognize, understand, and manage them is a skill that can be developed with practice.\n\n**Practical Steps:**\n1. **Emotion Identification**: Use an emotion wheel or journal to name specific feelings rather than general terms like 'bad' or 'stressed.'\n\n2. **Trigger Awareness**: Notice what situations, people, or thoughts tend to trigger difficult emotions.\n\n3. **Response vs. Reaction**: Create a pause between feeling and action. Ask yourself: 'What would be most helpful right now?'\n\n4. **Self-Compassion**: Treat yourself with the same kindness you'd offer a good friend facing similar challenges.\n\n**Daily Practices:**\n- Morning intention setting (5 minutes)\n- Midday check-in with your emotional state\n- Evening reflection on what went well and what you learned\n- Regular physical activity to support mental health\n\n**Crisis Resources:**\nIf you're experiencing thoughts of self-harm, please reach out immediately to a crisis hotline, emergency services, or trusted healthcare provider.", | |
| "**Stress Management and Well-being:**\n\n**Understanding Stress:**\nStress is a normal part of life, but chronic stress can impact your physical and mental health. Learning effective management techniques is crucial for long-term well-being.\n\n**Evidence-Based Techniques:**\n1. **Cognitive Restructuring**: Challenge negative thought patterns by asking: 'Is this thought realistic? What evidence supports or contradicts it? What would I tell a friend in this situation?'\n\n2. **Time Management**: Use techniques like the Pomodoro method, prioritization matrices, and saying no to non-essential commitments.\n\n3. **Physical Self-Care**: Regular exercise, adequate sleep (7-9 hours), and proper nutrition form the foundation of stress resilience.\n\n4. **Social Connection**: Maintain relationships with supportive people. Even brief positive interactions can improve mood and reduce stress.\n\n**Creating Your Personal Toolkit:**\n- Identify 3-5 coping strategies that work best for you\n- Practice them regularly, not just during stressful times\n- Adjust and refine your approach based on what's most effective\n- Remember that seeking help is a sign of strength, not weakness" | |
| ] | |
| else: # Queries Agent | |
| details = [ | |
| "**Technical Deep Dive:**\n\n**Fundamental Concepts:**\nThis technology represents a convergence of multiple disciplines including computer science, mathematics, engineering, and domain-specific expertise. The underlying principles involve complex algorithms, data structures, and computational methods.\n\n**Current Implementation:**\n1. **Healthcare**: AI-powered diagnostic tools, personalized treatment plans, drug discovery acceleration, and robotic surgery assistance.\n\n2. **Finance**: Algorithmic trading, fraud detection, risk assessment, and automated customer service through chatbots.\n\n3. **Transportation**: Autonomous vehicles, traffic optimization, predictive maintenance, and route planning algorithms.\n\n4. **Entertainment**: Recommendation systems, content generation, virtual reality experiences, and interactive gaming.\n\n**Technical Architecture:**\n- Data processing pipelines that handle massive datasets in real-time\n- Machine learning models trained on diverse, high-quality datasets\n- Cloud infrastructure enabling scalable deployment and accessibility\n- APIs and interfaces that allow integration with existing systems\n\n**Performance Metrics:**\nSuccess is measured through accuracy rates, processing speed, user engagement, cost efficiency, and real-world impact on problem-solving.", | |
| "**Industry Applications and Impact:**\n\n**Current Market Landscape:**\nThe technology sector is experiencing rapid transformation with significant investments in research and development. Major companies are competing to develop more efficient, ethical, and accessible solutions.\n\n**Real-World Applications:**\n1. **Smart Cities**: IoT sensors, traffic management, energy optimization, and public safety systems working together to improve urban living.\n\n2. **Environmental Monitoring**: Satellite imagery analysis, climate modeling, pollution tracking, and renewable energy optimization.\n\n3. **Education**: Personalized learning platforms, automated grading systems, virtual tutors, and accessibility tools for diverse learners.\n\n4. **Manufacturing**: Predictive maintenance, quality control, supply chain optimization, and human-robot collaboration.\n\n**Economic Impact:**\n- Job creation in new fields while transforming traditional roles\n- Increased productivity and efficiency across industries\n- New business models and revenue streams\n- Global competitiveness and innovation drivers\n\n**Challenges and Solutions:**\n- Addressing ethical concerns through responsible development practices\n- Ensuring data privacy and security through robust frameworks\n- Managing the digital divide through inclusive design and accessibility", | |
| "**Future Implications and Trends:**\n\n**Emerging Developments:**\nThe field is evolving rapidly with breakthrough research in quantum computing, neuromorphic chips, and advanced algorithms that promise to solve previously intractable problems.\n\n**Next 5-10 Years:**\n1. **Integration**: Seamless integration across platforms and devices, creating more intuitive user experiences.\n\n2. **Personalization**: Highly customized solutions that adapt to individual preferences and needs in real-time.\n\n3. **Sustainability**: Green technology initiatives focusing on energy efficiency and environmental responsibility.\n\n4. **Accessibility**: Universal design principles ensuring technology benefits all users regardless of abilities or circumstances.\n\n**Societal Considerations:**\n- Regulatory frameworks evolving to balance innovation with consumer protection\n- Educational systems adapting to prepare workforce for technological changes\n- International cooperation on standards and ethical guidelines\n- Public discourse on the role of technology in society\n\n**Preparing for the Future:**\n- Continuous learning and skill development\n- Critical thinking about technology's role in daily life\n- Participation in discussions about technology policy and ethics\n- Understanding both opportunities and risks associated with technological advancement" | |
| ] | |
| # Create a more comprehensive response | |
| response = f"{base_response}\n\n{random.choice(details)}" | |
| # Generate correlated scores (realistic relationships) | |
| relevance_score = max(0, min(10, base_score + random.uniform(-0.3, 0.3))) | |
| accuracy_score = max(0, min(10, base_score + random.uniform(-0.4, 0.2))) | |
| completeness_score = max(0, min(10, base_score + random.uniform(-0.5, 0.3))) | |
| coherence_score = max(0, min(10, base_score + random.uniform(-0.2, 0.4))) | |
| # Generate hallucination score (inverse relationship with accuracy) | |
| hallucination_score = max(0, min(10, 10 - accuracy_score + random.uniform(-1.0, 1.0))) | |
| # Generate token consumption based on response length and agent type | |
| response_length = len(response) | |
| input_tokens = len(query.split()) * 1.3 # Rough estimate | |
| output_tokens = response_length / 4 # Rough estimate (4 chars per token) | |
| total_tokens = int(input_tokens + output_tokens) | |
| # Calculate cost (rough estimates per 1K tokens) | |
| cost_per_1k_tokens = { | |
| "azure": 0.03, # GPT-4 | |
| "openai": 0.03, | |
| "anthropic": 0.025 | |
| } | |
| cost_usd = (total_tokens / 1000) * cost_per_1k_tokens.get(llm_provider, 0.03) | |
| # Realistic safety scenarios | |
| safety_pass_rate = 0.95 # 95% pass rate | |
| if random.random() < 0.02: # 2% chance of safety issues | |
| guardrails_passed = False | |
| safety_score = random.uniform(3.0, 6.0) | |
| guardrails_failures = '["content_safety", "inappropriate_advice"]' | |
| else: | |
| guardrails_passed = True | |
| safety_score = random.uniform(8.5, 10.0) | |
| guardrails_failures = "[]" | |
| # Realistic execution times (with some variation) | |
| if agent == "Diet Agent": | |
| execution_time = random.uniform(1500, 4000) # Moderate complexity | |
| elif agent == "Support Agent": | |
| execution_time = random.uniform(2000, 5000) # More thoughtful responses | |
| else: # Queries Agent | |
| execution_time = random.uniform(2500, 6000) # Complex information retrieval | |
| eval_data = ( | |
| f"demo_session_{i // 4 + 1}", # session_id (4 queries per session) | |
| agent, # agent_name | |
| query, # query | |
| response, # response | |
| base_score, # overall_score | |
| relevance_score, # relevance_score | |
| accuracy_score, # accuracy_score | |
| completeness_score, # completeness_score | |
| coherence_score, # coherence_score | |
| hallucination_score, # hallucination_score | |
| guardrails_passed, # guardrails_passed | |
| safety_score, # safety_score | |
| execution_time, # execution_time_ms | |
| int(input_tokens), # input_tokens | |
| int(output_tokens), # output_tokens | |
| total_tokens, # total_tokens | |
| round(cost_usd, 4), # cost_usd | |
| False, # error_occurred | |
| llm_provider, # llm_provider | |
| "gpt-4o", # model_name | |
| f"Comprehensive evaluation for {agent}: The response demonstrates good understanding of the query with appropriate depth and accuracy. Score breakdown reflects the quality across multiple dimensions.", # judge_reasoning | |
| guardrails_failures, # guardrails_failures | |
| timestamp.isoformat() # timestamp | |
| ) | |
| cursor.execute(''' | |
| INSERT INTO evaluation_logs ( | |
| session_id, agent_name, query, response, overall_score, | |
| relevance_score, accuracy_score, completeness_score, coherence_score, | |
| hallucination_score, guardrails_passed, safety_score, execution_time_ms, | |
| input_tokens, output_tokens, total_tokens, cost_usd, error_occurred, | |
| llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp | |
| ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) | |
| ''', eval_data) | |
| # Get the evaluation ID for response analysis | |
| evaluation_id = cursor.lastrowid | |
| # Insert detailed response analysis | |
| self.insert_response_analysis(cursor, evaluation_id, eval_data[0], agent, response, timestamp) | |
def insert_response_analysis(self, cursor, evaluation_id, session_id, agent_name, response_text, timestamp):
    """Insert detailed response analysis data.

    Derives simple text metrics from ``response_text`` (length, word and
    sentence counts, a readability estimate, a keyword-based sentiment
    score, topic tags, a response type and code/link flags) and writes one
    row to ``response_analysis``.

    Args:
        cursor: Open SQLite cursor; this method does not commit.
        evaluation_id: ``id`` of the related ``evaluation_logs`` row.
        session_id: Session identifier stored with the row.
        agent_name: Name of the agent that produced the response.
        response_text: The response to analyse.
        timestamp: Object with ``isoformat()`` (a ``datetime`` in this
            file); stored as its ISO string.
    """
    import re

    text_lower = response_text.lower()

    def has_any(fragments):
        """Return True when any fragment occurs anywhere in the lowered text."""
        return any(fragment in text_lower for fragment in fragments)

    def has_word(word):
        """Return True when *word* occurs as a whole word in the lowered text.

        Used for very short keywords: a plain substring test for 'ai' or
        'is' also fires on 'daily', 'maintain', 'this', 'list', ...
        """
        return re.search(rf'\b{re.escape(word)}\b', text_lower) is not None

    # Calculate response metrics
    response_length = len(response_text)
    word_count = len(response_text.split())
    # Number of terminator runs; text after the last terminator is not counted
    sentence_count = len(re.split(r'[.!?]+', response_text)) - 1

    # Simple readability score (Flesch-like approximation), clamped to 0..10
    if sentence_count > 0 and word_count > 0:
        avg_sentence_length = word_count / sentence_count
        readability_score = max(0, min(10, 10 - (avg_sentence_length - 15) * 0.1))
    else:
        readability_score = 5.0

    # Simple sentiment analysis (based on positive/negative words)
    positive_words = ['good', 'great', 'excellent', 'helpful', 'recommend', 'beneficial', 'effective', 'important', 'valuable', 'useful']
    negative_words = ['bad', 'poor', 'difficult', 'problem', 'issue', 'concern', 'warning', 'avoid', 'risk', 'danger']
    positive_count = sum(1 for word in positive_words if word in text_lower)
    negative_count = sum(1 for word in negative_words if word in text_lower)
    if positive_count + negative_count > 0:
        sentiment_score = (positive_count - negative_count) / (positive_count + negative_count) * 5 + 5
    else:
        sentiment_score = 5.0  # Neutral

    # Extract key topics (simple keyword extraction)
    keywords = []
    if has_any(('diet', 'food', 'nutrition')):
        keywords.append('nutrition')
    if has_any(('exercise', 'workout', 'fitness')):
        keywords.append('fitness')
    if has_any(('stress', 'anxiety', 'mental')):
        keywords.append('mental_health')
    if has_any(('technology', 'algorithm')) or has_word('ai'):
        keywords.append('technology')
    if has_any(('health', 'medical')):
        keywords.append('health')
    key_topics = ','.join(keywords) if keywords else 'general'

    # Determine response type
    if '?' in response_text:
        response_type = 'question'
    elif has_any(('recommend', 'suggest', 'try', 'consider')):
        response_type = 'recommendation'
    elif has_any(('explain', 'definition', 'means')) or has_word('is'):
        response_type = 'explanation'
    else:
        response_type = 'general'

    # Check for code and links
    contains_code = bool(re.search(r'```|`.*`|\bcode\b|\bfunction\b|\bclass\b', response_text))
    contains_links = bool(re.search(r'http[s]?://|www\.|\.com|\.org', response_text))

    # Insert response analysis
    cursor.execute('''
        INSERT INTO response_analysis (
            evaluation_id, session_id, agent_name, response_text, response_length,
            word_count, sentence_count, readability_score, sentiment_score,
            key_topics, response_type, contains_code, contains_links,
            language_detected, timestamp
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', (
        evaluation_id, session_id, agent_name, response_text, response_length,
        word_count, sentence_count, readability_score, sentiment_score,
        key_topics, response_type, contains_code, contains_links,
        'en', timestamp.isoformat()
    ))
def safe_column_access(self, df: pd.DataFrame, column: str, default_value=None):
    """Safely access DataFrame columns.

    Args:
        df: Frame to read from.
        column: Column label to look up.
        default_value: Fill value used when the column is absent.

    Returns:
        ``df[column]`` when the column exists. Otherwise a Series of
        ``default_value`` with one entry per row, aligned to ``df.index``.
        If ``df`` cannot be inspected at all (for example ``None``), an
        empty Series is returned instead of raising.
    """
    try:
        if column in df.columns:
            return df[column]
        return pd.Series([default_value] * len(df), index=df.index)
    except Exception:
        # Deliberate best-effort fallback. It must not raise itself, so the
        # row count is guarded: previously len(df) failed here for inputs
        # such as None, defeating the purpose of the handler.
        try:
            row_count = len(df)
        except TypeError:
            row_count = 0
        return pd.Series([default_value] * row_count)
| def load_data(self, date_filter: tuple = None, agent_filter: List[str] = None, session_filter: str = None) -> Dict[str, pd.DataFrame]: | |
| """Load and filter data from database""" | |
| try: | |
| conn = sqlite3.connect(self.db_path) | |
| # Base queries | |
| eval_query = "SELECT * FROM evaluation_logs" | |
| trace_query = "SELECT * FROM workflow_traces" | |
| response_analysis_query = "SELECT * FROM response_analysis" | |
| # Apply filters | |
| conditions = [] | |
| params = [] | |
| if date_filter: | |
| conditions.append("timestamp BETWEEN ? AND ?") | |
| params.extend([date_filter[0].strftime('%Y-%m-%d'), date_filter[1].strftime('%Y-%m-%d')]) | |
| if agent_filter: | |
| placeholders = ','.join(['?' for _ in agent_filter]) | |
| conditions.append(f"agent_name IN ({placeholders})") | |
| params.extend(agent_filter) | |
| if session_filter: | |
| conditions.append("session_id LIKE ?") | |
| params.append(f"%{session_filter}%") | |
| if conditions: | |
| eval_query += " WHERE " + " AND ".join(conditions) | |
| trace_query += " WHERE " + " AND ".join(conditions) | |
| response_analysis_query += " WHERE " + " AND ".join(conditions) | |
| # Load data | |
| evaluations = pd.read_sql_query(eval_query, conn, params=params) | |
| traces = pd.read_sql_query(trace_query, conn, params=params) | |
| # Load response analysis data (handle if table doesn't exist yet) | |
| try: | |
| response_analysis = pd.read_sql_query(response_analysis_query, conn, params=params) | |
| except Exception: | |
| response_analysis = pd.DataFrame() | |
| conn.close() | |
| # Convert timestamp columns | |
| if not evaluations.empty: | |
| evaluations['timestamp'] = pd.to_datetime(evaluations['timestamp']) | |
| if not traces.empty: | |
| traces['timestamp'] = pd.to_datetime(traces['timestamp']) | |
| if not response_analysis.empty: | |
| response_analysis['timestamp'] = pd.to_datetime(response_analysis['timestamp']) | |
| return { | |
| 'evaluations': evaluations, | |
| 'traces': traces, | |
| 'response_analysis': response_analysis | |
| } | |
| except Exception as e: | |
| st.error(f"Error loading data: {str(e)}") | |
| return {'evaluations': pd.DataFrame(), 'traces': pd.DataFrame(), 'response_analysis': pd.DataFrame()} | |
def create_sidebar_filters(self, data: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
    """Render the sidebar filter widgets and collect their current values.

    Widgets that need existing data (date range, agents, max response
    time, providers) are only rendered when ``data['evaluations']`` has
    rows, so their keys may be absent from the result.

    Args:
        data: Mapping that contains an ``'evaluations'`` DataFrame.

    Returns:
        Dict of widget values keyed by ``date_range``, ``agents``,
        ``session``, ``score_range``, ``safety_only``,
        ``performance_tier``, ``max_response_time``, ``providers`` and
        ``auto_refresh``.
    """
    sidebar = st.sidebar
    sidebar.header("π Filters")

    evaluations = data['evaluations']
    has_rows = not evaluations.empty
    selected = {}

    # Date range, bounded by the oldest and newest evaluation
    if has_rows:
        stamps = evaluations['timestamp']
        earliest = stamps.min().date()
        latest = stamps.max().date()
        selected['date_range'] = sidebar.date_input(
            "π Date Range",
            value=(earliest, latest),
            min_value=earliest,
            max_value=latest
        )

    # Agents: every known agent is preselected
    if has_rows:
        agent_names = evaluations['agent_name'].unique().tolist()
        selected['agents'] = sidebar.multiselect(
            "π€ Agents",
            options=agent_names,
            default=agent_names
        )

    # Free-text session filter
    selected['session'] = sidebar.text_input(
        "π Session ID (partial match)",
        placeholder="Enter session ID..."
    )

    # Overall score window
    selected['score_range'] = sidebar.slider(
        "π Score Range",
        min_value=0.0,
        max_value=10.0,
        value=(0.0, 10.0),
        step=0.1
    )

    # Safety toggle
    selected['safety_only'] = sidebar.checkbox(
        "π‘οΈ Show only safe responses",
        value=False
    )

    # Advanced section
    sidebar.markdown("### π¬ Advanced Filters")

    tier_options = ["All", "Excellent (8.5+)", "Good (7.0-8.5)", "Needs Improvement (<7.0)"]
    selected['performance_tier'] = sidebar.selectbox(
        "π Performance Tier",
        options=tier_options,
        index=0
    )

    # Response time ceiling, defaulting to the slowest observed run
    if has_rows:
        slowest = evaluations['execution_time_ms'].max()
        selected['max_response_time'] = sidebar.slider(
            "β±οΈ Max Response Time (ms)",
            min_value=0,
            max_value=int(slowest),
            value=int(slowest),
            step=100
        )

    # Provider selection, only when the column is available
    if has_rows and 'llm_provider' in evaluations.columns:
        provider_names = evaluations['llm_provider'].unique().tolist()
        selected['providers'] = sidebar.multiselect(
            "π€ LLM Providers",
            options=provider_names,
            default=provider_names
        )

    # Auto-refresh toggle
    selected['auto_refresh'] = sidebar.checkbox(
        "π Auto-refresh (30s)",
        value=False,
        help="Automatically refresh data every 30 seconds"
    )
    if selected.get('auto_refresh', False):
        sidebar.success("π Auto-refresh enabled")
        # Add auto-refresh logic here if needed

    return selected
def show_executive_summary(self, data: Dict[str, pd.DataFrame]):
    """Render the headline KPI row and the 2x2 "Performance Analytics" figure.

    Args:
        data: Mapping of dataset name to DataFrame; only
            ``data['evaluations']`` is used. When that frame is empty a
            warning is shown and nothing else is drawn.
    """
    st.header("π Executive Summary")

    if data['evaluations'].empty:
        st.warning("No evaluation data available")
        return
    evaluations = data['evaluations']

    # --- KPI row ---------------------------------------------------------
    kpi_total, kpi_score, kpi_safety, kpi_latency, kpi_sessions = st.columns(5)
    with kpi_total:
        evaluation_count = len(evaluations)
        st.metric("Total Evaluations", f"{evaluation_count:,}")
    with kpi_score:
        mean_score = self.safe_column_access(evaluations, 'overall_score', 0).mean()
        st.metric("Average Score", f"{mean_score:.2f}/10")
    with kpi_safety:
        passed = self.safe_column_access(evaluations, 'guardrails_passed', True).sum()
        pass_pct = (passed / len(evaluations)) * 100
        st.metric("Safety Pass Rate", f"{pass_pct:.1f}%")
    with kpi_latency:
        mean_seconds = self.safe_column_access(evaluations, 'execution_time_ms', 0).mean() / 1000
        st.metric("Avg Response Time", f"{mean_seconds:.2f}s")
    with kpi_sessions:
        session_count = evaluations['session_id'].nunique()
        st.metric("Unique Sessions", f"{session_count:,}")

    # --- Trend heading + period picker -----------------------------------
    heading_slot, picker_slot = st.columns([3, 1])
    with heading_slot:
        st.subheader("π Performance Trends")
    with picker_slot:
        trend_period = st.selectbox(
            "π Period",
            options=["7 days", "30 days", "All time"],
            index=1,
            key="trend_period"
        )

    # Restrict the trend lines to the chosen look-back window; any other
    # choice ("All time") keeps every row.
    lookback_days = {"7 days": 7, "30 days": 30}.get(trend_period)
    if lookback_days is None:
        windowed = evaluations
    else:
        cutoff_date = datetime.now() - timedelta(days=lookback_days)
        windowed = evaluations[evaluations['timestamp'] >= cutoff_date]

    def pass_rate_pct(flags):
        # Share of passing guardrail checks within one day, in percent.
        return (flags.sum() / len(flags)) * 100

    # One row per calendar day
    daily = (
        windowed.groupby(windowed['timestamp'].dt.date)
        .agg({
            'overall_score': 'mean',
            'execution_time_ms': 'mean',
            'guardrails_passed': pass_rate_pct
        })
        .reset_index()
    )

    plain_cell = {"secondary_y": False}
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Daily Average Score', 'Daily Response Time', 'Daily Safety Rate', 'Score Distribution'),
        specs=[[dict(plain_cell), dict(plain_cell)],
               [dict(plain_cell), dict(plain_cell)]]
    )

    # Three daily trend lines: (values, trace name, colour, grid row, grid col)
    trend_lines = [
        (daily['overall_score'], 'Score', '#1f77b4', 1, 1),
        (daily['execution_time_ms'] / 1000, 'Response Time', '#ff7f0e', 1, 2),
        (daily['guardrails_passed'], 'Safety Rate', '#2ca02c', 2, 1),
    ]
    for values, trace_name, colour, grid_row, grid_col in trend_lines:
        fig.add_trace(
            go.Scatter(x=daily['timestamp'], y=values,
                       mode='lines+markers', name=trace_name, line=dict(color=colour)),
            row=grid_row, col=grid_col
        )

    # The histogram covers all evaluations, not just the selected window.
    fig.add_trace(
        go.Histogram(x=self.safe_column_access(evaluations, 'overall_score', 0),
                     nbinsx=20, name='Score Distribution', marker_color='#d62728'),
        row=2, col=2
    )

    fig.update_layout(height=600, showlegend=False, title_text="Performance Analytics")
    st.plotly_chart(fig, use_container_width=True)
def show_agent_performance(self, data: Dict[str, pd.DataFrame]):
    """Render per-agent score, response-time and sub-score comparisons.

    Draws three charts: average overall score per agent (with std-dev error
    bars), average response time per agent, and a radar chart of whichever
    of the relevance/accuracy/completeness/coherence columns are present.

    Args:
        data: Mapping of dataset name to DataFrame; only
            ``data['evaluations']`` is used. When that frame is empty a
            warning is shown and nothing else is drawn.
    """
    st.header("π€ Agent Performance Analysis")

    if data['evaluations'].empty:
        st.warning("No evaluation data available")
        return
    df = data['evaluations']

    # Agent comparison
    score_slot, time_slot = st.columns(2)

    with score_slot:
        st.subheader("π Agent Score Comparison")
        # Aggregate only what the chart plots. The optional sub-score
        # columns are deliberately not requested here: asking agg() for a
        # missing column raises KeyError, and the radar section below
        # already treats those columns as optional.
        score_stats = df.groupby('agent_name')['overall_score'].agg(['mean', 'std']).round(2)
        fig = px.bar(
            x=score_stats.index,
            y=score_stats['mean'],
            error_y=score_stats['std'],
            title="Average Score by Agent",
            labels={'x': 'Agent', 'y': 'Average Score'}
        )
        fig.update_layout(showlegend=False)
        st.plotly_chart(fig, use_container_width=True)

    with time_slot:
        st.subheader("β‘ Response Time Analysis")
        agent_times = df.groupby('agent_name')['execution_time_ms'].agg(['mean', 'std']).reset_index()
        # Convert milliseconds to seconds for display
        agent_times['mean'] = agent_times['mean'] / 1000
        agent_times['std'] = agent_times['std'] / 1000
        fig = px.bar(
            agent_times,
            x='agent_name',
            y='mean',
            error_y='std',
            title="Average Response Time by Agent",
            labels={'agent_name': 'Agent', 'mean': 'Response Time (seconds)'}
        )
        st.plotly_chart(fig, use_container_width=True)

    # Detailed score breakdown
    st.subheader("π― Detailed Score Breakdown")
    score_columns = ['relevance_score', 'accuracy_score', 'completeness_score', 'coherence_score']
    available_scores = [col for col in score_columns if col in df.columns]

    if available_scores:
        # One row per agent, one column per available sub-score
        agent_detailed = df.groupby('agent_name')[available_scores].mean()
        axis_labels = [col.replace('_score', '').title() for col in available_scores]

        fig = go.Figure()
        for agent, scores in agent_detailed.iterrows():
            fig.add_trace(go.Scatterpolar(
                r=[scores[col] for col in available_scores],
                theta=axis_labels,
                fill='toself',
                name=agent
            ))
        fig.update_layout(
            polar=dict(
                radialaxis=dict(visible=True, range=[0, 10])
            ),
            showlegend=True,
            title="Agent Performance Radar Chart"
        )
        st.plotly_chart(fig, use_container_width=True)
def show_safety_analysis(self, data: Dict[str, pd.DataFrame]):
    """Render guardrail pass-rate metrics, a per-agent bar chart and a daily trend.

    Args:
        data: Mapping of dataset name to DataFrame; only
            ``data['evaluations']`` is used. When that frame is empty a
            warning is shown and nothing else is drawn.
    """
    st.header("π‘οΈ Safety & Guardrails Analysis")

    if data['evaluations'].empty:
        st.warning("No evaluation data available")
        return
    evaluations = data['evaluations']

    def pass_rate_pct(flags):
        # Percentage of passing guardrail checks within one group.
        return (flags.sum() / len(flags)) * 100

    # --- Headline safety metrics ------------------------------------------
    rate_slot, score_slot, failed_slot = st.columns(3)
    with rate_slot:
        total_checks = len(evaluations)
        passed_checks = self.safe_column_access(evaluations, 'guardrails_passed', True).sum()
        if total_checks > 0:
            safety_rate = (passed_checks / total_checks) * 100
        else:
            safety_rate = 0
        st.metric("Overall Safety Rate", f"{safety_rate:.1f}%", f"{passed_checks}/{total_checks}")
    with score_slot:
        mean_safety_score = self.safe_column_access(evaluations, 'safety_score', 10).mean()
        st.metric("Average Safety Score", f"{mean_safety_score:.2f}/10")
    with failed_slot:
        failed_checks = total_checks - passed_checks
        st.metric("Failed Checks", f"{failed_checks:,}")

    # --- Breakdown charts ---------------------------------------------------
    agent_slot, trend_slot = st.columns(2)

    with agent_slot:
        st.subheader("π€ Safety Rate by Agent")
        per_agent = (
            evaluations.groupby('agent_name')
            .agg({'guardrails_passed': pass_rate_pct})
            .reset_index()
        )
        agent_fig = px.bar(
            per_agent,
            x='agent_name',
            y='guardrails_passed',
            title="Safety Pass Rate by Agent",
            labels={'agent_name': 'Agent', 'guardrails_passed': 'Safety Rate (%)'},
            color='guardrails_passed',
            color_continuous_scale='RdYlGn'
        )
        agent_fig.update_layout(showlegend=False)
        st.plotly_chart(agent_fig, use_container_width=True)

    with trend_slot:
        st.subheader("π Safety Trends Over Time")
        per_day = (
            evaluations.groupby(evaluations['timestamp'].dt.date)
            .agg({'guardrails_passed': pass_rate_pct})
            .reset_index()
        )
        trend_fig = px.line(
            per_day,
            x='timestamp',
            y='guardrails_passed',
            title="Daily Safety Rate Trend",
            labels={'timestamp': 'Date', 'guardrails_passed': 'Safety Rate (%)'}
        )
        # Reference line for the 95% pass-rate target
        trend_fig.add_hline(y=95, line_dash="dash", line_color="red",
                            annotation_text="95% Target")
        st.plotly_chart(trend_fig, use_container_width=True)
def show_response_analysis(self, data: Dict[str, pd.DataFrame]):
    """Render response statistics, a searchable response browser and CSV export.

    Args:
        data: Mapping of dataset name to DataFrame. ``data['evaluations']``
            is required; ``data['response_analysis']`` is optional, and the
            analysis-only metrics show "N/A" (and the analysis charts are
            skipped) when it is missing or empty.
    """
    st.header("π Response Analysis & Tracing")

    if data['evaluations'].empty:
        st.warning("No evaluation data available")
        return

    df_eval = data['evaluations']
    df_analysis = data.get('response_analysis', pd.DataFrame())
    has_analysis = not df_analysis.empty

    # Response overview metrics
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        avg_response_length = df_eval['response'].str.len().mean() if 'response' in df_eval.columns else 0
        st.metric("Avg Response Length", f"{avg_response_length:.0f} chars")

    # The remaining three metrics come from the optional analysis table:
    # (column slot, label, analysis column, value template)
    analysis_metrics = (
        (col2, "Avg Word Count", 'word_count', "{:.0f} words"),
        (col3, "Avg Readability", 'readability_score', "{:.1f}/10"),
        (col4, "Avg Sentiment", 'sentiment_score', "{:.1f}/10"),
    )
    for slot, label, field, template in analysis_metrics:
        with slot:
            if has_analysis:
                st.metric(label, template.format(df_analysis[field].mean()))
            else:
                st.metric(label, "N/A")

    # Response analysis charts
    if has_analysis:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("π Response Length Distribution")
            fig = px.histogram(
                df_analysis,
                x='response_length',
                nbins=20,
                title="Response Length Distribution",
                labels={'response_length': 'Response Length (characters)', 'count': 'Frequency'}
            )
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            st.subheader("π Readability vs Sentiment")
            fig = px.scatter(
                df_analysis,
                x='readability_score',
                y='sentiment_score',
                color='agent_name',
                title="Readability vs Sentiment by Agent",
                labels={'readability_score': 'Readability Score', 'sentiment_score': 'Sentiment Score'}
            )
            st.plotly_chart(fig, use_container_width=True)

        # Response type analysis
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("π·οΈ Response Types")
            response_types = df_analysis['response_type'].value_counts()
            fig = px.pie(
                values=response_types.values,
                names=response_types.index,
                title="Distribution of Response Types"
            )
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            st.subheader("π Key Topics")
            # key_topics holds comma-separated topic lists. Strip the
            # whitespace around each topic so "a, b" and "a,b" count the
            # same topic, and drop empty fragments (e.g. trailing commas).
            all_topics = []
            for topics in df_analysis['key_topics'].dropna():
                for topic in str(topics).split(','):
                    cleaned = topic.strip()
                    if cleaned:
                        all_topics.append(cleaned)

            if all_topics:
                topic_counts = pd.Series(all_topics).value_counts().head(10)
                fig = px.bar(
                    x=topic_counts.values,
                    y=topic_counts.index,
                    orientation='h',
                    title="Top 10 Key Topics",
                    labels={'x': 'Frequency', 'y': 'Topics'}
                )
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.info("No topic data available")

    # Response tracing section
    st.subheader("π Response Tracing")

    # Search functionality
    search_term = st.text_input("π Search in responses:", placeholder="Enter keywords to search...")
    if search_term:
        # regex=False: match the user's text literally. With the default
        # regex=True, input such as "c++" or "(" is parsed as a pattern and
        # raises re.error.
        mask = df_eval['response'].str.contains(search_term, case=False, na=False, regex=False)
        filtered_responses = df_eval[mask]
    else:
        filtered_responses = df_eval.head(10)  # Show first 10 by default

    # Optional per-dimension scores shown when the column exists
    optional_scores = (
        ("Relevance", 'relevance_score'),
        ("Accuracy", 'accuracy_score'),
        ("Completeness", 'completeness_score'),
        ("Coherence", 'coherence_score'),
    )

    # Display responses with details
    if not filtered_responses.empty:
        st.write(f"**Found {len(filtered_responses)} responses**")
        for idx, row in filtered_responses.iterrows():
            # str(): session ids that are not strings cannot be sliced directly
            session_prefix = str(row['session_id'])[:8]
            with st.expander(f"π€ {row['agent_name']} - Session: {session_prefix}... - Score: {row['overall_score']:.1f}"):
                col1, col2 = st.columns([2, 1])
                with col1:
                    st.write("**Query:**")
                    st.write(row['query'])
                    st.write("**Response:**")
                    st.write(row['response'])
                with col2:
                    st.write("**Evaluation Scores:**")
                    st.write(f"Overall: {row['overall_score']:.1f}/10")
                    for label, field in optional_scores:
                        if field in row:
                            st.write(f"{label}: {row[field]:.1f}/10")
                    st.write("**Metadata:**")
                    st.write(f"Timestamp: {row['timestamp']}")
                    st.write(f"Response Time: {row['execution_time_ms']:.0f}ms")
                    st.write(f"Safety: {'β Passed' if row['guardrails_passed'] else 'β Failed'}")

                    # Show response analysis if available: rows are linked
                    # through evaluation_id == the evaluation's id.
                    if has_analysis:
                        analysis_row = df_analysis[df_analysis['evaluation_id'] == row['id']]
                        if not analysis_row.empty:
                            analysis = analysis_row.iloc[0]
                            st.write("**Response Analysis:**")
                            st.write(f"Length: {analysis['response_length']} chars")
                            st.write(f"Words: {analysis['word_count']}")
                            st.write(f"Readability: {analysis['readability_score']:.1f}/10")
                            st.write(f"Sentiment: {analysis['sentiment_score']:.1f}/10")
                            st.write(f"Type: {analysis['response_type']}")
                            st.write(f"Topics: {analysis['key_topics']}")
    else:
        st.info("No responses found matching your search criteria.")

    # Export response data
    st.subheader("π€ Export Response Data")
    col1, col2 = st.columns(2)
    with col1:
        if st.button("π Export Evaluation Data"):
            csv = df_eval.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name="evaluation_responses.csv",
                mime="text/csv"
            )
    with col2:
        if has_analysis and st.button("π Export Analysis Data"):
            csv = df_analysis.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name="response_analysis.csv",
                mime="text/csv"
            )
def show_advanced_analytics(self, data: Dict[str, pd.DataFrame]):
    """Render trend, correlation, benchmarking, insight and anomaly sections.

    Args:
        data: Mapping of dataset name to DataFrame. ``data['evaluations']``
            is required (an empty frame only produces a warning);
            ``data['response_analysis']`` is optional and is passed through
            to ``generate_ai_insights``.
    """
    st.header("π¬ Advanced Analytics & AI Insights")

    if data['evaluations'].empty:
        st.warning("No evaluation data available")
        return

    df_eval = data['evaluations']
    df_analysis = data.get('response_analysis', pd.DataFrame())

    # ---- Performance trends and predictions -------------------------------
    st.subheader("π Performance Trends & Predictions")
    trend_slot, corr_slot = st.columns(2)

    with trend_slot:
        st.write("**π Score Trends Over Time**")

        # Per-day aggregates, then a 7-day rolling mean of the daily score
        per_day = df_eval.groupby(df_eval['timestamp'].dt.date).agg({
            'overall_score': ['mean', 'count'],
            'execution_time_ms': 'mean'
        }).reset_index()
        per_day.columns = ['date', 'avg_score', 'count', 'avg_time']
        per_day['score_ma'] = per_day['avg_score'].rolling(window=7, min_periods=1).mean()

        daily_trace = go.Scatter(
            x=per_day['date'],
            y=per_day['avg_score'],
            mode='lines+markers',
            name='Daily Score',
            line=dict(color='lightblue', width=1),
            opacity=0.7
        )
        smoothed_trace = go.Scatter(
            x=per_day['date'],
            y=per_day['score_ma'],
            mode='lines',
            name='7-Day Moving Average',
            line=dict(color='red', width=3)
        )
        trend_fig = go.Figure()
        trend_fig.add_trace(daily_trace)
        trend_fig.add_trace(smoothed_trace)
        trend_fig.update_layout(
            title="Score Trends with Moving Average",
            xaxis_title="Date",
            yaxis_title="Score",
            height=400
        )
        st.plotly_chart(trend_fig, use_container_width=True)

    with corr_slot:
        st.write("**β‘ Performance Correlation Matrix**")

        # Correlate whichever of these metric columns are present
        candidate_cols = ['overall_score', 'relevance_score', 'accuracy_score',
                          'completeness_score', 'coherence_score', 'execution_time_ms']
        present_cols = [name for name in candidate_cols if name in df_eval.columns]

        if len(present_cols) <= 2:
            st.info("Need more metrics for correlation analysis")
        else:
            corr_fig = px.imshow(
                df_eval[present_cols].corr(),
                title="Performance Metrics Correlation",
                color_continuous_scale='RdBu',
                aspect="auto"
            )
            corr_fig.update_layout(height=400)
            st.plotly_chart(corr_fig, use_container_width=True)

    # ---- Agent comparison and benchmarking --------------------------------
    st.subheader("π Agent Benchmarking & Competition")
    board_slot, box_slot, tradeoff_slot = st.columns(3)

    with board_slot:
        st.write("**π₯ Agent Leaderboard**")

        board = df_eval.groupby('agent_name').agg({
            'overall_score': ['mean', 'std', 'count'],
            'execution_time_ms': 'mean'
        }).round(2)
        board.columns = ['Avg Score', 'Score StdDev', 'Total Evals', 'Avg Time (ms)']
        # Efficiency = score points per second of average response time
        avg_seconds = board['Avg Time (ms)'] / 1000
        board['Efficiency'] = (board['Avg Score'] / avg_seconds).round(2)
        board = board.sort_values('Avg Score', ascending=False)

        # Add rank and medals: three podium medals, a generic one afterwards
        agent_total = len(board)
        board['Rank'] = range(1, agent_total + 1)
        podium = ['π₯', 'π₯', 'π₯']
        runners_up = ['π '] * (agent_total - 3)
        board['Medal'] = (podium + runners_up)[:agent_total]

        st.dataframe(board[['Medal', 'Rank', 'Avg Score', 'Efficiency', 'Total Evals']], use_container_width=True)

    with box_slot:
        st.write("**π Performance Distribution**")

        box_fig = px.box(
            df_eval,
            x='agent_name',
            y='overall_score',
            title="Score Distribution by Agent",
            color='agent_name'
        )
        box_fig.update_layout(height=300, showlegend=False)
        st.plotly_chart(box_fig, use_container_width=True)

    with tradeoff_slot:
        st.write("**β‘ Speed vs Quality**")

        per_agent = df_eval.groupby('agent_name').agg({
            'overall_score': 'mean',
            'execution_time_ms': 'mean'
        }).reset_index()

        tradeoff_fig = px.scatter(
            per_agent,
            x='execution_time_ms',
            y='overall_score',
            size='overall_score',
            color='agent_name',
            title="Speed vs Quality Trade-off",
            labels={'execution_time_ms': 'Response Time (ms)', 'overall_score': 'Quality Score'}
        )
        tradeoff_fig.update_layout(height=300)
        st.plotly_chart(tradeoff_fig, use_container_width=True)

    # ---- AI-powered insights and recommendations ---------------------------
    st.subheader("π€ AI-Powered Insights & Recommendations")

    report = self.generate_ai_insights(df_eval, df_analysis)

    insight_slot, advice_slot = st.columns(2)
    with insight_slot:
        st.write("**π‘ Key Insights**")
        for insight in report['insights']:
            st.info(f"π {insight}")
    with advice_slot:
        st.write("**π Recommendations**")
        for rec in report['recommendations']:
            st.success(f"π‘ {rec}")

    # ---- Performance anomaly detection -------------------------------------
    st.subheader("π Anomaly Detection")

    anomalies = self.detect_anomalies(df_eval)
    if not anomalies:
        st.success("β No performance anomalies detected")
    else:
        st.warning(f"β οΈ Detected {len(anomalies)} potential anomalies:")
        for anomaly in anomalies:
            st.write(f"β’ {anomaly}")

    # ---- Real-time monitoring simulation -----------------------------------
    st.subheader("π‘ Real-time Monitoring Simulation")

    if st.button("π Simulate Real-time Update"):
        # The values come from simulate_realtime_data, not from the database
        reading = self.simulate_realtime_data()

        score_slot, time_slot, safety_slot = st.columns(3)
        with score_slot:
            st.metric("Latest Score", f"{reading['score']:.2f}", f"{reading['score_delta']:+.2f}")
        with time_slot:
            st.metric("Response Time", f"{reading['time']:.0f}ms", f"{reading['time_delta']:+.0f}ms")
        with safety_slot:
            st.metric("Safety Status", "β Passed" if reading['safe'] else "β Failed")

        st.success("π Dashboard updated with latest data!")
def generate_ai_insights(self, df_eval, df_analysis):
    """Derive rule-based insight and recommendation strings from the data.

    Args:
        df_eval: Evaluation rows. Reads ``agent_name``, ``overall_score``,
            ``timestamp`` (datetime-like), ``execution_time_ms`` and
            ``guardrails_passed``.
        df_analysis: Response-analysis rows; ``readability_score`` is read
            only when the frame is non-empty.

    Returns:
        ``{'insights': [...], 'recommendations': [...]}`` where both values
        are lists of display strings. Both lists are empty when ``df_eval``
        has no rows.
    """
    insights = []
    recommendations = []

    # idxmax()/idxmin() and the pass-rate division below cannot work on an
    # empty frame, so there is nothing to report.
    if df_eval.empty:
        return {'insights': insights, 'recommendations': recommendations}

    # Performance insights
    agent_means = df_eval.groupby('agent_name')['overall_score'].mean().dropna()
    have_ranking = not agent_means.empty  # False when every score is missing
    best_agent = agent_means.idxmax() if have_ranking else None
    worst_agent = agent_means.idxmin() if have_ranking else None
    avg_score = df_eval['overall_score'].mean()

    # Compare the mean of the latest days with the mean of the earliest
    # days. The window is capped at half the series so the two slices never
    # overlap (fixed 3-day slices coincide on short histories and always
    # report a zero change). With six or more days this is 3 vs 3 days.
    score_trend = df_eval.groupby(df_eval['timestamp'].dt.date)['overall_score'].mean()
    window = min(3, len(score_trend) // 2)
    if window:
        recent_trend = score_trend.iloc[-window:].mean() - score_trend.iloc[:window].mean()
        if recent_trend > 0.5:
            insights.append(f"Performance is improving! Recent scores are {recent_trend:.1f} points higher than earlier.")
        elif recent_trend < -0.5:
            insights.append(f"Performance decline detected. Recent scores are {abs(recent_trend):.1f} points lower.")

    # Agent insights
    if have_ranking:
        insights.append(f"{best_agent} is the top performer with highest average scores.")

    # Same boundaries as the sidebar's performance tiers:
    # Excellent (8.5+), Good (7.0-8.5), Needs Improvement (<7.0).
    if avg_score >= 8.5:
        tier = 'Excellent'
    elif avg_score >= 7.0:
        tier = 'Good'
    else:
        tier = 'Needs Improvement'
    insights.append(f"Overall system performance: {avg_score:.1f}/10 - {tier}")

    # Response time insights
    avg_time = df_eval['execution_time_ms'].mean()
    if avg_time > 2000:
        insights.append(f"Response times are high (avg: {avg_time:.0f}ms). Consider optimization.")

    # Safety insights
    safety_rate = (df_eval['guardrails_passed'].sum() / len(df_eval)) * 100
    if safety_rate < 95:
        insights.append(f"Safety pass rate is {safety_rate:.1f}% - below recommended 95% threshold.")

    # Recommendations
    if have_ranking and worst_agent != best_agent:
        recommendations.append(f"Consider retraining {worst_agent} using patterns from {best_agent}")
    if avg_time > 1500:
        recommendations.append("Implement caching or optimize model inference to reduce response times")
    recommendations.append("Schedule regular performance reviews every 2 weeks")
    recommendations.append("Set up automated alerts for scores below 7.0 or response times above 3 seconds")

    if not df_analysis.empty:
        avg_readability = df_analysis['readability_score'].mean()
        if avg_readability < 6:
            recommendations.append("Improve response readability - consider simpler language and shorter sentences")

    return {'insights': insights, 'recommendations': recommendations}
def detect_anomalies(self, df_eval):
    """Summarise score, latency and safety outliers as human-readable strings.

    Scores are flagged on either side of the 1.5 * IQR fences, response
    times only above the upper fence, and safety only when more than 10% of
    the rows failed their guardrail check.

    Args:
        df_eval: Evaluation rows with ``overall_score``,
            ``execution_time_ms`` and ``guardrails_passed`` columns.

    Returns:
        List of anomaly descriptions; empty when nothing was flagged.
    """
    findings = []
    row_total = len(df_eval)

    # Scores: two-sided IQR fences
    scores = df_eval['overall_score']
    score_q1, score_q3 = scores.quantile(0.25), scores.quantile(0.75)
    score_iqr = score_q3 - score_q1
    too_low = scores < score_q1 - 1.5 * score_iqr
    too_high = scores > score_q3 + 1.5 * score_iqr
    odd_score_total = int((too_low | too_high).sum())
    if odd_score_total > 0:
        findings.append(f"{odd_score_total} evaluations with unusual scores detected")

    # Response times: only the slow tail counts
    durations = df_eval['execution_time_ms']
    time_q1, time_q3 = durations.quantile(0.25), durations.quantile(0.75)
    time_iqr = time_q3 - time_q1
    slow_total = int((durations > time_q3 + 1.5 * time_iqr).sum())
    if slow_total > 0:
        findings.append(f"{slow_total} evaluations with unusually long response times")

    # Safety: report only when failures exceed 10% of all rows
    failure_total = int(df_eval['guardrails_passed'].eq(False).sum())
    if failure_total > row_total * 0.1:
        findings.append(f"High safety failure rate: {failure_total} failures out of {row_total} evaluations")

    return findings
def simulate_realtime_data(self):
    """Return one randomly generated "latest evaluation" reading.

    Returns:
        Dict with ``score`` (7.0-9.5), ``score_delta`` (-0.5 to 0.5),
        ``time`` in ms (500-2000), ``time_delta`` in ms (-200 to 200) and a
        boolean ``safe`` flag.
    """
    import random

    # Values are drawn in the same order as the keys of the result.
    score = random.uniform(7.0, 9.5)
    score_delta = random.uniform(-0.5, 0.5)
    elapsed_ms = random.uniform(500, 2000)
    elapsed_delta = random.uniform(-200, 200)
    # Three of the four outcomes are True, i.e. 75% safe
    is_safe = random.choice((True, True, True, False))

    return {
        'score': score,
        'score_delta': score_delta,
        'time': elapsed_ms,
        'time_delta': elapsed_delta,
        'safe': is_safe,
    }
| def show_workflow_visualization(self, data: Dict[str, pd.DataFrame]): | |
| """Show workflow visualization with queries, scores, latency, hallucination, and token consumption""" | |
| st.header("π Workflow Visualization") | |
| df_eval = data['evaluations'] | |
| if df_eval.empty: | |
| st.warning("No evaluation data available for workflow visualization.") | |
| return | |
| # Create workflow selection | |
| col1, col2 = st.columns([1, 1]) | |
| with col1: | |
| sessions = df_eval['session_id'].unique() | |
| selected_session = st.selectbox("Select Session", sessions, key="workflow_session") | |
| with col2: | |
| agents = df_eval['agent_name'].unique() | |
| selected_agent = st.selectbox("Select Agent (Optional)", ['All'] + list(agents), key="workflow_agent") | |
| # Filter data | |
| session_data = df_eval[df_eval['session_id'] == selected_session] | |
| if selected_agent != 'All': | |
| session_data = session_data[session_data['agent_name'] == selected_agent] | |
| if session_data.empty: | |
| st.warning("No data found for selected filters.") | |
| return | |
| # Create workflow diagram | |
| st.subheader("π Workflow Flow Diagram") | |
| # Generate Mermaid diagram | |
| mermaid_diagram = self.create_workflow_diagram(session_data) | |
| # Display the diagram using markdown (since create_diagram might not be available) | |
| st.markdown("```mermaid\n" + mermaid_diagram + "\n```") | |
| # Workflow metrics overview | |
| st.subheader("π Session Metrics Overview") | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| avg_score = session_data['overall_score'].mean() | |
| st.metric("Avg Overall Score", f"{avg_score:.2f}/10", | |
| delta=f"{avg_score - 7.5:.2f}" if avg_score > 7.5 else f"{avg_score - 7.5:.2f}") | |
| with col2: | |
| avg_latency = session_data['execution_time_ms'].mean() | |
| st.metric("Avg Response Time", f"{avg_latency:.0f}ms", | |
| delta=f"{avg_latency - 3000:.0f}ms" if avg_latency < 3000 else f"+{avg_latency - 3000:.0f}ms") | |
| with col3: | |
| avg_hallucination = session_data['hallucination_score'].mean() if 'hallucination_score' in session_data.columns else 0 | |
| st.metric("Avg Hallucination", f"{avg_hallucination:.2f}/10", | |
| delta=f"{5.0 - avg_hallucination:.2f}" if avg_hallucination < 5.0 else f"-{avg_hallucination - 5.0:.2f}") | |
| with col4: | |
| total_tokens = session_data['total_tokens'].sum() if 'total_tokens' in session_data.columns else 0 | |
| total_cost = session_data['cost_usd'].sum() if 'cost_usd' in session_data.columns else 0 | |
| st.metric("Total Cost", f"${total_cost:.4f}", f"{total_tokens:,} tokens") | |
| # Detailed workflow steps | |
| st.subheader("π Detailed Workflow Steps") | |
| for idx, row in session_data.iterrows(): | |
| with st.expander(f"Step {idx + 1}: {row['agent_name']} - Score: {row['overall_score']:.2f}/10"): | |
| # Query and Response | |
| col1, col2 = st.columns([1, 1]) | |
| with col1: | |
| st.markdown("**Query:**") | |
| st.write(row['query']) | |
| # Performance metrics | |
| st.markdown("**Performance Metrics:**") | |
| metrics_data = { | |
| 'Overall Score': row['overall_score'], | |
| 'Relevance': row['relevance_score'], | |
| 'Accuracy': row['accuracy_score'], | |
| 'Completeness': row['completeness_score'], | |
| 'Coherence': row['coherence_score'], | |
| 'Hallucination': row.get('hallucination_score', 0), | |
| 'Safety': row['safety_score'] | |
| } | |
| # Create a bar chart for scores | |
| import plotly.graph_objects as go | |
| fig = go.Figure(data=[ | |
| go.Bar(x=list(metrics_data.keys()), y=list(metrics_data.values()), | |
| marker_color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2']) | |
| ]) | |
| fig.update_layout( | |
| title="Score Breakdown", | |
| yaxis_title="Score (0-10)", | |
| height=300, | |
| showlegend=False | |
| ) | |
| st.plotly_chart(fig, use_container_width=True) | |
| with col2: | |
| st.markdown("**Response:**") | |
| st.write(row['response']) | |
| # Token and cost information | |
| st.markdown("**Resource Consumption:**") | |
| token_col1, token_col2 = st.columns(2) | |
| with token_col1: | |
| input_tokens = row.get('input_tokens', 0) | |
| output_tokens = row.get('output_tokens', 0) | |
| st.metric("Input Tokens", f"{input_tokens:,}") | |
| st.metric("Output Tokens", f"{output_tokens:,}") | |
| with token_col2: | |
| total_tokens = row.get('total_tokens', 0) | |
| cost = row.get('cost_usd', 0) | |
| st.metric("Total Tokens", f"{total_tokens:,}") | |
| st.metric("Cost", f"${cost:.4f}") | |
| # Execution details | |
| st.markdown("**Execution Details:**") | |
| exec_time = row['execution_time_ms'] | |
| llm_provider = row.get('llm_provider', 'Unknown') | |
| model_name = row.get('model_name', 'Unknown') | |
| st.write(f"β±οΈ **Execution Time:** {exec_time:.0f}ms") | |
| st.write(f"π€ **LLM Provider:** {llm_provider}") | |
| st.write(f"π§ **Model:** {model_name}") | |
| st.write(f"π‘οΈ **Safety Passed:** {'β ' if row['guardrails_passed'] else 'β'}") | |
| # Comparative analysis | |
| st.subheader("π Comparative Analysis") | |
| # Create comparison charts | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| # Score comparison | |
| fig = go.Figure() | |
| score_columns = ['overall_score', 'relevance_score', 'accuracy_score', 'completeness_score', 'coherence_score'] | |
| if 'hallucination_score' in session_data.columns: | |
| score_columns.append('hallucination_score') | |
| for i, (idx, row) in enumerate(session_data.iterrows()): | |
| fig.add_trace(go.Scatterpolar( | |
| r=[row[col] for col in score_columns], | |
| theta=[col.replace('_score', '').title() for col in score_columns], | |
| fill='toself', | |
| name=f"{row['agent_name']} - Step {i+1}" | |
| )) | |
| fig.update_layout( | |
| polar=dict( | |
| radialaxis=dict( | |
| visible=True, | |
| range=[0, 10] | |
| )), | |
| showlegend=True, | |
| title="Score Comparison Radar Chart" | |
| ) | |
| st.plotly_chart(fig, use_container_width=True) | |
| with col2: | |
| # Token consumption over steps | |
| if 'total_tokens' in session_data.columns: | |
| fig = go.Figure() | |
| steps = [f"Step {i+1}" for i in range(len(session_data))] | |
| fig.add_trace(go.Bar( | |
| x=steps, | |
| y=session_data['total_tokens'], | |
| name='Total Tokens', | |
| marker_color='lightblue' | |
| )) | |
| fig.add_trace(go.Scatter( | |
| x=steps, | |
| y=session_data['execution_time_ms'], | |
| yaxis='y2', | |
| name='Response Time (ms)', | |
| line=dict(color='red', width=2), | |
| mode='lines+markers' | |
| )) | |
| fig.update_layout( | |
| title="Token Consumption vs Response Time", | |
| xaxis_title="Workflow Steps", | |
| yaxis_title="Total Tokens", | |
| yaxis2=dict( | |
| title="Response Time (ms)", | |
| overlaying='y', | |
| side='right' | |
| ), | |
| height=400 | |
| ) | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Session summary | |
| st.subheader("π Session Summary") | |
| summary_col1, summary_col2, summary_col3 = st.columns(3) | |
| with summary_col1: | |
| st.markdown("**Quality Metrics:**") | |
| st.write(f"β’ Average Overall Score: {session_data['overall_score'].mean():.2f}/10") | |
| st.write(f"β’ Best Performing Step: {session_data.loc[session_data['overall_score'].idxmax(), 'agent_name']}") | |
| st.write(f"β’ Consistency (Std Dev): {session_data['overall_score'].std():.2f}") | |
| with summary_col2: | |
| st.markdown("**Performance Metrics:**") | |
| st.write(f"β’ Total Execution Time: {session_data['execution_time_ms'].sum():.0f}ms") | |
| st.write(f"β’ Average Response Time: {session_data['execution_time_ms'].mean():.0f}ms") | |
| st.write(f"β’ Fastest Step: {session_data['execution_time_ms'].min():.0f}ms") | |
| with summary_col3: | |
| st.markdown("**Resource Usage:**") | |
| if 'total_tokens' in session_data.columns: | |
| st.write(f"β’ Total Tokens Used: {session_data['total_tokens'].sum():,}") | |
| st.write(f"β’ Total Cost: ${session_data['cost_usd'].sum():.4f}") | |
| st.write(f"β’ Avg Cost per Query: ${session_data['cost_usd'].mean():.4f}") | |
| else: | |
| st.write("β’ Token data not available") | |
| # Export functionality | |
| st.subheader("π€ Export Workflow Data") | |
| if st.button("Export Session Data to CSV", key="export_workflow"): | |
| csv_data = session_data.to_csv(index=False) | |
| st.download_button( | |
| label="Download CSV", | |
| data=csv_data, | |
| file_name=f"workflow_session_{selected_session}.csv", | |
| mime="text/csv" | |
| ) | |
def create_workflow_diagram(self, session_data):
    """Create a Mermaid flowchart describing the steps of one session.

    Every row of ``session_data`` becomes a step node labelled with the agent
    name, overall score and execution time, followed by a ``Result`` decision
    node. Steps are chained in row order between a ``Start`` and an ``End``
    node. Step nodes are coloured by score tier: >= 8.5 green, >= 7.0 orange,
    anything else pink.

    Args:
        session_data: DataFrame with ``agent_name``, ``overall_score`` and
            ``execution_time_ms`` columns, one row per step, already in the
            order the steps should be drawn.

    Returns:
        The Mermaid ``graph TD`` source as a newline-terminated string. For an
        empty frame the diagram is just ``Start --> End``.
    """
    lines = ["graph TD", " Start([Session Start])"]

    # Node that the next step (or the End node) hangs off. Starting at
    # "Start" keeps the graph connected even when there are no rows.
    previous = "Start"

    for number, (_, row) in enumerate(session_data.iterrows(), start=1):
        step_id = f"Step{number}"
        # The label is emitted inside double quotes, so a literal quote in the
        # agent name must be written as Mermaid's "#quot;" entity code.
        agent_name = str(row['agent_name']).replace(' ', '_').replace('"', '#quot;')
        score = row['overall_score']
        exec_time = row['execution_time_ms']

        # Class (and therefore fill colour) based on score tier.
        if score >= 8.5:
            step_class = "excellentStep"  # Light green
        elif score >= 7.0:
            step_class = "goodStep"  # Light orange
        else:
            step_class = "weakStep"  # Light pink

        lines.append(
            f" {step_id}[\"{agent_name}<br/>Score: {score:.1f}/10<br/>Time: {exec_time:.0f}ms\"]"
        )
        lines.append(f" {step_id} --> {step_id}_result{{Result}}")
        lines.append(f" {previous} --> {step_id}")
        lines.append(f" class {step_id} {step_class};")
        previous = f"{step_id}_result"

    # Add end node
    lines.append(f" {previous} --> End([Session End])")

    # Add class definitions: one per score tier, sharing the same outline.
    lines.append(" classDef excellentStep fill:#90EE90,stroke:#01579b,stroke-width:2px;")
    lines.append(" classDef goodStep fill:#FFE4B5,stroke:#01579b,stroke-width:2px;")
    lines.append(" classDef weakStep fill:#FFB6C1,stroke:#01579b,stroke-width:2px;")
    return "\n".join(lines) + "\n"
def run(self):
    """Run the dashboard.

    Renders the page top to bottom: title and intro banner, sidebar filters,
    the six analysis tabs, the "Quick Actions" sidebar section, a summary of
    the filtered records, and the footer.

    Data is loaded twice: once unfiltered so ``create_sidebar_filters`` can
    build its widgets from it, then again with the date / agent / session
    selections passed to ``load_data``. The remaining selections are applied
    in memory by ``_apply_result_filters``.
    """
    st.title("π€ Multi-Agent System Dashboard - Demo")
    st.markdown("---")

    # Demo info
    st.info("π **Welcome to the Multi-Agent System Dashboard Demo!** This showcases a comprehensive evaluation system with LLM judge scoring, safety guardrails, and advanced analytics for Diet, Support, and Queries agents.")

    # Load initial data
    initial_data = self.load_data()

    # Create filters
    filters = self.create_sidebar_filters(initial_data)

    # Apply filters and reload data
    filtered_data = self.load_data(
        date_filter=filters.get('date_range'),
        agent_filter=filters.get('agents'),
        session_filter=filters.get('session'),
    )

    # Apply additional filters
    if not filtered_data['evaluations'].empty:
        filtered_data['evaluations'] = self._apply_result_filters(
            filtered_data['evaluations'], filters
        )

    # Create tabs
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
        "π Executive Summary",
        "π€ Agent Performance",
        "π‘οΈ Safety Analysis",
        "π Response Analysis",
        "π¬ Advanced Analytics",
        "π Workflow Visualization",
    ])
    with tab1:
        self.show_executive_summary(filtered_data)
    with tab2:
        self.show_agent_performance(filtered_data)
    with tab3:
        self.show_safety_analysis(filtered_data)
    with tab4:
        self.show_response_analysis(filtered_data)
    with tab5:
        self.show_advanced_analytics(filtered_data)
    with tab6:
        self.show_workflow_visualization(filtered_data)

    # Quick actions sidebar
    st.sidebar.markdown("---")
    st.sidebar.markdown("### β‘ Quick Actions")
    if st.sidebar.button("π Generate Report"):
        st.sidebar.success("π Report generated!")
        # Could generate PDF report here
    if st.sidebar.button("π Refresh Data"):
        st.sidebar.success("π Data refreshed!")
        # st.experimental_rerun() is deprecated and absent from current
        # Streamlit releases; prefer st.rerun() and fall back only on old
        # versions that do not provide it yet.
        rerun = getattr(st, "rerun", None) or getattr(st, "experimental_rerun")
        rerun()
    if st.sidebar.button("π§ Send Alert"):
        st.sidebar.success("π§ Alert sent to team!")

    # Data summary in sidebar
    evaluations = filtered_data['evaluations']
    if not evaluations.empty:
        st.sidebar.markdown("### π Current Session")
        st.sidebar.metric("Filtered Records", len(evaluations))
        st.sidebar.metric("Avg Score", f"{evaluations['overall_score'].mean():.2f}")
        st.sidebar.metric("Success Rate", f"{(evaluations['guardrails_passed'].sum() / len(evaluations) * 100):.1f}%")

    # Footer
    st.markdown("---")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.markdown("π **Multi-Agent System Dashboard**")
    with col2:
        st.markdown("Built with Streamlit & Plotly")
    with col3:
        if st.button("βΉοΈ About"):
            # Assembled line by line so the text does not depend on how this
            # method is indented in the file.
            about_text = (
                "\n"
                "**Multi-Agent System Dashboard v2.0**\n"
                "Features:\n"
                "- π Real-time monitoring\n"
                "- π€ AI-powered insights\n"
                "- π Advanced analytics\n"
                "- π Response tracing\n"
                "- π‘οΈ Safety monitoring\n"
                "- π Performance benchmarking\n"
                "Built for production-grade multi-agent systems.\n"
            )
            st.info(about_text)


def _apply_result_filters(self, df, filters):
    """Return ``df`` narrowed by the in-memory sidebar filters.

    Args:
        df: evaluations DataFrame; reads the ``overall_score``,
            ``guardrails_passed``, ``execution_time_ms`` and (when present)
            ``llm_provider`` columns.
        filters: mapping produced by ``create_sidebar_filters``. Keys read
            here, all optional: ``score_range`` (min, max),
            ``safety_only``, ``performance_tier``, ``max_response_time``,
            ``providers``.

    Returns:
        A filtered DataFrame; the input frame is not modified.
    """
    # Score range filter
    if 'score_range' in filters:
        score_min, score_max = filters['score_range']
        df = df[(df['overall_score'] >= score_min) & (df['overall_score'] <= score_max)]

    # Safety filter
    if filters.get('safety_only', False):
        # Element-wise comparison on a Series; `is True` would not work here.
        df = df[df['guardrails_passed'] == True]  # noqa: E712

    # Performance tier filter. A missing key is treated like "All" instead of
    # raising KeyError.
    tier = filters.get('performance_tier', "All")
    if tier == "Excellent (8.5+)":
        df = df[df['overall_score'] >= 8.5]
    elif tier == "Good (7.0-8.5)":
        df = df[(df['overall_score'] >= 7.0) & (df['overall_score'] < 8.5)]
    elif tier == "Needs Improvement (<7.0)":
        df = df[df['overall_score'] < 7.0]

    # Response time filter
    if 'max_response_time' in filters:
        df = df[df['execution_time_ms'] <= filters['max_response_time']]

    # Provider filter, skipped when the data has no provider column
    providers = filters.get('providers')
    if providers and 'llm_provider' in df.columns:
        df = df[df['llm_provider'].isin(providers)]

    return df
if __name__ == "__main__":
    # Script entry point: build the dashboard (its constructor prepares the
    # demo database) and render the page.
    HuggingFaceDashboard().run()