saptyfun commited on
Commit
f4c4c5f
Β·
verified Β·
1 Parent(s): d053b0b

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +740 -30
src/app.py CHANGED
@@ -81,9 +81,14 @@ class HuggingFaceDashboard:
81
  accuracy_score REAL,
82
  completeness_score REAL,
83
  coherence_score REAL,
 
84
  guardrails_passed BOOLEAN,
85
  safety_score REAL,
86
  execution_time_ms REAL,
 
 
 
 
87
  error_occurred BOOLEAN DEFAULT FALSE,
88
  llm_provider TEXT,
89
  model_name TEXT,
@@ -259,19 +264,19 @@ class HuggingFaceDashboard:
259
  # Generate realistic response
260
  response_templates = {
261
  "Diet Agent": [
262
- f"Based on your query about {query[:30]}..., I recommend focusing on balanced nutrition with emphasis on whole foods, proper portion sizes, and regular meal timing.",
263
- f"For your question regarding {query[:30]}..., here's a comprehensive approach that considers your nutritional needs and health goals.",
264
- f"Addressing your concern about {query[:30]}..., let me provide evidence-based dietary guidance tailored to your situation."
265
  ],
266
  "Support Agent": [
267
- f"I understand you're dealing with {query[:30]}... This is a common challenge, and I'm here to help you work through it step by step.",
268
- f"Thank you for sharing your concern about {query[:30]}... Let's explore some practical strategies that can make a real difference.",
269
- f"Your question about {query[:30]}... resonates with many people. Here are some effective approaches you can try."
270
  ],
271
  "Queries Agent": [
272
- f"Great question about {query[:30]}... This is a complex topic that involves several key concepts and recent developments.",
273
- f"To answer your query about {query[:30]}..., let me break this down into the fundamental principles and current applications.",
274
- f"Your question regarding {query[:30]}... touches on important technological and societal implications. Here's a comprehensive overview."
275
  ]
276
  }
277
 
@@ -281,26 +286,30 @@ class HuggingFaceDashboard:
281
  # Add specific details based on agent type
282
  if agent == "Diet Agent":
283
  details = [
284
- "Key recommendations: 1) Focus on whole foods, 2) Control portions, 3) Stay hydrated",
285
- "Nutritional guidelines: Aim for 50% vegetables, 25% lean protein, 25% complex carbs",
286
- "Meal timing: Consider eating every 3-4 hours to maintain stable blood sugar",
287
- "Sample foods: Quinoa, salmon, leafy greens, berries, nuts, and legumes"
 
288
  ]
289
  elif agent == "Support Agent":
290
  details = [
291
- "Action steps: 1) Identify triggers, 2) Develop coping strategies, 3) Practice regularly",
292
- "Techniques to try: Deep breathing, progressive muscle relaxation, mindfulness meditation",
293
- "Timeline: Start with 5-10 minutes daily, gradually increase as comfort grows",
294
- "Resources: Consider apps like Headspace, Calm, or consulting a professional"
 
295
  ]
296
  else: # Queries Agent
297
  details = [
298
- "Technical overview: This involves complex algorithms and data processing methods",
299
- "Current applications: Used in healthcare, finance, transportation, and entertainment",
300
- "Future implications: Expected to revolutionize how we work and interact with technology",
301
- "Key considerations: Privacy, security, ethical implications, and regulatory frameworks"
 
302
  ]
303
 
 
304
  response = f"{base_response}\n\n{random.choice(details)}"
305
 
306
  # Generate correlated scores (realistic relationships)
@@ -309,6 +318,23 @@ class HuggingFaceDashboard:
309
  completeness_score = max(0, min(10, base_score + random.uniform(-0.5, 0.3)))
310
  coherence_score = max(0, min(10, base_score + random.uniform(-0.2, 0.4)))
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  # Realistic safety scenarios
313
  safety_pass_rate = 0.95 # 95% pass rate
314
  if random.random() < 0.02: # 2% chance of safety issues
@@ -338,11 +364,16 @@ class HuggingFaceDashboard:
338
  accuracy_score, # accuracy_score
339
  completeness_score, # completeness_score
340
  coherence_score, # coherence_score
 
341
  guardrails_passed, # guardrails_passed
342
  safety_score, # safety_score
343
  execution_time, # execution_time_ms
 
 
 
 
344
  False, # error_occurred
345
- "azure", # llm_provider
346
  "gpt-4o", # model_name
347
  f"Comprehensive evaluation for {agent}: The response demonstrates good understanding of the query with appropriate depth and accuracy. Score breakdown reflects the quality across multiple dimensions.", # judge_reasoning
348
  guardrails_failures, # guardrails_failures
@@ -353,16 +384,17 @@ class HuggingFaceDashboard:
353
  INSERT INTO evaluation_logs (
354
  session_id, agent_name, query, response, overall_score,
355
  relevance_score, accuracy_score, completeness_score, coherence_score,
356
- guardrails_passed, safety_score, execution_time_ms, error_occurred,
 
357
  llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
358
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
359
  ''', eval_data)
360
 
361
  # Get the evaluation ID for response analysis
362
  evaluation_id = cursor.lastrowid
363
 
364
  # Insert detailed response analysis
365
- self.insert_response_analysis(cursor, evaluation_id, session_id, agent, response, timestamp)
366
 
367
  def insert_response_analysis(self, cursor, evaluation_id, session_id, agent_name, response_text, timestamp):
368
  """Insert detailed response analysis data"""
@@ -557,6 +589,47 @@ class HuggingFaceDashboard:
557
  value=False
558
  )
559
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
560
  return filters
561
 
562
  def show_executive_summary(self, data: Dict[str, pd.DataFrame]):
@@ -593,10 +666,31 @@ class HuggingFaceDashboard:
593
  st.metric("Unique Sessions", f"{unique_sessions:,}")
594
 
595
  # Performance trends
596
- st.subheader("πŸ“Š Performance Trends")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
 
598
  # Daily performance trend
599
- df_daily = df.groupby(df['timestamp'].dt.date).agg({
600
  'overall_score': 'mean',
601
  'execution_time_ms': 'mean',
602
  'guardrails_passed': lambda x: (x.sum() / len(x)) * 100
@@ -969,6 +1063,553 @@ class HuggingFaceDashboard:
969
  mime="text/csv"
970
  )
971
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
972
  def run(self):
973
  """Run the dashboard"""
974
  st.title("πŸ€– Multi-Agent System Dashboard - Demo")
@@ -1003,14 +1644,33 @@ class HuggingFaceDashboard:
1003
  if filters.get('safety_only', False):
1004
  df = df[df['guardrails_passed'] == True]
1005
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1006
  filtered_data['evaluations'] = df
1007
 
1008
  # Create tabs
1009
- tab1, tab2, tab3, tab4 = st.tabs([
1010
  "πŸ“ˆ Executive Summary",
1011
  "πŸ€– Agent Performance",
1012
  "πŸ›‘οΈ Safety Analysis",
1013
- "πŸ“ Response Analysis"
 
 
1014
  ])
1015
 
1016
  with tab1:
@@ -1025,9 +1685,59 @@ class HuggingFaceDashboard:
1025
  with tab4:
1026
  self.show_response_analysis(filtered_data)
1027
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1028
  # Footer
1029
  st.markdown("---")
1030
- st.markdown("πŸš€ **Multi-Agent System Dashboard** | Built with Streamlit & Plotly | Demo hosted on Hugging Face Spaces")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1031
 
1032
  if __name__ == "__main__":
1033
  dashboard = HuggingFaceDashboard()
 
81
  accuracy_score REAL,
82
  completeness_score REAL,
83
  coherence_score REAL,
84
+ hallucination_score REAL,
85
  guardrails_passed BOOLEAN,
86
  safety_score REAL,
87
  execution_time_ms REAL,
88
+ input_tokens INTEGER,
89
+ output_tokens INTEGER,
90
+ total_tokens INTEGER,
91
+ cost_usd REAL,
92
  error_occurred BOOLEAN DEFAULT FALSE,
93
  llm_provider TEXT,
94
  model_name TEXT,
 
264
  # Generate realistic response
265
  response_templates = {
266
  "Diet Agent": [
267
+ f"Thank you for your question about nutrition and dietary guidance. I'd be happy to help you develop a healthier relationship with food and create sustainable eating habits.",
268
+ f"I understand you're looking for dietary advice, and I'm here to provide evidence-based nutritional guidance tailored to your specific needs and goals.",
269
+ f"Great question about nutrition! Let me share some comprehensive dietary recommendations that can help you achieve better health outcomes."
270
  ],
271
  "Support Agent": [
272
+ f"I appreciate you reaching out for support. It takes courage to ask for help, and I'm here to provide you with practical strategies and emotional guidance.",
273
+ f"Thank you for sharing your concerns with me. I understand this can be challenging, and I want to help you work through this step by step with compassion and understanding.",
274
+ f"I'm glad you've come to me for support. Your feelings are valid, and together we can explore effective coping strategies and build resilience."
275
  ],
276
  "Queries Agent": [
277
+ f"Excellent question! This is a fascinating topic that involves cutting-edge technology and has significant implications for our future. Let me provide you with a comprehensive overview.",
278
+ f"Thank you for this thought-provoking question. This subject encompasses multiple disciplines and recent innovations. I'll break this down into key concepts and practical applications.",
279
+ f"Great inquiry! This is an evolving field with exciting developments. Let me explain the fundamental principles and explore the current state of research and implementation."
280
  ]
281
  }
282
 
 
286
  # Add specific details based on agent type
287
  if agent == "Diet Agent":
288
  details = [
289
+ "**Key Nutritional Recommendations:**\n\n1. **Whole Foods Focus**: Prioritize unprocessed foods like fresh fruits, vegetables, whole grains, lean proteins, and healthy fats. These provide essential nutrients and fiber while avoiding added sugars and preservatives.\n\n2. **Portion Control**: Use the plate method - fill half your plate with non-starchy vegetables, one quarter with lean protein, and one quarter with complex carbohydrates.\n\n3. **Hydration**: Aim for 8-10 glasses of water daily. Proper hydration supports metabolism, digestion, and overall health.\n\n4. **Meal Timing**: Eat regular meals every 3-4 hours to maintain stable blood sugar levels and prevent overeating.\n\n**Sample Daily Meal Plan:**\n- Breakfast: Greek yogurt with berries and nuts\n- Lunch: Quinoa salad with grilled chicken and vegetables\n- Dinner: Baked salmon with roasted sweet potatoes and broccoli\n- Snacks: Apple with almond butter, or handful of mixed nuts",
290
+
291
+ "**Evidence-Based Dietary Guidelines:**\n\n1. **Macronutrient Balance**: Aim for 45-65% carbohydrates (focus on complex carbs), 20-35% healthy fats, and 10-35% protein based on your activity level.\n\n2. **Micronutrient Density**: Choose foods rich in vitamins, minerals, and antioxidants. Include colorful fruits and vegetables to ensure variety.\n\n3. **Fiber Intake**: Target 25-35 grams daily through whole grains, legumes, fruits, and vegetables to support digestive health.\n\n4. **Healthy Fats**: Include omega-3 fatty acids from fish, walnuts, and flaxseeds, while limiting saturated and trans fats.\n\n**Practical Implementation Tips:**\n- Meal prep on weekends to ensure healthy options are available\n- Read nutrition labels to make informed choices\n- Practice mindful eating by eating slowly and paying attention to hunger cues\n- Keep a food diary to track patterns and identify areas for improvement",
292
+
293
+ "**Personalized Nutrition Approach:**\n\nEvery individual has unique nutritional needs based on age, gender, activity level, health conditions, and personal preferences. Here's how to customize your approach:\n\n1. **Assessment**: Consider your current health status, goals (weight management, energy levels, disease prevention), and any dietary restrictions.\n\n2. **Gradual Changes**: Implement changes slowly to ensure sustainability. Start with one or two modifications per week.\n\n3. **Professional Guidance**: Consider consulting with a registered dietitian for personalized meal planning, especially if you have specific health conditions.\n\n4. **Regular Monitoring**: Track your progress through energy levels, sleep quality, and how you feel overall, not just weight.\n\n**Common Nutritional Myths Debunked:**\n- Carbs aren't inherently bad - choose complex carbohydrates over simple sugars\n- Fat doesn't make you fat - healthy fats are essential for hormone production and nutrient absorption\n- Skipping meals doesn't help with weight loss and can lead to overeating later"
294
  ]
295
  elif agent == "Support Agent":
296
  details = [
297
+ "**Comprehensive Support Strategy:**\n\n**Immediate Coping Techniques:**\n1. **Deep Breathing**: Practice the 4-7-8 technique - inhale for 4 counts, hold for 7, exhale for 8. This activates your parasympathetic nervous system.\n\n2. **Grounding Exercises**: Use the 5-4-3-2-1 method - identify 5 things you can see, 4 you can touch, 3 you can hear, 2 you can smell, and 1 you can taste.\n\n3. **Progressive Muscle Relaxation**: Tense and release each muscle group from toes to head, holding tension for 5 seconds before releasing.\n\n**Long-term Strategies:**\n- Establish a consistent daily routine to provide structure and predictability\n- Practice mindfulness meditation for 10-15 minutes daily\n- Maintain a journal to process emotions and identify patterns\n- Build a support network of trusted friends, family, or support groups\n\n**Professional Resources:**\nConsider reaching out to mental health professionals if you're experiencing persistent difficulties. Many offer telehealth options for convenience.",
298
+
299
+ "**Building Emotional Resilience:**\n\n**Understanding Your Emotions:**\nEmotions are natural responses to life events. Learning to recognize, understand, and manage them is a skill that can be developed with practice.\n\n**Practical Steps:**\n1. **Emotion Identification**: Use an emotion wheel or journal to name specific feelings rather than general terms like 'bad' or 'stressed.'\n\n2. **Trigger Awareness**: Notice what situations, people, or thoughts tend to trigger difficult emotions.\n\n3. **Response vs. Reaction**: Create a pause between feeling and action. Ask yourself: 'What would be most helpful right now?'\n\n4. **Self-Compassion**: Treat yourself with the same kindness you'd offer a good friend facing similar challenges.\n\n**Daily Practices:**\n- Morning intention setting (5 minutes)\n- Midday check-in with your emotional state\n- Evening reflection on what went well and what you learned\n- Regular physical activity to support mental health\n\n**Crisis Resources:**\nIf you're experiencing thoughts of self-harm, please reach out immediately to a crisis hotline, emergency services, or trusted healthcare provider.",
300
+
301
+ "**Stress Management and Well-being:**\n\n**Understanding Stress:**\nStress is a normal part of life, but chronic stress can impact your physical and mental health. Learning effective management techniques is crucial for long-term well-being.\n\n**Evidence-Based Techniques:**\n1. **Cognitive Restructuring**: Challenge negative thought patterns by asking: 'Is this thought realistic? What evidence supports or contradicts it? What would I tell a friend in this situation?'\n\n2. **Time Management**: Use techniques like the Pomodoro method, prioritization matrices, and saying no to non-essential commitments.\n\n3. **Physical Self-Care**: Regular exercise, adequate sleep (7-9 hours), and proper nutrition form the foundation of stress resilience.\n\n4. **Social Connection**: Maintain relationships with supportive people. Even brief positive interactions can improve mood and reduce stress.\n\n**Creating Your Personal Toolkit:**\n- Identify 3-5 coping strategies that work best for you\n- Practice them regularly, not just during stressful times\n- Adjust and refine your approach based on what's most effective\n- Remember that seeking help is a sign of strength, not weakness"
302
  ]
303
  else: # Queries Agent
304
  details = [
305
+ "**Technical Deep Dive:**\n\n**Fundamental Concepts:**\nThis technology represents a convergence of multiple disciplines including computer science, mathematics, engineering, and domain-specific expertise. The underlying principles involve complex algorithms, data structures, and computational methods.\n\n**Current Implementation:**\n1. **Healthcare**: AI-powered diagnostic tools, personalized treatment plans, drug discovery acceleration, and robotic surgery assistance.\n\n2. **Finance**: Algorithmic trading, fraud detection, risk assessment, and automated customer service through chatbots.\n\n3. **Transportation**: Autonomous vehicles, traffic optimization, predictive maintenance, and route planning algorithms.\n\n4. **Entertainment**: Recommendation systems, content generation, virtual reality experiences, and interactive gaming.\n\n**Technical Architecture:**\n- Data processing pipelines that handle massive datasets in real-time\n- Machine learning models trained on diverse, high-quality datasets\n- Cloud infrastructure enabling scalable deployment and accessibility\n- APIs and interfaces that allow integration with existing systems\n\n**Performance Metrics:**\nSuccess is measured through accuracy rates, processing speed, user engagement, cost efficiency, and real-world impact on problem-solving.",
306
+
307
+ "**Industry Applications and Impact:**\n\n**Current Market Landscape:**\nThe technology sector is experiencing rapid transformation with significant investments in research and development. Major companies are competing to develop more efficient, ethical, and accessible solutions.\n\n**Real-World Applications:**\n1. **Smart Cities**: IoT sensors, traffic management, energy optimization, and public safety systems working together to improve urban living.\n\n2. **Environmental Monitoring**: Satellite imagery analysis, climate modeling, pollution tracking, and renewable energy optimization.\n\n3. **Education**: Personalized learning platforms, automated grading systems, virtual tutors, and accessibility tools for diverse learners.\n\n4. **Manufacturing**: Predictive maintenance, quality control, supply chain optimization, and human-robot collaboration.\n\n**Economic Impact:**\n- Job creation in new fields while transforming traditional roles\n- Increased productivity and efficiency across industries\n- New business models and revenue streams\n- Global competitiveness and innovation drivers\n\n**Challenges and Solutions:**\n- Addressing ethical concerns through responsible development practices\n- Ensuring data privacy and security through robust frameworks\n- Managing the digital divide through inclusive design and accessibility",
308
+
309
+ "**Future Implications and Trends:**\n\n**Emerging Developments:**\nThe field is evolving rapidly with breakthrough research in quantum computing, neuromorphic chips, and advanced algorithms that promise to solve previously intractable problems.\n\n**Next 5-10 Years:**\n1. **Integration**: Seamless integration across platforms and devices, creating more intuitive user experiences.\n\n2. **Personalization**: Highly customized solutions that adapt to individual preferences and needs in real-time.\n\n3. **Sustainability**: Green technology initiatives focusing on energy efficiency and environmental responsibility.\n\n4. **Accessibility**: Universal design principles ensuring technology benefits all users regardless of abilities or circumstances.\n\n**Societal Considerations:**\n- Regulatory frameworks evolving to balance innovation with consumer protection\n- Educational systems adapting to prepare workforce for technological changes\n- International cooperation on standards and ethical guidelines\n- Public discourse on the role of technology in society\n\n**Preparing for the Future:**\n- Continuous learning and skill development\n- Critical thinking about technology's role in daily life\n- Participation in discussions about technology policy and ethics\n- Understanding both opportunities and risks associated with technological advancement"
310
  ]
311
 
312
+ # Create a more comprehensive response
313
  response = f"{base_response}\n\n{random.choice(details)}"
314
 
315
  # Generate correlated scores (realistic relationships)
 
318
  completeness_score = max(0, min(10, base_score + random.uniform(-0.5, 0.3)))
319
  coherence_score = max(0, min(10, base_score + random.uniform(-0.2, 0.4)))
320
 
321
+ # Generate hallucination score (inverse relationship with accuracy)
322
+ hallucination_score = max(0, min(10, 10 - accuracy_score + random.uniform(-1.0, 1.0)))
323
+
324
+ # Generate token consumption based on response length and agent type
325
+ response_length = len(response)
326
+ input_tokens = len(query.split()) * 1.3 # Rough estimate
327
+ output_tokens = response_length / 4 # Rough estimate (4 chars per token)
328
+ total_tokens = int(input_tokens + output_tokens)
329
+
330
+ # Calculate cost (rough estimates per 1K tokens)
331
+ cost_per_1k_tokens = {
332
+ "azure": 0.03, # GPT-4
333
+ "openai": 0.03,
334
+ "anthropic": 0.025
335
+ }
336
+ cost_usd = (total_tokens / 1000) * cost_per_1k_tokens.get(llm_provider, 0.03)
337
+
338
  # Realistic safety scenarios
339
  safety_pass_rate = 0.95 # 95% pass rate
340
  if random.random() < 0.02: # 2% chance of safety issues
 
364
  accuracy_score, # accuracy_score
365
  completeness_score, # completeness_score
366
  coherence_score, # coherence_score
367
+ hallucination_score, # hallucination_score
368
  guardrails_passed, # guardrails_passed
369
  safety_score, # safety_score
370
  execution_time, # execution_time_ms
371
+ int(input_tokens), # input_tokens
372
+ int(output_tokens), # output_tokens
373
+ total_tokens, # total_tokens
374
+ round(cost_usd, 4), # cost_usd
375
  False, # error_occurred
376
+ llm_provider, # llm_provider
377
  "gpt-4o", # model_name
378
  f"Comprehensive evaluation for {agent}: The response demonstrates good understanding of the query with appropriate depth and accuracy. Score breakdown reflects the quality across multiple dimensions.", # judge_reasoning
379
  guardrails_failures, # guardrails_failures
 
384
  INSERT INTO evaluation_logs (
385
  session_id, agent_name, query, response, overall_score,
386
  relevance_score, accuracy_score, completeness_score, coherence_score,
387
+ hallucination_score, guardrails_passed, safety_score, execution_time_ms,
388
+ input_tokens, output_tokens, total_tokens, cost_usd, error_occurred,
389
  llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
390
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
391
  ''', eval_data)
392
 
393
  # Get the evaluation ID for response analysis
394
  evaluation_id = cursor.lastrowid
395
 
396
  # Insert detailed response analysis
397
+ self.insert_response_analysis(cursor, evaluation_id, eval_data[0], agent, response, timestamp)
398
 
399
  def insert_response_analysis(self, cursor, evaluation_id, session_id, agent_name, response_text, timestamp):
400
  """Insert detailed response analysis data"""
 
589
  value=False
590
  )
591
 
592
+ # Advanced filters
593
+ st.sidebar.markdown("### πŸ”¬ Advanced Filters")
594
+
595
+ # Performance tier filter
596
+ filters['performance_tier'] = st.sidebar.selectbox(
597
+ "πŸ“Š Performance Tier",
598
+ options=["All", "Excellent (8.5+)", "Good (7.0-8.5)", "Needs Improvement (<7.0)"],
599
+ index=0
600
+ )
601
+
602
+ # Response time filter
603
+ if not data['evaluations'].empty:
604
+ max_time = data['evaluations']['execution_time_ms'].max()
605
+ filters['max_response_time'] = st.sidebar.slider(
606
+ "⏱️ Max Response Time (ms)",
607
+ min_value=0,
608
+ max_value=int(max_time),
609
+ value=int(max_time),
610
+ step=100
611
+ )
612
+
613
+ # Model/Provider filter
614
+ if not data['evaluations'].empty and 'llm_provider' in data['evaluations'].columns:
615
+ providers = data['evaluations']['llm_provider'].unique().tolist()
616
+ filters['providers'] = st.sidebar.multiselect(
617
+ "πŸ€– LLM Providers",
618
+ options=providers,
619
+ default=providers
620
+ )
621
+
622
+ # Auto-refresh option
623
+ filters['auto_refresh'] = st.sidebar.checkbox(
624
+ "πŸ”„ Auto-refresh (30s)",
625
+ value=False,
626
+ help="Automatically refresh data every 30 seconds"
627
+ )
628
+
629
+ if filters.get('auto_refresh', False):
630
+ st.sidebar.success("πŸ”„ Auto-refresh enabled")
631
+ # Add auto-refresh logic here if needed
632
+
633
  return filters
634
 
635
  def show_executive_summary(self, data: Dict[str, pd.DataFrame]):
 
666
  st.metric("Unique Sessions", f"{unique_sessions:,}")
667
 
668
  # Performance trends
669
+ col1, col2 = st.columns([3, 1])
670
+
671
+ with col1:
672
+ st.subheader("πŸ“Š Performance Trends")
673
+
674
+ with col2:
675
+ trend_period = st.selectbox(
676
+ "πŸ“… Period",
677
+ options=["7 days", "30 days", "All time"],
678
+ index=1,
679
+ key="trend_period"
680
+ )
681
+
682
+ # Filter data based on selected period
683
+ if trend_period == "7 days":
684
+ cutoff_date = datetime.now() - timedelta(days=7)
685
+ trend_df = df[df['timestamp'] >= cutoff_date]
686
+ elif trend_period == "30 days":
687
+ cutoff_date = datetime.now() - timedelta(days=30)
688
+ trend_df = df[df['timestamp'] >= cutoff_date]
689
+ else:
690
+ trend_df = df
691
 
692
  # Daily performance trend
693
+ df_daily = trend_df.groupby(trend_df['timestamp'].dt.date).agg({
694
  'overall_score': 'mean',
695
  'execution_time_ms': 'mean',
696
  'guardrails_passed': lambda x: (x.sum() / len(x)) * 100
 
1063
  mime="text/csv"
1064
  )
1065
 
1066
+ def show_advanced_analytics(self, data: Dict[str, pd.DataFrame]):
1067
+ """Show advanced analytics and insights"""
1068
+ st.header("πŸ”¬ Advanced Analytics & AI Insights")
1069
+
1070
+ if data['evaluations'].empty:
1071
+ st.warning("No evaluation data available")
1072
+ return
1073
+
1074
+ df_eval = data['evaluations']
1075
+ df_analysis = data.get('response_analysis', pd.DataFrame())
1076
+
1077
+ # Performance trends and predictions
1078
+ st.subheader("πŸ“Š Performance Trends & Predictions")
1079
+
1080
+ col1, col2 = st.columns(2)
1081
+
1082
+ with col1:
1083
+ st.write("**πŸ“ˆ Score Trends Over Time**")
1084
+ # Daily performance trend with moving average
1085
+ df_daily = df_eval.groupby(df_eval['timestamp'].dt.date).agg({
1086
+ 'overall_score': ['mean', 'count'],
1087
+ 'execution_time_ms': 'mean'
1088
+ }).reset_index()
1089
+
1090
+ df_daily.columns = ['date', 'avg_score', 'count', 'avg_time']
1091
+
1092
+ # Calculate moving average
1093
+ df_daily['score_ma'] = df_daily['avg_score'].rolling(window=7, min_periods=1).mean()
1094
+
1095
+ fig = go.Figure()
1096
+ fig.add_trace(go.Scatter(
1097
+ x=df_daily['date'],
1098
+ y=df_daily['avg_score'],
1099
+ mode='lines+markers',
1100
+ name='Daily Score',
1101
+ line=dict(color='lightblue', width=1),
1102
+ opacity=0.7
1103
+ ))
1104
+ fig.add_trace(go.Scatter(
1105
+ x=df_daily['date'],
1106
+ y=df_daily['score_ma'],
1107
+ mode='lines',
1108
+ name='7-Day Moving Average',
1109
+ line=dict(color='red', width=3)
1110
+ ))
1111
+ fig.update_layout(
1112
+ title="Score Trends with Moving Average",
1113
+ xaxis_title="Date",
1114
+ yaxis_title="Score",
1115
+ height=400
1116
+ )
1117
+ st.plotly_chart(fig, use_container_width=True)
1118
+
1119
+ with col2:
1120
+ st.write("**⚑ Performance Correlation Matrix**")
1121
+ # Correlation analysis
1122
+ score_cols = ['overall_score', 'relevance_score', 'accuracy_score',
1123
+ 'completeness_score', 'coherence_score', 'execution_time_ms']
1124
+ available_cols = [col for col in score_cols if col in df_eval.columns]
1125
+
1126
+ if len(available_cols) > 2:
1127
+ corr_matrix = df_eval[available_cols].corr()
1128
+
1129
+ fig = px.imshow(
1130
+ corr_matrix,
1131
+ title="Performance Metrics Correlation",
1132
+ color_continuous_scale='RdBu',
1133
+ aspect="auto"
1134
+ )
1135
+ fig.update_layout(height=400)
1136
+ st.plotly_chart(fig, use_container_width=True)
1137
+ else:
1138
+ st.info("Need more metrics for correlation analysis")
1139
+
1140
+ # Agent comparison and benchmarking
1141
+ st.subheader("πŸ† Agent Benchmarking & Competition")
1142
+
1143
+ col1, col2, col3 = st.columns(3)
1144
+
1145
+ with col1:
1146
+ st.write("**πŸ₯‡ Agent Leaderboard**")
1147
+ leaderboard = df_eval.groupby('agent_name').agg({
1148
+ 'overall_score': ['mean', 'std', 'count'],
1149
+ 'execution_time_ms': 'mean'
1150
+ }).round(2)
1151
+
1152
+ leaderboard.columns = ['Avg Score', 'Score StdDev', 'Total Evals', 'Avg Time (ms)']
1153
+ leaderboard['Efficiency'] = (leaderboard['Avg Score'] / (leaderboard['Avg Time (ms)'] / 1000)).round(2)
1154
+ leaderboard = leaderboard.sort_values('Avg Score', ascending=False)
1155
+
1156
+ # Add rank and medals
1157
+ leaderboard['Rank'] = range(1, len(leaderboard) + 1)
1158
+ medals = ['πŸ₯‡', 'πŸ₯ˆ', 'πŸ₯‰'] + ['πŸ…'] * (len(leaderboard) - 3)
1159
+ leaderboard['Medal'] = medals[:len(leaderboard)]
1160
+
1161
+ st.dataframe(leaderboard[['Medal', 'Rank', 'Avg Score', 'Efficiency', 'Total Evals']], use_container_width=True)
1162
+
1163
+ with col2:
1164
+ st.write("**πŸ“Š Performance Distribution**")
1165
+ fig = px.box(
1166
+ df_eval,
1167
+ x='agent_name',
1168
+ y='overall_score',
1169
+ title="Score Distribution by Agent",
1170
+ color='agent_name'
1171
+ )
1172
+ fig.update_layout(height=300, showlegend=False)
1173
+ st.plotly_chart(fig, use_container_width=True)
1174
+
1175
+ with col3:
1176
+ st.write("**⚑ Speed vs Quality**")
1177
+ agent_perf = df_eval.groupby('agent_name').agg({
1178
+ 'overall_score': 'mean',
1179
+ 'execution_time_ms': 'mean'
1180
+ }).reset_index()
1181
+
1182
+ fig = px.scatter(
1183
+ agent_perf,
1184
+ x='execution_time_ms',
1185
+ y='overall_score',
1186
+ size='overall_score',
1187
+ color='agent_name',
1188
+ title="Speed vs Quality Trade-off",
1189
+ labels={'execution_time_ms': 'Response Time (ms)', 'overall_score': 'Quality Score'}
1190
+ )
1191
+ fig.update_layout(height=300)
1192
+ st.plotly_chart(fig, use_container_width=True)
1193
+
1194
+ # AI-powered insights and recommendations
1195
+ st.subheader("πŸ€– AI-Powered Insights & Recommendations")
1196
+
1197
+ # Generate insights
1198
+ insights = self.generate_ai_insights(df_eval, df_analysis)
1199
+
1200
+ col1, col2 = st.columns(2)
1201
+
1202
+ with col1:
1203
+ st.write("**πŸ’‘ Key Insights**")
1204
+ for insight in insights['insights']:
1205
+ st.info(f"πŸ” {insight}")
1206
+
1207
+ with col2:
1208
+ st.write("**πŸš€ Recommendations**")
1209
+ for rec in insights['recommendations']:
1210
+ st.success(f"πŸ’‘ {rec}")
1211
+
1212
+ # Performance anomaly detection
1213
+ st.subheader("πŸ” Anomaly Detection")
1214
+
1215
+ anomalies = self.detect_anomalies(df_eval)
1216
+ if anomalies:
1217
+ st.warning(f"⚠️ Detected {len(anomalies)} potential anomalies:")
1218
+ for anomaly in anomalies:
1219
+ st.write(f"β€’ {anomaly}")
1220
+ else:
1221
+ st.success("βœ… No performance anomalies detected")
1222
+
1223
+ # Real-time monitoring simulation
1224
+ st.subheader("πŸ“‘ Real-time Monitoring Simulation")
1225
+
1226
+ if st.button("πŸ”„ Simulate Real-time Update"):
1227
+ # Simulate new data
1228
+ latest_data = self.simulate_realtime_data()
1229
+
1230
+ col1, col2, col3 = st.columns(3)
1231
+ with col1:
1232
+ st.metric("Latest Score", f"{latest_data['score']:.2f}", f"{latest_data['score_delta']:+.2f}")
1233
+ with col2:
1234
+ st.metric("Response Time", f"{latest_data['time']:.0f}ms", f"{latest_data['time_delta']:+.0f}ms")
1235
+ with col3:
1236
+ st.metric("Safety Status", "βœ… Passed" if latest_data['safe'] else "❌ Failed")
1237
+
1238
+ st.success("πŸ”„ Dashboard updated with latest data!")
1239
+
1240
def generate_ai_insights(self, df_eval, df_analysis):
    """Generate AI-powered insights and recommendations from evaluation data.

    Args:
        df_eval: Evaluations DataFrame; uses columns 'agent_name',
            'overall_score', 'timestamp' (datetime), 'execution_time_ms'
            and 'guardrails_passed'.
        df_analysis: Response-analysis DataFrame; 'readability_score'
            is used when that column is present.

    Returns:
        dict with two lists of strings: 'insights' and 'recommendations'.
    """
    insights = []
    recommendations = []

    # Nothing to analyze - avoid idxmax()/mean() errors on an empty frame.
    if df_eval.empty:
        return {'insights': insights, 'recommendations': recommendations}

    # Performance insights (single groupby instead of two identical ones)
    agent_means = df_eval.groupby('agent_name')['overall_score'].mean()
    best_agent = agent_means.idxmax()
    worst_agent = agent_means.idxmin()

    avg_score = df_eval['overall_score'].mean()
    score_trend = df_eval.groupby(df_eval['timestamp'].dt.date)['overall_score'].mean()

    # Require at least 6 daily points so the first- and last-3-day windows
    # do not overlap (overlapping windows previously produced bogus trends).
    if len(score_trend) >= 6:
        recent_trend = score_trend.iloc[-3:].mean() - score_trend.iloc[:3].mean()
        if recent_trend > 0.5:
            insights.append(f"Performance is improving! Recent scores are {recent_trend:.1f} points higher than earlier.")
        elif recent_trend < -0.5:
            insights.append(f"Performance decline detected. Recent scores are {abs(recent_trend):.1f} points lower.")

    # Agent insights
    insights.append(f"{best_agent} is the top performer with highest average scores.")
    insights.append(f"Overall system performance: {avg_score:.1f}/10 - {'Excellent' if avg_score > 8.5 else 'Good' if avg_score > 7.5 else 'Needs Improvement'}")

    # Response time insights
    avg_time = df_eval['execution_time_ms'].mean()
    if avg_time > 2000:
        insights.append(f"Response times are high (avg: {avg_time:.0f}ms). Consider optimization.")

    # Safety insights
    safety_rate = (df_eval['guardrails_passed'].sum() / len(df_eval)) * 100
    if safety_rate < 95:
        insights.append(f"Safety pass rate is {safety_rate:.1f}% - below recommended 95% threshold.")

    # Recommendations
    if worst_agent != best_agent:
        recommendations.append(f"Consider retraining {worst_agent} using patterns from {best_agent}")

    if avg_time > 1500:
        recommendations.append("Implement caching or optimize model inference to reduce response times")

    recommendations.append("Schedule regular performance reviews every 2 weeks")
    recommendations.append("Set up automated alerts for scores below 7.0 or response times above 3 seconds")

    # Guard the column as well as the frame (mirrors the
    # "'hallucination_score' in ... .columns" checks used elsewhere in file).
    if not df_analysis.empty and 'readability_score' in df_analysis.columns:
        avg_readability = df_analysis['readability_score'].mean()
        if avg_readability < 6:
            recommendations.append("Improve response readability - consider simpler language and shorter sentences")

    return {'insights': insights, 'recommendations': recommendations}
1289
+
1290
def detect_anomalies(self, df_eval):
    """Flag unusual evaluations: outlier scores, slow responses, safety failures.

    Returns a list of human-readable anomaly descriptions (empty if clean).
    """
    findings = []

    # Outlier scores via Tukey's IQR fences.
    q1, q3 = df_eval['overall_score'].quantile([0.25, 0.75])
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    outliers = df_eval[(df_eval['overall_score'] < low_fence) | (df_eval['overall_score'] > high_fence)]
    if len(outliers) > 0:
        findings.append(f"{len(outliers)} evaluations with unusual scores detected")

    # Slow responses: anything beyond the upper IQR fence.
    t_q1, t_q3 = df_eval['execution_time_ms'].quantile([0.25, 0.75])
    slow_fence = t_q3 + 1.5 * (t_q3 - t_q1)

    slow = df_eval[df_eval['execution_time_ms'] > slow_fence]
    if len(slow) > 0:
        findings.append(f"{len(slow)} evaluations with unusually long response times")

    # Safety: more than 10% guardrail failures is worth flagging.
    failures = df_eval[df_eval['guardrails_passed'] == False]
    if len(failures) > len(df_eval) * 0.1:
        findings.append(f"High safety failure rate: {len(failures)} failures out of {len(df_eval)} evaluations")

    return findings
1321
+
1322
def simulate_realtime_data(self):
    """Produce one fake real-time metrics sample for the demo dashboard."""
    import random

    # Draws happen in a fixed order so the sequence is reproducible
    # under a seeded RNG.
    score = random.uniform(7.0, 9.5)
    score_delta = random.uniform(-0.5, 0.5)
    latency = random.uniform(500, 2000)
    latency_delta = random.uniform(-200, 200)
    passed = random.choice([True, True, True, False])  # 3-in-4 chance of "safe"

    return {
        'score': score,
        'score_delta': score_delta,
        'time': latency,
        'time_delta': latency_delta,
        'safe': passed,
    }
1333
+
1334
def show_workflow_visualization(self, data: Dict[str, pd.DataFrame]):
    """Show workflow visualization with queries, scores, latency, hallucination, and token consumption.

    Renders (for a user-selected session / agent): a Mermaid flow diagram,
    aggregate session metrics, one expander per workflow step with score
    breakdown and resource usage, comparative charts, a session summary,
    and a CSV export button.

    Args:
        data: dict of DataFrames; only data['evaluations'] is used here.
    """
    # Hoisted out of the per-row loop: `go` is also needed by the
    # comparative-analysis charts after the loop, so importing it inside
    # the loop only worked because the loop always ran at least once.
    import plotly.graph_objects as go

    st.header("πŸ”„ Workflow Visualization")

    df_eval = data['evaluations']
    if df_eval.empty:
        st.warning("No evaluation data available for workflow visualization.")
        return

    # Create workflow selection
    col1, col2 = st.columns([1, 1])

    with col1:
        sessions = df_eval['session_id'].unique()
        selected_session = st.selectbox("Select Session", sessions, key="workflow_session")

    with col2:
        agents = df_eval['agent_name'].unique()
        selected_agent = st.selectbox("Select Agent (Optional)", ['All'] + list(agents), key="workflow_agent")

    # Filter data
    session_data = df_eval[df_eval['session_id'] == selected_session]
    if selected_agent != 'All':
        session_data = session_data[session_data['agent_name'] == selected_agent]

    if session_data.empty:
        st.warning("No data found for selected filters.")
        return

    # Create workflow diagram
    st.subheader("πŸ“Š Workflow Flow Diagram")

    # Generate Mermaid diagram
    mermaid_diagram = self.create_workflow_diagram(session_data)

    # Display the diagram using markdown (since create_diagram might not be available)
    st.markdown("```mermaid\n" + mermaid_diagram + "\n```")

    # Workflow metrics overview
    st.subheader("πŸ“ˆ Session Metrics Overview")

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        avg_score = session_data['overall_score'].mean()
        # Both branches of the former ternary produced the same string;
        # the delta is simply the signed distance from the 7.5 baseline.
        st.metric("Avg Overall Score", f"{avg_score:.2f}/10",
                 delta=f"{avg_score - 7.5:.2f}")

    with col2:
        avg_latency = session_data['execution_time_ms'].mean()
        st.metric("Avg Response Time", f"{avg_latency:.0f}ms",
                 delta=f"{avg_latency - 3000:.0f}ms" if avg_latency < 3000 else f"+{avg_latency - 3000:.0f}ms")

    with col3:
        # Column is optional in older data; default to 0 when absent.
        avg_hallucination = session_data['hallucination_score'].mean() if 'hallucination_score' in session_data.columns else 0
        st.metric("Avg Hallucination", f"{avg_hallucination:.2f}/10",
                 delta=f"{5.0 - avg_hallucination:.2f}" if avg_hallucination < 5.0 else f"-{avg_hallucination - 5.0:.2f}")

    with col4:
        total_tokens = session_data['total_tokens'].sum() if 'total_tokens' in session_data.columns else 0
        total_cost = session_data['cost_usd'].sum() if 'cost_usd' in session_data.columns else 0
        st.metric("Total Cost", f"${total_cost:.4f}", f"{total_tokens:,} tokens")

    # Detailed workflow steps
    st.subheader("πŸ” Detailed Workflow Steps")

    # enumerate(..., start=1) gives positional step numbers; the previous
    # `idx + 1` used the DataFrame index, which is wrong after filtering.
    for step_no, (idx, row) in enumerate(session_data.iterrows(), start=1):
        with st.expander(f"Step {step_no}: {row['agent_name']} - Score: {row['overall_score']:.2f}/10"):

            # Query and Response
            col1, col2 = st.columns([1, 1])

            with col1:
                st.markdown("**Query:**")
                st.write(row['query'])

                # Performance metrics
                st.markdown("**Performance Metrics:**")
                metrics_data = {
                    'Overall Score': row['overall_score'],
                    'Relevance': row['relevance_score'],
                    'Accuracy': row['accuracy_score'],
                    'Completeness': row['completeness_score'],
                    'Coherence': row['coherence_score'],
                    'Hallucination': row.get('hallucination_score', 0),
                    'Safety': row['safety_score']
                }

                # Create a bar chart for scores
                fig = go.Figure(data=[
                    go.Bar(x=list(metrics_data.keys()), y=list(metrics_data.values()),
                          marker_color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2'])
                ])
                fig.update_layout(
                    title="Score Breakdown",
                    yaxis_title="Score (0-10)",
                    height=300,
                    showlegend=False
                )
                st.plotly_chart(fig, use_container_width=True)

            with col2:
                st.markdown("**Response:**")
                st.write(row['response'])

                # Token and cost information
                st.markdown("**Resource Consumption:**")

                token_col1, token_col2 = st.columns(2)
                with token_col1:
                    input_tokens = row.get('input_tokens', 0)
                    output_tokens = row.get('output_tokens', 0)
                    st.metric("Input Tokens", f"{input_tokens:,}")
                    st.metric("Output Tokens", f"{output_tokens:,}")

                with token_col2:
                    total_tokens = row.get('total_tokens', 0)
                    cost = row.get('cost_usd', 0)
                    st.metric("Total Tokens", f"{total_tokens:,}")
                    st.metric("Cost", f"${cost:.4f}")

                # Execution details
                st.markdown("**Execution Details:**")
                exec_time = row['execution_time_ms']
                llm_provider = row.get('llm_provider', 'Unknown')
                model_name = row.get('model_name', 'Unknown')

                st.write(f"⏱️ **Execution Time:** {exec_time:.0f}ms")
                st.write(f"πŸ€– **LLM Provider:** {llm_provider}")
                st.write(f"🧠 **Model:** {model_name}")
                st.write(f"πŸ›‘οΈ **Safety Passed:** {'βœ…' if row['guardrails_passed'] else '❌'}")

    # Comparative analysis
    st.subheader("πŸ“Š Comparative Analysis")

    # Create comparison charts
    col1, col2 = st.columns(2)

    with col1:
        # Score comparison radar chart: one trace per workflow step.
        fig = go.Figure()

        score_columns = ['overall_score', 'relevance_score', 'accuracy_score', 'completeness_score', 'coherence_score']
        if 'hallucination_score' in session_data.columns:
            score_columns.append('hallucination_score')

        for i, (idx, row) in enumerate(session_data.iterrows()):
            fig.add_trace(go.Scatterpolar(
                r=[row[col] for col in score_columns],
                theta=[col.replace('_score', '').title() for col in score_columns],
                fill='toself',
                name=f"{row['agent_name']} - Step {i+1}"
            ))

        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 10]
                )),
            showlegend=True,
            title="Score Comparison Radar Chart"
        )
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        # Token consumption over steps (bars) vs response time (line, 2nd axis).
        if 'total_tokens' in session_data.columns:
            fig = go.Figure()

            steps = [f"Step {i+1}" for i in range(len(session_data))]
            fig.add_trace(go.Bar(
                x=steps,
                y=session_data['total_tokens'],
                name='Total Tokens',
                marker_color='lightblue'
            ))

            fig.add_trace(go.Scatter(
                x=steps,
                y=session_data['execution_time_ms'],
                yaxis='y2',
                name='Response Time (ms)',
                line=dict(color='red', width=2),
                mode='lines+markers'
            ))

            fig.update_layout(
                title="Token Consumption vs Response Time",
                xaxis_title="Workflow Steps",
                yaxis_title="Total Tokens",
                yaxis2=dict(
                    title="Response Time (ms)",
                    overlaying='y',
                    side='right'
                ),
                height=400
            )
            st.plotly_chart(fig, use_container_width=True)

    # Session summary
    st.subheader("πŸ“‹ Session Summary")

    summary_col1, summary_col2, summary_col3 = st.columns(3)

    with summary_col1:
        st.markdown("**Quality Metrics:**")
        st.write(f"β€’ Average Overall Score: {session_data['overall_score'].mean():.2f}/10")
        st.write(f"β€’ Best Performing Step: {session_data.loc[session_data['overall_score'].idxmax(), 'agent_name']}")
        st.write(f"β€’ Consistency (Std Dev): {session_data['overall_score'].std():.2f}")

    with summary_col2:
        st.markdown("**Performance Metrics:**")
        st.write(f"β€’ Total Execution Time: {session_data['execution_time_ms'].sum():.0f}ms")
        st.write(f"β€’ Average Response Time: {session_data['execution_time_ms'].mean():.0f}ms")
        st.write(f"β€’ Fastest Step: {session_data['execution_time_ms'].min():.0f}ms")

    with summary_col3:
        st.markdown("**Resource Usage:**")
        if 'total_tokens' in session_data.columns:
            st.write(f"β€’ Total Tokens Used: {session_data['total_tokens'].sum():,}")
            st.write(f"β€’ Total Cost: ${session_data['cost_usd'].sum():.4f}")
            st.write(f"β€’ Avg Cost per Query: ${session_data['cost_usd'].mean():.4f}")
        else:
            st.write("β€’ Token data not available")

    # Export functionality
    st.subheader("πŸ“€ Export Workflow Data")

    if st.button("Export Session Data to CSV", key="export_workflow"):
        csv_data = session_data.to_csv(index=False)
        st.download_button(
            label="Download CSV",
            data=csv_data,
            file_name=f"workflow_session_{selected_session}.csv",
            mime="text/csv"
        )
1573
def create_workflow_diagram(self, session_data):
    """Create a Mermaid 'graph TD' flowchart for the session's workflow steps.

    Each step becomes a node labelled with agent, score and execution time,
    colour-coded by score band (green >= 8.5, orange >= 7.0, pink below).

    Args:
        session_data: DataFrame with 'agent_name', 'overall_score' and
            'execution_time_ms' columns, one row per workflow step.

    Returns:
        The Mermaid diagram source as a string.
    """
    diagram = "graph TD\n"
    diagram += "    Start([Session Start])\n"

    # Degenerate case: no steps. Previously this emitted a dangling
    # "Step0_result --> End" edge referencing a node that never existed.
    if len(session_data) == 0:
        diagram += "    Start --> End([Session End])\n"
        return diagram

    for i, (idx, row) in enumerate(session_data.iterrows()):
        step_id = f"Step{i+1}"
        agent_name = row['agent_name'].replace(' ', '_')
        score = row['overall_score']
        exec_time = row['execution_time_ms']

        # Color based on score
        if score >= 8.5:
            color = "fill:#90EE90"  # Light green
        elif score >= 7.0:
            color = "fill:#FFE4B5"  # Light orange
        else:
            color = "fill:#FFB6C1"  # Light pink

        diagram += f"    {step_id}[\"{agent_name}<br/>Score: {score:.1f}/10<br/>Time: {exec_time:.0f}ms\"]\n"
        diagram += f"    {step_id} --> {step_id}_result{{Result}}\n"

        if i == 0:
            diagram += f"    Start --> {step_id}\n"
        else:
            prev_step = f"Step{i}"
            diagram += f"    {prev_step}_result --> {step_id}\n"

        # Add styling: base class, then the per-score colour. The colour
        # was previously computed but never written into the diagram.
        diagram += f"    class {step_id} stepClass;\n"
        diagram += f"    style {step_id} {color}\n"

    # Add end node
    last_step = f"Step{len(session_data)}"
    diagram += f"    {last_step}_result --> End([Session End])\n"

    # Add class definitions
    diagram += "    classDef stepClass fill:#e1f5fe,stroke:#01579b,stroke-width:2px;\n"

    return diagram
1612
+
1613
  def run(self):
1614
  """Run the dashboard"""
1615
  st.title("πŸ€– Multi-Agent System Dashboard - Demo")
 
1644
  if filters.get('safety_only', False):
1645
  df = df[df['guardrails_passed'] == True]
1646
 
1647
+ # Performance tier filter
1648
+ if filters.get('performance_tier') != "All":
1649
+ if filters['performance_tier'] == "Excellent (8.5+)":
1650
+ df = df[df['overall_score'] >= 8.5]
1651
+ elif filters['performance_tier'] == "Good (7.0-8.5)":
1652
+ df = df[(df['overall_score'] >= 7.0) & (df['overall_score'] < 8.5)]
1653
+ elif filters['performance_tier'] == "Needs Improvement (<7.0)":
1654
+ df = df[df['overall_score'] < 7.0]
1655
+
1656
+ # Response time filter
1657
+ if 'max_response_time' in filters:
1658
+ df = df[df['execution_time_ms'] <= filters['max_response_time']]
1659
+
1660
+ # Provider filter
1661
+ if 'providers' in filters and filters['providers']:
1662
+ df = df[df['llm_provider'].isin(filters['providers'])]
1663
+
1664
  filtered_data['evaluations'] = df
1665
 
1666
  # Create tabs
1667
+ tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
1668
  "πŸ“ˆ Executive Summary",
1669
  "πŸ€– Agent Performance",
1670
  "πŸ›‘οΈ Safety Analysis",
1671
+ "πŸ“ Response Analysis",
1672
+ "πŸ”¬ Advanced Analytics",
1673
+ "πŸ”„ Workflow Visualization"
1674
  ])
1675
 
1676
  with tab1:
 
1685
  with tab4:
1686
  self.show_response_analysis(filtered_data)
1687
 
1688
+ with tab5:
1689
+ self.show_advanced_analytics(filtered_data)
1690
+
1691
+ with tab6:
1692
+ self.show_workflow_visualization(filtered_data)
1693
+
1694
+ # Quick actions sidebar
1695
+ st.sidebar.markdown("---")
1696
+ st.sidebar.markdown("### ⚑ Quick Actions")
1697
+
1698
+ if st.sidebar.button("πŸ“Š Generate Report"):
1699
+ st.sidebar.success("πŸ“„ Report generated!")
1700
+ # Could generate PDF report here
1701
+
1702
+ if st.sidebar.button("πŸ”„ Refresh Data"):
1703
+ st.sidebar.success("πŸ”„ Data refreshed!")
1704
+ st.experimental_rerun()
1705
+
1706
+ if st.sidebar.button("πŸ“§ Send Alert"):
1707
+ st.sidebar.success("πŸ“§ Alert sent to team!")
1708
+
1709
+ # Data summary in sidebar
1710
+ if not filtered_data['evaluations'].empty:
1711
+ st.sidebar.markdown("### πŸ“ˆ Current Session")
1712
+ st.sidebar.metric("Filtered Records", len(filtered_data['evaluations']))
1713
+ st.sidebar.metric("Avg Score", f"{filtered_data['evaluations']['overall_score'].mean():.2f}")
1714
+ st.sidebar.metric("Success Rate", f"{(filtered_data['evaluations']['guardrails_passed'].sum() / len(filtered_data['evaluations']) * 100):.1f}%")
1715
+
1716
  # Footer
1717
  st.markdown("---")
1718
+ col1, col2, col3 = st.columns(3)
1719
+
1720
+ with col1:
1721
+ st.markdown("πŸš€ **Multi-Agent System Dashboard**")
1722
+
1723
+ with col2:
1724
+ st.markdown("Built with Streamlit & Plotly")
1725
+
1726
+ with col3:
1727
+ if st.button("ℹ️ About"):
1728
+ st.info("""
1729
+ **Multi-Agent System Dashboard v2.0**
1730
+
1731
+ Features:
1732
+ - πŸ“Š Real-time monitoring
1733
+ - πŸ€– AI-powered insights
1734
+ - πŸ” Advanced analytics
1735
+ - πŸ“ Response tracing
1736
+ - πŸ›‘οΈ Safety monitoring
1737
+ - πŸ“ˆ Performance benchmarking
1738
+
1739
+ Built for production-grade multi-agent systems.
1740
+ """)
1741
 
1742
  if __name__ == "__main__":
1743
  dashboard = HuggingFaceDashboard()