saptyfun commited on
Commit
f4c4c5f
Β·
verified Β·
1 Parent(s): d053b0b

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +740 -30
src/app.py CHANGED
@@ -81,9 +81,14 @@ class HuggingFaceDashboard:
81
  accuracy_score REAL,
82
  completeness_score REAL,
83
  coherence_score REAL,
 
84
  guardrails_passed BOOLEAN,
85
  safety_score REAL,
86
  execution_time_ms REAL,
 
 
 
 
87
  error_occurred BOOLEAN DEFAULT FALSE,
88
  llm_provider TEXT,
89
  model_name TEXT,
@@ -259,19 +264,19 @@ class HuggingFaceDashboard:
259
  # Generate realistic response
260
  response_templates = {
261
  "Diet Agent": [
262
- f"Based on your query about {query[:30]}..., I recommend focusing on balanced nutrition with emphasis on whole foods, proper portion sizes, and regular meal timing.",
263
- f"For your question regarding {query[:30]}..., here's a comprehensive approach that considers your nutritional needs and health goals.",
264
- f"Addressing your concern about {query[:30]}..., let me provide evidence-based dietary guidance tailored to your situation."
265
  ],
266
  "Support Agent": [
267
- f"I understand you're dealing with {query[:30]}... This is a common challenge, and I'm here to help you work through it step by step.",
268
- f"Thank you for sharing your concern about {query[:30]}... Let's explore some practical strategies that can make a real difference.",
269
- f"Your question about {query[:30]}... resonates with many people. Here are some effective approaches you can try."
270
  ],
271
  "Queries Agent": [
272
- f"Great question about {query[:30]}... This is a complex topic that involves several key concepts and recent developments.",
273
- f"To answer your query about {query[:30]}..., let me break this down into the fundamental principles and current applications.",
274
- f"Your question regarding {query[:30]}... touches on important technological and societal implications. Here's a comprehensive overview."
275
  ]
276
  }
277
 
@@ -281,26 +286,30 @@ class HuggingFaceDashboard:
281
  # Add specific details based on agent type
282
  if agent == "Diet Agent":
283
  details = [
284
- "Key recommendations: 1) Focus on whole foods, 2) Control portions, 3) Stay hydrated",
285
- "Nutritional guidelines: Aim for 50% vegetables, 25% lean protein, 25% complex carbs",
286
- "Meal timing: Consider eating every 3-4 hours to maintain stable blood sugar",
287
- "Sample foods: Quinoa, salmon, leafy greens, berries, nuts, and legumes"
 
288
  ]
289
  elif agent == "Support Agent":
290
  details = [
291
- "Action steps: 1) Identify triggers, 2) Develop coping strategies, 3) Practice regularly",
292
- "Techniques to try: Deep breathing, progressive muscle relaxation, mindfulness meditation",
293
- "Timeline: Start with 5-10 minutes daily, gradually increase as comfort grows",
294
- "Resources: Consider apps like Headspace, Calm, or consulting a professional"
 
295
  ]
296
  else: # Queries Agent
297
  details = [
298
- "Technical overview: This involves complex algorithms and data processing methods",
299
- "Current applications: Used in healthcare, finance, transportation, and entertainment",
300
- "Future implications: Expected to revolutionize how we work and interact with technology",
301
- "Key considerations: Privacy, security, ethical implications, and regulatory frameworks"
 
302
  ]
303
 
 
304
  response = f"{base_response}\n\n{random.choice(details)}"
305
 
306
  # Generate correlated scores (realistic relationships)
@@ -309,6 +318,23 @@ class HuggingFaceDashboard:
309
  completeness_score = max(0, min(10, base_score + random.uniform(-0.5, 0.3)))
310
  coherence_score = max(0, min(10, base_score + random.uniform(-0.2, 0.4)))
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  # Realistic safety scenarios
313
  safety_pass_rate = 0.95 # 95% pass rate
314
  if random.random() < 0.02: # 2% chance of safety issues
@@ -338,11 +364,16 @@ class HuggingFaceDashboard:
338
  accuracy_score, # accuracy_score
339
  completeness_score, # completeness_score
340
  coherence_score, # coherence_score
 
341
  guardrails_passed, # guardrails_passed
342
  safety_score, # safety_score
343
  execution_time, # execution_time_ms
 
 
 
 
344
  False, # error_occurred
345
- "azure", # llm_provider
346
  "gpt-4o", # model_name
347
  f"Comprehensive evaluation for {agent}: The response demonstrates good understanding of the query with appropriate depth and accuracy. Score breakdown reflects the quality across multiple dimensions.", # judge_reasoning
348
  guardrails_failures, # guardrails_failures
@@ -353,16 +384,17 @@ class HuggingFaceDashboard:
353
  INSERT INTO evaluation_logs (
354
  session_id, agent_name, query, response, overall_score,
355
  relevance_score, accuracy_score, completeness_score, coherence_score,
356
- guardrails_passed, safety_score, execution_time_ms, error_occurred,
 
357
  llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
358
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
359
  ''', eval_data)
360
 
361
  # Get the evaluation ID for response analysis
362
  evaluation_id = cursor.lastrowid
363
 
364
  # Insert detailed response analysis
365
- self.insert_response_analysis(cursor, evaluation_id, session_id, agent, response, timestamp)
366
 
367
  def insert_response_analysis(self, cursor, evaluation_id, session_id, agent_name, response_text, timestamp):
368
  """Insert detailed response analysis data"""
@@ -557,6 +589,47 @@ class HuggingFaceDashboard:
557
  value=False
558
  )
559
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
560
  return filters
561
 
562
  def show_executive_summary(self, data: Dict[str, pd.DataFrame]):
@@ -593,10 +666,31 @@ class HuggingFaceDashboard:
593
  st.metric("Unique Sessions", f"{unique_sessions:,}")
594
 
595
  # Performance trends
596
- st.subheader("πŸ“Š Performance Trends")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
 
598
  # Daily performance trend
599
- df_daily = df.groupby(df['timestamp'].dt.date).agg({
600
  'overall_score': 'mean',
601
  'execution_time_ms': 'mean',
602
  'guardrails_passed': lambda x: (x.sum() / len(x)) * 100
@@ -969,6 +1063,553 @@ class HuggingFaceDashboard:
969
  mime="text/csv"
970
  )
971
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
972
  def run(self):
973
  """Run the dashboard"""
974
  st.title("πŸ€– Multi-Agent System Dashboard - Demo")
@@ -1003,14 +1644,33 @@ class HuggingFaceDashboard:
1003
  if filters.get('safety_only', False):
1004
  df = df[df['guardrails_passed'] == True]
1005
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1006
  filtered_data['evaluations'] = df
1007
 
1008
  # Create tabs
1009
- tab1, tab2, tab3, tab4 = st.tabs([
1010
  "πŸ“ˆ Executive Summary",
1011
  "πŸ€– Agent Performance",
1012
  "πŸ›‘οΈ Safety Analysis",
1013
- "πŸ“ Response Analysis"
 
 
1014
  ])
1015
 
1016
  with tab1:
@@ -1025,9 +1685,59 @@ class HuggingFaceDashboard:
1025
  with tab4:
1026
  self.show_response_analysis(filtered_data)
1027
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1028
  # Footer
1029
  st.markdown("---")
1030
- st.markdown("πŸš€ **Multi-Agent System Dashboard** | Built with Streamlit & Plotly | Demo hosted on Hugging Face Spaces")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1031
 
1032
  if __name__ == "__main__":
1033
  dashboard = HuggingFaceDashboard()
 
81
  accuracy_score REAL,
82
  completeness_score REAL,
83
  coherence_score REAL,
84
+ hallucination_score REAL,
85
  guardrails_passed BOOLEAN,
86
  safety_score REAL,
87
  execution_time_ms REAL,
88
+ input_tokens INTEGER,
89
+ output_tokens INTEGER,
90
+ total_tokens INTEGER,
91
+ cost_usd REAL,
92
  error_occurred BOOLEAN DEFAULT FALSE,
93
  llm_provider TEXT,
94
  model_name TEXT,
 
264
  # Generate realistic response
265
  response_templates = {
266
  "Diet Agent": [
267
+ f"Thank you for your question about nutrition and dietary guidance. I'd be happy to help you develop a healthier relationship with food and create sustainable eating habits.",
268
+ f"I understand you're looking for dietary advice, and I'm here to provide evidence-based nutritional guidance tailored to your specific needs and goals.",
269
+ f"Great question about nutrition! Let me share some comprehensive dietary recommendations that can help you achieve better health outcomes."
270
  ],
271
  "Support Agent": [
272
+ f"I appreciate you reaching out for support. It takes courage to ask for help, and I'm here to provide you with practical strategies and emotional guidance.",
273
+ f"Thank you for sharing your concerns with me. I understand this can be challenging, and I want to help you work through this step by step with compassion and understanding.",
274
+ f"I'm glad you've come to me for support. Your feelings are valid, and together we can explore effective coping strategies and build resilience."
275
  ],
276
  "Queries Agent": [
277
+ f"Excellent question! This is a fascinating topic that involves cutting-edge technology and has significant implications for our future. Let me provide you with a comprehensive overview.",
278
+ f"Thank you for this thought-provoking question. This subject encompasses multiple disciplines and recent innovations. I'll break this down into key concepts and practical applications.",
279
+ f"Great inquiry! This is an evolving field with exciting developments. Let me explain the fundamental principles and explore the current state of research and implementation."
280
  ]
281
  }
282
 
 
286
  # Add specific details based on agent type
287
  if agent == "Diet Agent":
288
  details = [
289
+ "**Key Nutritional Recommendations:**\n\n1. **Whole Foods Focus**: Prioritize unprocessed foods like fresh fruits, vegetables, whole grains, lean proteins, and healthy fats. These provide essential nutrients and fiber while avoiding added sugars and preservatives.\n\n2. **Portion Control**: Use the plate method - fill half your plate with non-starchy vegetables, one quarter with lean protein, and one quarter with complex carbohydrates.\n\n3. **Hydration**: Aim for 8-10 glasses of water daily. Proper hydration supports metabolism, digestion, and overall health.\n\n4. **Meal Timing**: Eat regular meals every 3-4 hours to maintain stable blood sugar levels and prevent overeating.\n\n**Sample Daily Meal Plan:**\n- Breakfast: Greek yogurt with berries and nuts\n- Lunch: Quinoa salad with grilled chicken and vegetables\n- Dinner: Baked salmon with roasted sweet potatoes and broccoli\n- Snacks: Apple with almond butter, or handful of mixed nuts",
290
+
291
+ "**Evidence-Based Dietary Guidelines:**\n\n1. **Macronutrient Balance**: Aim for 45-65% carbohydrates (focus on complex carbs), 20-35% healthy fats, and 10-35% protein based on your activity level.\n\n2. **Micronutrient Density**: Choose foods rich in vitamins, minerals, and antioxidants. Include colorful fruits and vegetables to ensure variety.\n\n3. **Fiber Intake**: Target 25-35 grams daily through whole grains, legumes, fruits, and vegetables to support digestive health.\n\n4. **Healthy Fats**: Include omega-3 fatty acids from fish, walnuts, and flaxseeds, while limiting saturated and trans fats.\n\n**Practical Implementation Tips:**\n- Meal prep on weekends to ensure healthy options are available\n- Read nutrition labels to make informed choices\n- Practice mindful eating by eating slowly and paying attention to hunger cues\n- Keep a food diary to track patterns and identify areas for improvement",
292
+
293
+ "**Personalized Nutrition Approach:**\n\nEvery individual has unique nutritional needs based on age, gender, activity level, health conditions, and personal preferences. Here's how to customize your approach:\n\n1. **Assessment**: Consider your current health status, goals (weight management, energy levels, disease prevention), and any dietary restrictions.\n\n2. **Gradual Changes**: Implement changes slowly to ensure sustainability. Start with one or two modifications per week.\n\n3. **Professional Guidance**: Consider consulting with a registered dietitian for personalized meal planning, especially if you have specific health conditions.\n\n4. **Regular Monitoring**: Track your progress through energy levels, sleep quality, and how you feel overall, not just weight.\n\n**Common Nutritional Myths Debunked:**\n- Carbs aren't inherently bad - choose complex carbohydrates over simple sugars\n- Fat doesn't make you fat - healthy fats are essential for hormone production and nutrient absorption\n- Skipping meals doesn't help with weight loss and can lead to overeating later"
294
  ]
295
  elif agent == "Support Agent":
296
  details = [
297
+ "**Comprehensive Support Strategy:**\n\n**Immediate Coping Techniques:**\n1. **Deep Breathing**: Practice the 4-7-8 technique - inhale for 4 counts, hold for 7, exhale for 8. This activates your parasympathetic nervous system.\n\n2. **Grounding Exercises**: Use the 5-4-3-2-1 method - identify 5 things you can see, 4 you can touch, 3 you can hear, 2 you can smell, and 1 you can taste.\n\n3. **Progressive Muscle Relaxation**: Tense and release each muscle group from toes to head, holding tension for 5 seconds before releasing.\n\n**Long-term Strategies:**\n- Establish a consistent daily routine to provide structure and predictability\n- Practice mindfulness meditation for 10-15 minutes daily\n- Maintain a journal to process emotions and identify patterns\n- Build a support network of trusted friends, family, or support groups\n\n**Professional Resources:**\nConsider reaching out to mental health professionals if you're experiencing persistent difficulties. Many offer telehealth options for convenience.",
298
+
299
+ "**Building Emotional Resilience:**\n\n**Understanding Your Emotions:**\nEmotions are natural responses to life events. Learning to recognize, understand, and manage them is a skill that can be developed with practice.\n\n**Practical Steps:**\n1. **Emotion Identification**: Use an emotion wheel or journal to name specific feelings rather than general terms like 'bad' or 'stressed.'\n\n2. **Trigger Awareness**: Notice what situations, people, or thoughts tend to trigger difficult emotions.\n\n3. **Response vs. Reaction**: Create a pause between feeling and action. Ask yourself: 'What would be most helpful right now?'\n\n4. **Self-Compassion**: Treat yourself with the same kindness you'd offer a good friend facing similar challenges.\n\n**Daily Practices:**\n- Morning intention setting (5 minutes)\n- Midday check-in with your emotional state\n- Evening reflection on what went well and what you learned\n- Regular physical activity to support mental health\n\n**Crisis Resources:**\nIf you're experiencing thoughts of self-harm, please reach out immediately to a crisis hotline, emergency services, or trusted healthcare provider.",
300
+
301
+ "**Stress Management and Well-being:**\n\n**Understanding Stress:**\nStress is a normal part of life, but chronic stress can impact your physical and mental health. Learning effective management techniques is crucial for long-term well-being.\n\n**Evidence-Based Techniques:**\n1. **Cognitive Restructuring**: Challenge negative thought patterns by asking: 'Is this thought realistic? What evidence supports or contradicts it? What would I tell a friend in this situation?'\n\n2. **Time Management**: Use techniques like the Pomodoro method, prioritization matrices, and saying no to non-essential commitments.\n\n3. **Physical Self-Care**: Regular exercise, adequate sleep (7-9 hours), and proper nutrition form the foundation of stress resilience.\n\n4. **Social Connection**: Maintain relationships with supportive people. Even brief positive interactions can improve mood and reduce stress.\n\n**Creating Your Personal Toolkit:**\n- Identify 3-5 coping strategies that work best for you\n- Practice them regularly, not just during stressful times\n- Adjust and refine your approach based on what's most effective\n- Remember that seeking help is a sign of strength, not weakness"
302
  ]
303
  else: # Queries Agent
304
  details = [
305
+ "**Technical Deep Dive:**\n\n**Fundamental Concepts:**\nThis technology represents a convergence of multiple disciplines including computer science, mathematics, engineering, and domain-specific expertise. The underlying principles involve complex algorithms, data structures, and computational methods.\n\n**Current Implementation:**\n1. **Healthcare**: AI-powered diagnostic tools, personalized treatment plans, drug discovery acceleration, and robotic surgery assistance.\n\n2. **Finance**: Algorithmic trading, fraud detection, risk assessment, and automated customer service through chatbots.\n\n3. **Transportation**: Autonomous vehicles, traffic optimization, predictive maintenance, and route planning algorithms.\n\n4. **Entertainment**: Recommendation systems, content generation, virtual reality experiences, and interactive gaming.\n\n**Technical Architecture:**\n- Data processing pipelines that handle massive datasets in real-time\n- Machine learning models trained on diverse, high-quality datasets\n- Cloud infrastructure enabling scalable deployment and accessibility\n- APIs and interfaces that allow integration with existing systems\n\n**Performance Metrics:**\nSuccess is measured through accuracy rates, processing speed, user engagement, cost efficiency, and real-world impact on problem-solving.",
306
+
307
+ "**Industry Applications and Impact:**\n\n**Current Market Landscape:**\nThe technology sector is experiencing rapid transformation with significant investments in research and development. Major companies are competing to develop more efficient, ethical, and accessible solutions.\n\n**Real-World Applications:**\n1. **Smart Cities**: IoT sensors, traffic management, energy optimization, and public safety systems working together to improve urban living.\n\n2. **Environmental Monitoring**: Satellite imagery analysis, climate modeling, pollution tracking, and renewable energy optimization.\n\n3. **Education**: Personalized learning platforms, automated grading systems, virtual tutors, and accessibility tools for diverse learners.\n\n4. **Manufacturing**: Predictive maintenance, quality control, supply chain optimization, and human-robot collaboration.\n\n**Economic Impact:**\n- Job creation in new fields while transforming traditional roles\n- Increased productivity and efficiency across industries\n- New business models and revenue streams\n- Global competitiveness and innovation drivers\n\n**Challenges and Solutions:**\n- Addressing ethical concerns through responsible development practices\n- Ensuring data privacy and security through robust frameworks\n- Managing the digital divide through inclusive design and accessibility",
308
+
309
+ "**Future Implications and Trends:**\n\n**Emerging Developments:**\nThe field is evolving rapidly with breakthrough research in quantum computing, neuromorphic chips, and advanced algorithms that promise to solve previously intractable problems.\n\n**Next 5-10 Years:**\n1. **Integration**: Seamless integration across platforms and devices, creating more intuitive user experiences.\n\n2. **Personalization**: Highly customized solutions that adapt to individual preferences and needs in real-time.\n\n3. **Sustainability**: Green technology initiatives focusing on energy efficiency and environmental responsibility.\n\n4. **Accessibility**: Universal design principles ensuring technology benefits all users regardless of abilities or circumstances.\n\n**Societal Considerations:**\n- Regulatory frameworks evolving to balance innovation with consumer protection\n- Educational systems adapting to prepare workforce for technological changes\n- International cooperation on standards and ethical guidelines\n- Public discourse on the role of technology in society\n\n**Preparing for the Future:**\n- Continuous learning and skill development\n- Critical thinking about technology's role in daily life\n- Participation in discussions about technology policy and ethics\n- Understanding both opportunities and risks associated with technological advancement"
310
  ]
311
 
312
+ # Create a more comprehensive response
313
  response = f"{base_response}\n\n{random.choice(details)}"
314
 
315
  # Generate correlated scores (realistic relationships)
 
318
  completeness_score = max(0, min(10, base_score + random.uniform(-0.5, 0.3)))
319
  coherence_score = max(0, min(10, base_score + random.uniform(-0.2, 0.4)))
320
 
321
+ # Generate hallucination score (inverse relationship with accuracy)
322
+ hallucination_score = max(0, min(10, 10 - accuracy_score + random.uniform(-1.0, 1.0)))
323
+
324
+ # Generate token consumption based on response length and agent type
325
+ response_length = len(response)
326
+ input_tokens = len(query.split()) * 1.3 # Rough estimate
327
+ output_tokens = response_length / 4 # Rough estimate (4 chars per token)
328
+ total_tokens = int(input_tokens + output_tokens)
329
+
330
+ # Calculate cost (rough estimates per 1K tokens)
331
+ cost_per_1k_tokens = {
332
+ "azure": 0.03, # GPT-4
333
+ "openai": 0.03,
334
+ "anthropic": 0.025
335
+ }
336
+ cost_usd = (total_tokens / 1000) * cost_per_1k_tokens.get(llm_provider, 0.03)
337
+
338
  # Realistic safety scenarios
339
  safety_pass_rate = 0.95 # 95% pass rate
340
  if random.random() < 0.02: # 2% chance of safety issues
 
364
  accuracy_score, # accuracy_score
365
  completeness_score, # completeness_score
366
  coherence_score, # coherence_score
367
+ hallucination_score, # hallucination_score
368
  guardrails_passed, # guardrails_passed
369
  safety_score, # safety_score
370
  execution_time, # execution_time_ms
371
+ int(input_tokens), # input_tokens
372
+ int(output_tokens), # output_tokens
373
+ total_tokens, # total_tokens
374
+ round(cost_usd, 4), # cost_usd
375
  False, # error_occurred
376
+ llm_provider, # llm_provider
377
  "gpt-4o", # model_name
378
  f"Comprehensive evaluation for {agent}: The response demonstrates good understanding of the query with appropriate depth and accuracy. Score breakdown reflects the quality across multiple dimensions.", # judge_reasoning
379
  guardrails_failures, # guardrails_failures
 
384
  INSERT INTO evaluation_logs (
385
  session_id, agent_name, query, response, overall_score,
386
  relevance_score, accuracy_score, completeness_score, coherence_score,
387
+ hallucination_score, guardrails_passed, safety_score, execution_time_ms,
388
+ input_tokens, output_tokens, total_tokens, cost_usd, error_occurred,
389
  llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
390
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
391
  ''', eval_data)
392
 
393
  # Get the evaluation ID for response analysis
394
  evaluation_id = cursor.lastrowid
395
 
396
  # Insert detailed response analysis
397
+ self.insert_response_analysis(cursor, evaluation_id, eval_data[0], agent, response, timestamp)
398
 
399
  def insert_response_analysis(self, cursor, evaluation_id, session_id, agent_name, response_text, timestamp):
400
  """Insert detailed response analysis data"""
 
589
  value=False
590
  )
591
 
592
+ # Advanced filters
593
+ st.sidebar.markdown("### πŸ”¬ Advanced Filters")
594
+
595
+ # Performance tier filter
596
+ filters['performance_tier'] = st.sidebar.selectbox(
597
+ "πŸ“Š Performance Tier",
598
+ options=["All", "Excellent (8.5+)", "Good (7.0-8.5)", "Needs Improvement (<7.0)"],
599
+ index=0
600
+ )
601
+
602
+ # Response time filter
603
+ if not data['evaluations'].empty:
604
+ max_time = data['evaluations']['execution_time_ms'].max()
605
+ filters['max_response_time'] = st.sidebar.slider(
606
+ "⏱️ Max Response Time (ms)",
607
+ min_value=0,
608
+ max_value=int(max_time),
609
+ value=int(max_time),
610
+ step=100
611
+ )
612
+
613
+ # Model/Provider filter
614
+ if not data['evaluations'].empty and 'llm_provider' in data['evaluations'].columns:
615
+ providers = data['evaluations']['llm_provider'].unique().tolist()
616
+ filters['providers'] = st.sidebar.multiselect(
617
+ "πŸ€– LLM Providers",
618
+ options=providers,
619
+ default=providers
620
+ )
621
+
622
+ # Auto-refresh option
623
+ filters['auto_refresh'] = st.sidebar.checkbox(
624
+ "πŸ”„ Auto-refresh (30s)",
625
+ value=False,
626
+ help="Automatically refresh data every 30 seconds"
627
+ )
628
+
629
+ if filters.get('auto_refresh', False):
630
+ st.sidebar.success("πŸ”„ Auto-refresh enabled")
631
+ # Add auto-refresh logic here if needed
632
+
633
  return filters
634
 
635
  def show_executive_summary(self, data: Dict[str, pd.DataFrame]):
 
666
  st.metric("Unique Sessions", f"{unique_sessions:,}")
667
 
668
  # Performance trends
669
+ col1, col2 = st.columns([3, 1])
670
+
671
+ with col1:
672
+ st.subheader("πŸ“Š Performance Trends")
673
+
674
+ with col2:
675
+ trend_period = st.selectbox(
676
+ "πŸ“… Period",
677
+ options=["7 days", "30 days", "All time"],
678
+ index=1,
679
+ key="trend_period"
680
+ )
681
+
682
+ # Filter data based on selected period
683
+ if trend_period == "7 days":
684
+ cutoff_date = datetime.now() - timedelta(days=7)
685
+ trend_df = df[df['timestamp'] >= cutoff_date]
686
+ elif trend_period == "30 days":
687
+ cutoff_date = datetime.now() - timedelta(days=30)
688
+ trend_df = df[df['timestamp'] >= cutoff_date]
689
+ else:
690
+ trend_df = df
691
 
692
  # Daily performance trend
693
+ df_daily = trend_df.groupby(trend_df['timestamp'].dt.date).agg({
694
  'overall_score': 'mean',
695
  'execution_time_ms': 'mean',
696
  'guardrails_passed': lambda x: (x.sum() / len(x)) * 100
 
1063
  mime="text/csv"
1064
  )
1065
 
1066
+ def show_advanced_analytics(self, data: Dict[str, pd.DataFrame]):
1067
+ """Show advanced analytics and insights"""
1068
+ st.header("πŸ”¬ Advanced Analytics & AI Insights")
1069
+
1070
+ if data['evaluations'].empty:
1071
+ st.warning("No evaluation data available")
1072
+ return
1073
+
1074
+ df_eval = data['evaluations']
1075
+ df_analysis = data.get('response_analysis', pd.DataFrame())
1076
+
1077
+ # Performance trends and predictions
1078
+ st.subheader("πŸ“Š Performance Trends & Predictions")
1079
+
1080
+ col1, col2 = st.columns(2)
1081
+
1082
+ with col1:
1083
+ st.write("**πŸ“ˆ Score Trends Over Time**")
1084
+ # Daily performance trend with moving average
1085
+ df_daily = df_eval.groupby(df_eval['timestamp'].dt.date).agg({
1086
+ 'overall_score': ['mean', 'count'],
1087
+ 'execution_time_ms': 'mean'
1088
+ }).reset_index()
1089
+
1090
+ df_daily.columns = ['date', 'avg_score', 'count', 'avg_time']
1091
+
1092
+ # Calculate moving average
1093
+ df_daily['score_ma'] = df_daily['avg_score'].rolling(window=7, min_periods=1).mean()
1094
+
1095
+ fig = go.Figure()
1096
+ fig.add_trace(go.Scatter(
1097
+ x=df_daily['date'],
1098
+ y=df_daily['avg_score'],
1099
+ mode='lines+markers',
1100
+ name='Daily Score',
1101
+ line=dict(color='lightblue', width=1),
1102
+ opacity=0.7
1103
+ ))
1104
+ fig.add_trace(go.Scatter(
1105
+ x=df_daily['date'],
1106
+ y=df_daily['score_ma'],
1107
+ mode='lines',
1108
+ name='7-Day Moving Average',
1109
+ line=dict(color='red', width=3)
1110
+ ))
1111
+ fig.update_layout(
1112
+ title="Score Trends with Moving Average",
1113
+ xaxis_title="Date",
1114
+ yaxis_title="Score",
1115
+ height=400
1116
+ )
1117
+ st.plotly_chart(fig, use_container_width=True)
1118
+
1119
+ with col2:
1120
+ st.write("**⚑ Performance Correlation Matrix**")
1121
+ # Correlation analysis
1122
+ score_cols = ['overall_score', 'relevance_score', 'accuracy_score',
1123
+ 'completeness_score', 'coherence_score', 'execution_time_ms']
1124
+ available_cols = [col for col in score_cols if col in df_eval.columns]
1125
+
1126
+ if len(available_cols) > 2:
1127
+ corr_matrix = df_eval[available_cols].corr()
1128
+
1129
+ fig = px.imshow(
1130
+ corr_matrix,
1131
+ title="Performance Metrics Correlation",
1132
+ color_continuous_scale='RdBu',
1133
+ aspect="auto"
1134
+ )
1135
+ fig.update_layout(height=400)
1136
+ st.plotly_chart(fig, use_container_width=True)
1137
+ else:
1138
+ st.info("Need more metrics for correlation analysis")
1139
+
1140
+ # Agent comparison and benchmarking
1141
+ st.subheader("πŸ† Agent Benchmarking & Competition")
1142
+
1143
+ col1, col2, col3 = st.columns(3)
1144
+
1145
+ with col1:
1146
+ st.write("**πŸ₯‡ Agent Leaderboard**")
1147
+ leaderboard = df_eval.groupby('agent_name').agg({
1148
+ 'overall_score': ['mean', 'std', 'count'],
1149
+ 'execution_time_ms': 'mean'
1150
+ }).round(2)
1151
+
1152
+ leaderboard.columns = ['Avg Score', 'Score StdDev', 'Total Evals', 'Avg Time (ms)']
1153
+ leaderboard['Efficiency'] = (leaderboard['Avg Score'] / (leaderboard['Avg Time (ms)'] / 1000)).round(2)
1154
+ leaderboard = leaderboard.sort_values('Avg Score', ascending=False)
1155
+
1156
+ # Add rank and medals
1157
+ leaderboard['Rank'] = range(1, len(leaderboard) + 1)
1158
+ medals = ['πŸ₯‡', 'πŸ₯ˆ', 'πŸ₯‰'] + ['πŸ…'] * (len(leaderboard) - 3)
1159
+ leaderboard['Medal'] = medals[:len(leaderboard)]
1160
+
1161
+ st.dataframe(leaderboard[['Medal', 'Rank', 'Avg Score', 'Efficiency', 'Total Evals']], use_container_width=True)
1162
+
1163
+ with col2:
1164
+ st.write("**πŸ“Š Performance Distribution**")
1165
+ fig = px.box(
1166
+ df_eval,
1167
+ x='agent_name',
1168
+ y='overall_score',
1169
+ title="Score Distribution by Agent",
1170
+ color='agent_name'
1171
+ )
1172
+ fig.update_layout(height=300, showlegend=False)
1173
+ st.plotly_chart(fig, use_container_width=True)
1174
+
1175
+ with col3:
1176
+ st.write("**⚑ Speed vs Quality**")
1177
+ agent_perf = df_eval.groupby('agent_name').agg({
1178
+ 'overall_score': 'mean',
1179
+ 'execution_time_ms': 'mean'
1180
+ }).reset_index()
1181
+
1182
+ fig = px.scatter(
1183
+ agent_perf,
1184
+ x='execution_time_ms',
1185
+ y='overall_score',
1186
+ size='overall_score',
1187
+ color='agent_name',
1188
+ title="Speed vs Quality Trade-off",
1189
+ labels={'execution_time_ms': 'Response Time (ms)', 'overall_score': 'Quality Score'}
1190
+ )
1191
+ fig.update_layout(height=300)
1192
+ st.plotly_chart(fig, use_container_width=True)
1193
+
1194
+ # AI-powered insights and recommendations
1195
+ st.subheader("πŸ€– AI-Powered Insights & Recommendations")
1196
+
1197
+ # Generate insights
1198
+ insights = self.generate_ai_insights(df_eval, df_analysis)
1199
+
1200
+ col1, col2 = st.columns(2)
1201
+
1202
+ with col1:
1203
+ st.write("**πŸ’‘ Key Insights**")
1204
+ for insight in insights['insights']:
1205
+ st.info(f"πŸ” {insight}")
1206
+
1207
+ with col2:
1208
+ st.write("**πŸš€ Recommendations**")
1209
+ for rec in insights['recommendations']:
1210
+ st.success(f"πŸ’‘ {rec}")
1211
+
1212
+ # Performance anomaly detection
1213
+ st.subheader("πŸ” Anomaly Detection")
1214
+
1215
+ anomalies = self.detect_anomalies(df_eval)
1216
+ if anomalies:
1217
+ st.warning(f"⚠️ Detected {len(anomalies)} potential anomalies:")
1218
+ for anomaly in anomalies:
1219
+ st.write(f"β€’ {anomaly}")
1220
+ else:
1221
+ st.success("βœ… No performance anomalies detected")
1222
+
1223
+ # Real-time monitoring simulation
1224
+ st.subheader("πŸ“‘ Real-time Monitoring Simulation")
1225
+
1226
+ if st.button("πŸ”„ Simulate Real-time Update"):
1227
+ # Simulate new data
1228
+ latest_data = self.simulate_realtime_data()
1229
+
1230
+ col1, col2, col3 = st.columns(3)
1231
+ with col1:
1232
+ st.metric("Latest Score", f"{latest_data['score']:.2f}", f"{latest_data['score_delta']:+.2f}")
1233
+ with col2:
1234
+ st.metric("Response Time", f"{latest_data['time']:.0f}ms", f"{latest_data['time_delta']:+.0f}ms")
1235
+ with col3:
1236
+ st.metric("Safety Status", "βœ… Passed" if latest_data['safe'] else "❌ Failed")
1237
+
1238
+ st.success("πŸ”„ Dashboard updated with latest data!")
1239
+
1240
def generate_ai_insights(self, df_eval, df_analysis):
    """Generate AI-powered insights and recommendations from evaluation data.

    Args:
        df_eval: Evaluations DataFrame; uses columns 'agent_name',
            'overall_score', 'timestamp' (datetime), 'execution_time_ms'
            and 'guardrails_passed'.
        df_analysis: Response-analysis DataFrame; 'readability_score'
            is used when that column is present.

    Returns:
        dict with two lists of strings: 'insights' and 'recommendations'.
    """
    insights = []
    recommendations = []

    # Nothing to analyze - avoid idxmax()/mean() errors on an empty frame.
    if df_eval.empty:
        return {'insights': insights, 'recommendations': recommendations}

    # Performance insights (single groupby instead of two identical ones)
    agent_means = df_eval.groupby('agent_name')['overall_score'].mean()
    best_agent = agent_means.idxmax()
    worst_agent = agent_means.idxmin()

    avg_score = df_eval['overall_score'].mean()
    score_trend = df_eval.groupby(df_eval['timestamp'].dt.date)['overall_score'].mean()

    # Require at least 6 daily points so the first- and last-3-day windows
    # do not overlap (overlapping windows previously produced bogus trends).
    if len(score_trend) >= 6:
        recent_trend = score_trend.iloc[-3:].mean() - score_trend.iloc[:3].mean()
        if recent_trend > 0.5:
            insights.append(f"Performance is improving! Recent scores are {recent_trend:.1f} points higher than earlier.")
        elif recent_trend < -0.5:
            insights.append(f"Performance decline detected. Recent scores are {abs(recent_trend):.1f} points lower.")

    # Agent insights
    insights.append(f"{best_agent} is the top performer with highest average scores.")
    insights.append(f"Overall system performance: {avg_score:.1f}/10 - {'Excellent' if avg_score > 8.5 else 'Good' if avg_score > 7.5 else 'Needs Improvement'}")

    # Response time insights
    avg_time = df_eval['execution_time_ms'].mean()
    if avg_time > 2000:
        insights.append(f"Response times are high (avg: {avg_time:.0f}ms). Consider optimization.")

    # Safety insights
    safety_rate = (df_eval['guardrails_passed'].sum() / len(df_eval)) * 100
    if safety_rate < 95:
        insights.append(f"Safety pass rate is {safety_rate:.1f}% - below recommended 95% threshold.")

    # Recommendations
    if worst_agent != best_agent:
        recommendations.append(f"Consider retraining {worst_agent} using patterns from {best_agent}")

    if avg_time > 1500:
        recommendations.append("Implement caching or optimize model inference to reduce response times")

    recommendations.append("Schedule regular performance reviews every 2 weeks")
    recommendations.append("Set up automated alerts for scores below 7.0 or response times above 3 seconds")

    # Guard the column as well as the frame (mirrors the
    # "'hallucination_score' in ... .columns" checks used elsewhere in file).
    if not df_analysis.empty and 'readability_score' in df_analysis.columns:
        avg_readability = df_analysis['readability_score'].mean()
        if avg_readability < 6:
            recommendations.append("Improve response readability - consider simpler language and shorter sentences")

    return {'insights': insights, 'recommendations': recommendations}
1289
+
1290
def detect_anomalies(self, df_eval):
    """Flag unusual evaluations: outlier scores, slow responses, safety failures.

    Returns a list of human-readable anomaly descriptions (empty if clean).
    """
    findings = []

    # Outlier scores via Tukey's IQR fences.
    q1, q3 = df_eval['overall_score'].quantile([0.25, 0.75])
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    outliers = df_eval[(df_eval['overall_score'] < low_fence) | (df_eval['overall_score'] > high_fence)]
    if len(outliers) > 0:
        findings.append(f"{len(outliers)} evaluations with unusual scores detected")

    # Slow responses: anything beyond the upper IQR fence.
    t_q1, t_q3 = df_eval['execution_time_ms'].quantile([0.25, 0.75])
    slow_fence = t_q3 + 1.5 * (t_q3 - t_q1)

    slow = df_eval[df_eval['execution_time_ms'] > slow_fence]
    if len(slow) > 0:
        findings.append(f"{len(slow)} evaluations with unusually long response times")

    # Safety: more than 10% guardrail failures is worth flagging.
    failures = df_eval[df_eval['guardrails_passed'] == False]
    if len(failures) > len(df_eval) * 0.1:
        findings.append(f"High safety failure rate: {len(failures)} failures out of {len(df_eval)} evaluations")

    return findings
1321
+
1322
def simulate_realtime_data(self):
    """Produce one fake real-time metrics sample for the demo dashboard."""
    import random

    # Draws happen in a fixed order so the sequence is reproducible
    # under a seeded RNG.
    score = random.uniform(7.0, 9.5)
    score_delta = random.uniform(-0.5, 0.5)
    latency = random.uniform(500, 2000)
    latency_delta = random.uniform(-200, 200)
    passed = random.choice([True, True, True, False])  # 3-in-4 chance of "safe"

    return {
        'score': score,
        'score_delta': score_delta,
        'time': latency,
        'time_delta': latency_delta,
        'safe': passed,
    }
1333
+
1334
def show_workflow_visualization(self, data: Dict[str, pd.DataFrame]):
    """Show workflow visualization with queries, scores, latency, hallucination, and token consumption.

    Renders (for a user-selected session / agent): a Mermaid flow diagram,
    aggregate session metrics, one expander per workflow step with score
    breakdown and resource usage, comparative charts, a session summary,
    and a CSV export button.

    Args:
        data: dict of DataFrames; only data['evaluations'] is used here.
    """
    # Hoisted out of the per-row loop: `go` is also needed by the
    # comparative-analysis charts after the loop, so importing it inside
    # the loop only worked because the loop always ran at least once.
    import plotly.graph_objects as go

    st.header("πŸ”„ Workflow Visualization")

    df_eval = data['evaluations']
    if df_eval.empty:
        st.warning("No evaluation data available for workflow visualization.")
        return

    # Create workflow selection
    col1, col2 = st.columns([1, 1])

    with col1:
        sessions = df_eval['session_id'].unique()
        selected_session = st.selectbox("Select Session", sessions, key="workflow_session")

    with col2:
        agents = df_eval['agent_name'].unique()
        selected_agent = st.selectbox("Select Agent (Optional)", ['All'] + list(agents), key="workflow_agent")

    # Filter data
    session_data = df_eval[df_eval['session_id'] == selected_session]
    if selected_agent != 'All':
        session_data = session_data[session_data['agent_name'] == selected_agent]

    if session_data.empty:
        st.warning("No data found for selected filters.")
        return

    # Create workflow diagram
    st.subheader("πŸ“Š Workflow Flow Diagram")

    # Generate Mermaid diagram
    mermaid_diagram = self.create_workflow_diagram(session_data)

    # Display the diagram using markdown (since create_diagram might not be available)
    st.markdown("```mermaid\n" + mermaid_diagram + "\n```")

    # Workflow metrics overview
    st.subheader("πŸ“ˆ Session Metrics Overview")

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        avg_score = session_data['overall_score'].mean()
        # Both branches of the former ternary produced the same string;
        # the delta is simply the signed distance from the 7.5 baseline.
        st.metric("Avg Overall Score", f"{avg_score:.2f}/10",
                 delta=f"{avg_score - 7.5:.2f}")

    with col2:
        avg_latency = session_data['execution_time_ms'].mean()
        st.metric("Avg Response Time", f"{avg_latency:.0f}ms",
                 delta=f"{avg_latency - 3000:.0f}ms" if avg_latency < 3000 else f"+{avg_latency - 3000:.0f}ms")

    with col3:
        # Column is optional in older data; default to 0 when absent.
        avg_hallucination = session_data['hallucination_score'].mean() if 'hallucination_score' in session_data.columns else 0
        st.metric("Avg Hallucination", f"{avg_hallucination:.2f}/10",
                 delta=f"{5.0 - avg_hallucination:.2f}" if avg_hallucination < 5.0 else f"-{avg_hallucination - 5.0:.2f}")

    with col4:
        total_tokens = session_data['total_tokens'].sum() if 'total_tokens' in session_data.columns else 0
        total_cost = session_data['cost_usd'].sum() if 'cost_usd' in session_data.columns else 0
        st.metric("Total Cost", f"${total_cost:.4f}", f"{total_tokens:,} tokens")

    # Detailed workflow steps
    st.subheader("πŸ” Detailed Workflow Steps")

    # enumerate(..., start=1) gives positional step numbers; the previous
    # `idx + 1` used the DataFrame index, which is wrong after filtering.
    for step_no, (idx, row) in enumerate(session_data.iterrows(), start=1):
        with st.expander(f"Step {step_no}: {row['agent_name']} - Score: {row['overall_score']:.2f}/10"):

            # Query and Response
            col1, col2 = st.columns([1, 1])

            with col1:
                st.markdown("**Query:**")
                st.write(row['query'])

                # Performance metrics
                st.markdown("**Performance Metrics:**")
                metrics_data = {
                    'Overall Score': row['overall_score'],
                    'Relevance': row['relevance_score'],
                    'Accuracy': row['accuracy_score'],
                    'Completeness': row['completeness_score'],
                    'Coherence': row['coherence_score'],
                    'Hallucination': row.get('hallucination_score', 0),
                    'Safety': row['safety_score']
                }

                # Create a bar chart for scores
                fig = go.Figure(data=[
                    go.Bar(x=list(metrics_data.keys()), y=list(metrics_data.values()),
                          marker_color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2'])
                ])
                fig.update_layout(
                    title="Score Breakdown",
                    yaxis_title="Score (0-10)",
                    height=300,
                    showlegend=False
                )
                st.plotly_chart(fig, use_container_width=True)

            with col2:
                st.markdown("**Response:**")
                st.write(row['response'])

                # Token and cost information
                st.markdown("**Resource Consumption:**")

                token_col1, token_col2 = st.columns(2)
                with token_col1:
                    input_tokens = row.get('input_tokens', 0)
                    output_tokens = row.get('output_tokens', 0)
                    st.metric("Input Tokens", f"{input_tokens:,}")
                    st.metric("Output Tokens", f"{output_tokens:,}")

                with token_col2:
                    total_tokens = row.get('total_tokens', 0)
                    cost = row.get('cost_usd', 0)
                    st.metric("Total Tokens", f"{total_tokens:,}")
                    st.metric("Cost", f"${cost:.4f}")

                # Execution details
                st.markdown("**Execution Details:**")
                exec_time = row['execution_time_ms']
                llm_provider = row.get('llm_provider', 'Unknown')
                model_name = row.get('model_name', 'Unknown')

                st.write(f"⏱️ **Execution Time:** {exec_time:.0f}ms")
                st.write(f"πŸ€– **LLM Provider:** {llm_provider}")
                st.write(f"🧠 **Model:** {model_name}")
                st.write(f"πŸ›‘οΈ **Safety Passed:** {'βœ…' if row['guardrails_passed'] else '❌'}")

    # Comparative analysis
    st.subheader("πŸ“Š Comparative Analysis")

    # Create comparison charts
    col1, col2 = st.columns(2)

    with col1:
        # Score comparison radar chart: one trace per workflow step.
        fig = go.Figure()

        score_columns = ['overall_score', 'relevance_score', 'accuracy_score', 'completeness_score', 'coherence_score']
        if 'hallucination_score' in session_data.columns:
            score_columns.append('hallucination_score')

        for i, (idx, row) in enumerate(session_data.iterrows()):
            fig.add_trace(go.Scatterpolar(
                r=[row[col] for col in score_columns],
                theta=[col.replace('_score', '').title() for col in score_columns],
                fill='toself',
                name=f"{row['agent_name']} - Step {i+1}"
            ))

        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 10]
                )),
            showlegend=True,
            title="Score Comparison Radar Chart"
        )
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        # Token consumption over steps (bars) vs response time (line, 2nd axis).
        if 'total_tokens' in session_data.columns:
            fig = go.Figure()

            steps = [f"Step {i+1}" for i in range(len(session_data))]
            fig.add_trace(go.Bar(
                x=steps,
                y=session_data['total_tokens'],
                name='Total Tokens',
                marker_color='lightblue'
            ))

            fig.add_trace(go.Scatter(
                x=steps,
                y=session_data['execution_time_ms'],
                yaxis='y2',
                name='Response Time (ms)',
                line=dict(color='red', width=2),
                mode='lines+markers'
            ))

            fig.update_layout(
                title="Token Consumption vs Response Time",
                xaxis_title="Workflow Steps",
                yaxis_title="Total Tokens",
                yaxis2=dict(
                    title="Response Time (ms)",
                    overlaying='y',
                    side='right'
                ),
                height=400
            )
            st.plotly_chart(fig, use_container_width=True)

    # Session summary
    st.subheader("πŸ“‹ Session Summary")

    summary_col1, summary_col2, summary_col3 = st.columns(3)

    with summary_col1:
        st.markdown("**Quality Metrics:**")
        st.write(f"β€’ Average Overall Score: {session_data['overall_score'].mean():.2f}/10")
        st.write(f"β€’ Best Performing Step: {session_data.loc[session_data['overall_score'].idxmax(), 'agent_name']}")
        st.write(f"β€’ Consistency (Std Dev): {session_data['overall_score'].std():.2f}")

    with summary_col2:
        st.markdown("**Performance Metrics:**")
        st.write(f"β€’ Total Execution Time: {session_data['execution_time_ms'].sum():.0f}ms")
        st.write(f"β€’ Average Response Time: {session_data['execution_time_ms'].mean():.0f}ms")
        st.write(f"β€’ Fastest Step: {session_data['execution_time_ms'].min():.0f}ms")

    with summary_col3:
        st.markdown("**Resource Usage:**")
        if 'total_tokens' in session_data.columns:
            st.write(f"β€’ Total Tokens Used: {session_data['total_tokens'].sum():,}")
            st.write(f"β€’ Total Cost: ${session_data['cost_usd'].sum():.4f}")
            st.write(f"β€’ Avg Cost per Query: ${session_data['cost_usd'].mean():.4f}")
        else:
            st.write("β€’ Token data not available")

    # Export functionality
    st.subheader("πŸ“€ Export Workflow Data")

    if st.button("Export Session Data to CSV", key="export_workflow"):
        csv_data = session_data.to_csv(index=False)
        st.download_button(
            label="Download CSV",
            data=csv_data,
            file_name=f"workflow_session_{selected_session}.csv",
            mime="text/csv"
        )
1573
def create_workflow_diagram(self, session_data):
    """Create a Mermaid 'graph TD' flowchart for the session's workflow steps.

    Each step becomes a node labelled with agent, score and execution time,
    colour-coded by score band (green >= 8.5, orange >= 7.0, pink below).

    Args:
        session_data: DataFrame with 'agent_name', 'overall_score' and
            'execution_time_ms' columns, one row per workflow step.

    Returns:
        The Mermaid diagram source as a string.
    """
    diagram = "graph TD\n"
    diagram += "    Start([Session Start])\n"

    # Degenerate case: no steps. Previously this emitted a dangling
    # "Step0_result --> End" edge referencing a node that never existed.
    if len(session_data) == 0:
        diagram += "    Start --> End([Session End])\n"
        return diagram

    for i, (idx, row) in enumerate(session_data.iterrows()):
        step_id = f"Step{i+1}"
        agent_name = row['agent_name'].replace(' ', '_')
        score = row['overall_score']
        exec_time = row['execution_time_ms']

        # Color based on score
        if score >= 8.5:
            color = "fill:#90EE90"  # Light green
        elif score >= 7.0:
            color = "fill:#FFE4B5"  # Light orange
        else:
            color = "fill:#FFB6C1"  # Light pink

        diagram += f"    {step_id}[\"{agent_name}<br/>Score: {score:.1f}/10<br/>Time: {exec_time:.0f}ms\"]\n"
        diagram += f"    {step_id} --> {step_id}_result{{Result}}\n"

        if i == 0:
            diagram += f"    Start --> {step_id}\n"
        else:
            prev_step = f"Step{i}"
            diagram += f"    {prev_step}_result --> {step_id}\n"

        # Add styling: base class, then the per-score colour. The colour
        # was previously computed but never written into the diagram.
        diagram += f"    class {step_id} stepClass;\n"
        diagram += f"    style {step_id} {color}\n"

    # Add end node
    last_step = f"Step{len(session_data)}"
    diagram += f"    {last_step}_result --> End([Session End])\n"

    # Add class definitions
    diagram += "    classDef stepClass fill:#e1f5fe,stroke:#01579b,stroke-width:2px;\n"

    return diagram
1612
+
1613
  def run(self):
1614
  """Run the dashboard"""
1615
  st.title("πŸ€– Multi-Agent System Dashboard - Demo")
 
1644
  if filters.get('safety_only', False):
1645
  df = df[df['guardrails_passed'] == True]
1646
 
1647
+ # Performance tier filter
1648
+ if filters.get('performance_tier') != "All":
1649
+ if filters['performance_tier'] == "Excellent (8.5+)":
1650
+ df = df[df['overall_score'] >= 8.5]
1651
+ elif filters['performance_tier'] == "Good (7.0-8.5)":
1652
+ df = df[(df['overall_score'] >= 7.0) & (df['overall_score'] < 8.5)]
1653
+ elif filters['performance_tier'] == "Needs Improvement (<7.0)":
1654
+ df = df[df['overall_score'] < 7.0]
1655
+
1656
+ # Response time filter
1657
+ if 'max_response_time' in filters:
1658
+ df = df[df['execution_time_ms'] <= filters['max_response_time']]
1659
+
1660
+ # Provider filter
1661
+ if 'providers' in filters and filters['providers']:
1662
+ df = df[df['llm_provider'].isin(filters['providers'])]
1663
+
1664
  filtered_data['evaluations'] = df
1665
 
1666
  # Create tabs
1667
+ tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
1668
  "πŸ“ˆ Executive Summary",
1669
  "πŸ€– Agent Performance",
1670
  "πŸ›‘οΈ Safety Analysis",
1671
+ "πŸ“ Response Analysis",
1672
+ "πŸ”¬ Advanced Analytics",
1673
+ "πŸ”„ Workflow Visualization"
1674
  ])
1675
 
1676
  with tab1:
 
1685
  with tab4:
1686
  self.show_response_analysis(filtered_data)
1687
 
1688
+ with tab5:
1689
+ self.show_advanced_analytics(filtered_data)
1690
+
1691
+ with tab6:
1692
+ self.show_workflow_visualization(filtered_data)
1693
+
1694
+ # Quick actions sidebar
1695
+ st.sidebar.markdown("---")
1696
+ st.sidebar.markdown("### ⚑ Quick Actions")
1697
+
1698
+ if st.sidebar.button("πŸ“Š Generate Report"):
1699
+ st.sidebar.success("πŸ“„ Report generated!")
1700
+ # Could generate PDF report here
1701
+
1702
+ if st.sidebar.button("πŸ”„ Refresh Data"):
1703
+ st.sidebar.success("πŸ”„ Data refreshed!")
1704
+ st.experimental_rerun()
1705
+
1706
+ if st.sidebar.button("πŸ“§ Send Alert"):
1707
+ st.sidebar.success("πŸ“§ Alert sent to team!")
1708
+
1709
+ # Data summary in sidebar
1710
+ if not filtered_data['evaluations'].empty:
1711
+ st.sidebar.markdown("### πŸ“ˆ Current Session")
1712
+ st.sidebar.metric("Filtered Records", len(filtered_data['evaluations']))
1713
+ st.sidebar.metric("Avg Score", f"{filtered_data['evaluations']['overall_score'].mean():.2f}")
1714
+ st.sidebar.metric("Success Rate", f"{(filtered_data['evaluations']['guardrails_passed'].sum() / len(filtered_data['evaluations']) * 100):.1f}%")
1715
+
1716
  # Footer
1717
  st.markdown("---")
1718
+ col1, col2, col3 = st.columns(3)
1719
+
1720
+ with col1:
1721
+ st.markdown("πŸš€ **Multi-Agent System Dashboard**")
1722
+
1723
+ with col2:
1724
+ st.markdown("Built with Streamlit & Plotly")
1725
+
1726
+ with col3:
1727
+ if st.button("ℹ️ About"):
1728
+ st.info("""
1729
+ **Multi-Agent System Dashboard v2.0**
1730
+
1731
+ Features:
1732
+ - πŸ“Š Real-time monitoring
1733
+ - πŸ€– AI-powered insights
1734
+ - πŸ” Advanced analytics
1735
+ - πŸ“ Response tracing
1736
+ - πŸ›‘οΈ Safety monitoring
1737
+ - πŸ“ˆ Performance benchmarking
1738
+
1739
+ Built for production-grade multi-agent systems.
1740
+ """)
1741
 
1742
  if __name__ == "__main__":
1743
  dashboard = HuggingFaceDashboard()