Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Simplified Hugging Face Spaces compatible Multi-Agent System Dashboard | |
| """ | |
| import os | |
| import sys | |
| import tempfile | |
| import sqlite3 | |
| from pathlib import Path | |
| import warnings | |
| from datetime import datetime, timedelta | |
| import random | |
# Suppress warnings
# NOTE(review): this silences every warning category for the whole process,
# including deprecation warnings raised by pandas/plotly/streamlit.
warnings.filterwarnings('ignore')
# Set environment variables for Hugging Face Spaces
# These are assigned before `import streamlit` below so they are already in
# the environment when the library is imported.
# NOTE(review): presumably the server is already running by the time this
# script executes under `streamlit run`, in which case the SERVER_* values can
# only affect code that reads them afterwards -- confirm against the Space's
# launch command.
os.environ['STREAMLIT_SERVER_HEADLESS'] = 'true'
os.environ['STREAMLIT_SERVER_PORT'] = '7860'
os.environ['STREAMLIT_BROWSER_GATHER_USAGE_STATS'] = 'false'
# Import streamlit first and set page config
# The page configuration is issued here, ahead of the remaining imports and of
# any other `st.*` call in this file.
# NOTE(review): Streamlit's documentation has historically required
# set_page_config to precede other Streamlit commands -- confirm for the
# pinned version before moving this call.
import streamlit as st
st.set_page_config(
    page_title="π€ Multi-Agent System Dashboard",
    page_icon="π€",
    layout="wide",
    initial_sidebar_state="expanded"
)
| # Import other required modules | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import json | |
| import numpy as np | |
| from typing import Dict, List, Any | |
class SimpleDashboard:
    """Streamlit dashboard over demo evaluation logs for three agents.

    On construction the dashboard makes sure a SQLite database with randomly
    generated demo records exists in the system temp directory. If that
    fails for any reason it switches to an in-memory DataFrame of fallback
    records instead. ``run()`` then renders four tabs (overview, agent
    performance, response analysis, workflow visualization) from that data.

    Attributes set by ``__init__`` / its helpers:
        db_path: Filesystem path of the SQLite demo database.
        use_fallback: True when ``fallback_df`` is used instead of SQLite.
        fallback_df: Fallback records (only present when ``use_fallback``).
    """

    # --- Demo-data vocabulary (hoisted out of the generation loop) ---------
    _AGENTS = ["Diet Agent", "Support Agent", "Queries Agent"]
    _LLM_PROVIDERS = ["azure", "openai", "anthropic"]
    # Price used for the synthetic cost column, in USD per 1000 tokens.
    _COST_PER_1K_TOKENS = {"azure": 0.03, "openai": 0.03, "anthropic": 0.025}

    _SAMPLE_QUERIES = {
        "Diet Agent": [
            "What's a healthy meal plan for weight loss?",
            "Can you suggest low-carb breakfast options?",
            "What are the benefits of intermittent fasting?",
            "How much protein should I eat daily?",
            "What foods are good for heart health?",
        ],
        "Support Agent": [
            "I'm having trouble sleeping, can you help?",
            "How do I manage work stress?",
            "I feel overwhelmed with my tasks",
            "Can you help me organize my schedule?",
            "How to improve my productivity?",
        ],
        "Queries Agent": [
            "What are the latest developments in AI?",
            "How does blockchain technology work?",
            "What is quantum computing?",
            "Explain machine learning algorithms",
            "What are the benefits of cloud computing?",
        ],
    }

    # Opening paragraph of a generated response, chosen at random per record.
    _RESPONSE_TEMPLATES = {
        "Diet Agent": [
            "Thank you for your question about nutrition and dietary guidance. I'd be happy to help you develop a healthier relationship with food and create sustainable eating habits.",
            "I understand you're looking for dietary advice, and I'm here to provide evidence-based nutritional guidance tailored to your specific needs and goals.",
        ],
        "Support Agent": [
            "I appreciate you reaching out for support. It takes courage to ask for help, and I'm here to provide you with practical strategies and emotional guidance.",
            "Thank you for sharing your concerns with me. I understand this can be challenging, and I want to help you work through this step by step with compassion and understanding.",
        ],
        "Queries Agent": [
            "Excellent question! This is a fascinating topic that involves cutting-edge technology and has significant implications for our future. Let me provide you with a comprehensive overview.",
            "Thank you for this thought-provoking question. This subject encompasses multiple disciplines and recent innovations. I'll break this down into key concepts and practical applications.",
        ],
    }

    # Fixed detail section appended after the opening paragraph, per agent.
    _RESPONSE_DETAILS = {
        "Diet Agent": "**Key Nutritional Recommendations:**\n\n1. **Whole Foods Focus**: Prioritize unprocessed foods like fresh fruits, vegetables, whole grains, lean proteins, and healthy fats.\n\n2. **Portion Control**: Use the plate method - fill half your plate with non-starchy vegetables, one quarter with lean protein, and one quarter with complex carbohydrates.\n\n3. **Hydration**: Aim for 8-10 glasses of water daily to support metabolism and overall health.",
        "Support Agent": "**Comprehensive Support Strategy:**\n\n**Immediate Coping Techniques:**\n1. **Deep Breathing**: Practice the 4-7-8 technique - inhale for 4 counts, hold for 7, exhale for 8.\n\n2. **Grounding Exercises**: Use the 5-4-3-2-1 method - identify 5 things you can see, 4 you can touch, 3 you can hear, 2 you can smell, and 1 you can taste.\n\n**Long-term Strategies:**\n- Establish a consistent daily routine\n- Practice mindfulness meditation for 10-15 minutes daily",
        "Queries Agent": "**Technical Deep Dive:**\n\n**Fundamental Concepts:**\nThis technology represents a convergence of multiple disciplines including computer science, mathematics, engineering, and domain-specific expertise.\n\n**Current Implementation:**\n1. **Healthcare**: AI-powered diagnostic tools and personalized treatment plans\n2. **Finance**: Algorithmic trading and fraud detection\n3. **Transportation**: Autonomous vehicles and traffic optimization",
    }

    _CREATE_TABLE_SQL = '''
        CREATE TABLE IF NOT EXISTS evaluation_logs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            session_id TEXT NOT NULL,
            agent_name TEXT NOT NULL,
            query TEXT NOT NULL,
            response TEXT,
            overall_score REAL,
            relevance_score REAL,
            accuracy_score REAL,
            completeness_score REAL,
            coherence_score REAL,
            hallucination_score REAL,
            guardrails_passed BOOLEAN,
            safety_score REAL,
            execution_time_ms REAL,
            input_tokens INTEGER,
            output_tokens INTEGER,
            total_tokens INTEGER,
            cost_usd REAL,
            llm_provider TEXT,
            model_name TEXT,
            timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    '''

    _INSERT_SQL = '''
        INSERT INTO evaluation_logs (
            session_id, agent_name, query, response, overall_score,
            relevance_score, accuracy_score, completeness_score, coherence_score,
            hallucination_score, guardrails_passed, safety_score, execution_time_ms,
            input_tokens, output_tokens, total_tokens, cost_usd, llm_provider, model_name, timestamp
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''

    def __init__(self):
        """Locate the demo database and prepare data, falling back on error."""
        # Use temp directory for database
        self.db_path = os.path.join(tempfile.gettempdir(), "evaluation_logs.db")
        try:
            self.setup_demo_data()
        except Exception as e:
            # Top-level boundary: any setup failure degrades to in-memory data
            # rather than leaving the dashboard without anything to show.
            st.error(f"Setup error: {str(e)}")
            self.create_fallback_data()

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _clamp_score(value):
        """Clamp *value* into the 0-10 score range."""
        return max(0, min(10, value))

    @staticmethod
    def _format_count(value):
        """Format an integer-like value with thousands separators.

        Returns ``"N/A"`` for missing values (None/NaN) instead of raising,
        because SQLite columns may come back as NaN in pandas.
        """
        if pd.isna(value):
            return "N/A"
        return f"{int(value):,}"

    # ------------------------------------------------------------------
    # Data preparation
    # ------------------------------------------------------------------
    def create_fallback_data(self):
        """Create fallback data if database fails.

        Builds 50 random records in memory, stores them newest-first in
        ``self.fallback_df`` and sets ``self.use_fallback`` to True.
        """
        st.warning("Using fallback demo data")
        records = []
        for i in range(50):
            base_score = random.uniform(7.0, 9.5)
            accuracy = random.uniform(7.0, 9.5)
            input_tokens = random.randint(20, 100)
            output_tokens = random.randint(100, 500)
            records.append({
                'id': i,
                'session_id': f"session_{random.randint(1000, 9999)}",
                'agent_name': random.choice(self._AGENTS),
                'query': f"Sample query {i}",
                'response': f"Sample response {i} with detailed information and comprehensive guidance...",
                'overall_score': base_score,
                'relevance_score': random.uniform(7.0, 9.5),
                'accuracy_score': accuracy,
                'completeness_score': random.uniform(7.0, 9.5),
                'coherence_score': random.uniform(7.0, 9.5),
                'hallucination_score': self._clamp_score(10 - accuracy + random.uniform(-1.0, 1.0)),
                'guardrails_passed': True,
                'safety_score': random.uniform(8.0, 10.0),
                'execution_time_ms': random.uniform(500, 2000),
                'input_tokens': input_tokens,
                'output_tokens': output_tokens,
                # Keep the total consistent with its parts, as the SQLite
                # demo data does.
                'total_tokens': input_tokens + output_tokens,
                'cost_usd': random.uniform(0.001, 0.02),
                'llm_provider': random.choice(self._LLM_PROVIDERS),
                'model_name': 'gpt-4o',
                'timestamp': datetime.now() - timedelta(days=random.randint(0, 30)),
            })
        fallback = pd.DataFrame(records)
        # Newest first, matching the ORDER BY used for the SQLite data so that
        # "recent" views mean the same thing on both paths.
        self.fallback_df = fallback.sort_values('timestamp', ascending=False).reset_index(drop=True)
        self.use_fallback = True

    def setup_demo_data(self):
        """Setup demo database.

        Marks the SQLite path as active and (re)creates the demo records
        unless the table already exists and contains rows.

        Raises:
            sqlite3.Error: If the database cannot be created or written.
        """
        self.use_fallback = False
        # A file-exists check is not enough: an empty or partially created
        # file would otherwise never be populated.
        if not self._demo_table_populated():
            self.create_demo_database()

    def _demo_table_populated(self):
        """Return True when the demo table exists and holds at least one row."""
        if not os.path.exists(self.db_path):
            return False
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                (row_count,) = conn.execute("SELECT COUNT(*) FROM evaluation_logs").fetchone()
            finally:
                conn.close()
        except sqlite3.Error:
            # Missing table or unreadable file: treat as "not populated".
            return False
        return row_count > 0

    def create_demo_database(self):
        """Create demo database.

        Creates the ``evaluation_logs`` table if needed and inserts 100
        randomly generated records in a single transaction.

        Raises:
            sqlite3.Error: If the database cannot be opened or written.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            # `with conn` commits on success and rolls back on error, so a
            # failure cannot leave a half-written batch behind.
            with conn:
                conn.execute(self._CREATE_TABLE_SQL)
                conn.executemany(
                    self._INSERT_SQL,
                    [self._generate_demo_row() for _ in range(100)],
                )
        finally:
            conn.close()

    def _generate_demo_row(self):
        """Build one random record as a tuple ordered like ``_INSERT_SQL``."""
        session_id = f"session_{random.randint(1000, 9999)}"
        agent = random.choice(self._AGENTS)
        query = random.choice(self._SAMPLE_QUERIES[agent])

        # Generate comprehensive response
        base_response = random.choice(self._RESPONSE_TEMPLATES[agent])
        response = f"{base_response}\n\n{self._RESPONSE_DETAILS[agent]}"

        # Generate realistic scores: sub-scores jitter around the overall score
        base_score = random.uniform(7.0, 9.5)
        relevance_score = self._clamp_score(base_score + random.uniform(-0.3, 0.3))
        accuracy_score = self._clamp_score(base_score + random.uniform(-0.4, 0.2))
        completeness_score = self._clamp_score(base_score + random.uniform(-0.5, 0.3))
        coherence_score = self._clamp_score(base_score + random.uniform(-0.2, 0.4))
        # Hallucination moves inversely to accuracy, plus noise.
        hallucination_score = self._clamp_score(10 - accuracy_score + random.uniform(-1.0, 1.0))

        # Generate token consumption from the text sizes
        input_tokens = int(len(query.split()) * 1.3)
        output_tokens = int(len(response) / 4)
        total_tokens = input_tokens + output_tokens

        # Calculate cost
        llm_provider = random.choice(self._LLM_PROVIDERS)
        cost_usd = (total_tokens / 1000) * self._COST_PER_1K_TOKENS[llm_provider]

        timestamp = datetime.now() - timedelta(days=random.randint(0, 30))
        guardrails_passed = random.choice([True, True, True, False])  # 75% pass rate
        safety_score = random.uniform(8.0, 10.0)
        execution_time_ms = random.uniform(500, 2000)

        return (
            session_id, agent, query, response, base_score,
            relevance_score, accuracy_score, completeness_score, coherence_score,
            hallucination_score, guardrails_passed, safety_score, execution_time_ms,
            input_tokens, output_tokens, total_tokens, round(cost_usd, 4),
            llm_provider, "gpt-4o", timestamp.isoformat(),
        )

    def load_data(self):
        """Load data.

        Returns:
            The fallback DataFrame when fallback mode is active; otherwise all
            ``evaluation_logs`` rows, newest first, with ``timestamp`` parsed
            to datetimes. An empty DataFrame is returned (and an error shown)
            if reading fails.
        """
        if self.use_fallback:
            return self.fallback_df
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                df = pd.read_sql_query("SELECT * FROM evaluation_logs ORDER BY timestamp DESC", conn)
            finally:
                # Close even when the query raises.
                conn.close()
            if not df.empty:
                df['timestamp'] = pd.to_datetime(df['timestamp'])
            return df
        except Exception as e:
            st.error(f"Data loading error: {str(e)}")
            return pd.DataFrame()

    # ------------------------------------------------------------------
    # Tabs
    # ------------------------------------------------------------------
    def show_overview(self, df):
        """Show overview tab: four headline metrics and two charts."""
        st.header("π Executive Summary")
        if df.empty:
            st.warning("No data available")
            return

        # Key metrics
        total_col, score_col, safety_col, time_col = st.columns(4)
        with total_col:
            st.metric("Total Evaluations", len(df))
        with score_col:
            avg_score = df['overall_score'].mean()
            st.metric("Average Score", f"{avg_score:.2f}/10")
        with safety_col:
            safety_rate = (df['guardrails_passed'].sum() / len(df)) * 100
            st.metric("Safety Rate", f"{safety_rate:.1f}%")
        with time_col:
            avg_time = df['execution_time_ms'].mean() / 1000  # ms -> s
            st.metric("Avg Response Time", f"{avg_time:.2f}s")

        # Charts
        left, right = st.columns(2)
        with left:
            st.subheader("π Performance by Agent")
            agent_scores = df.groupby('agent_name')['overall_score'].mean().reset_index()
            fig = px.bar(
                agent_scores,
                x='agent_name',
                y='overall_score',
                title="Average Score by Agent",
                color='overall_score',
                color_continuous_scale='viridis'
            )
            st.plotly_chart(fig, use_container_width=True)
        with right:
            st.subheader("π Score Distribution")
            fig = px.histogram(
                df,
                x='overall_score',
                nbins=20,
                title="Score Distribution",
                color_discrete_sequence=['#1f77b4']
            )
            st.plotly_chart(fig, use_container_width=True)

    def show_agent_performance(self, df):
        """Show agent performance tab, optionally filtered to one agent."""
        st.header("π€ Agent Performance Analysis")
        if df.empty:
            st.warning("No data available")
            return

        # Agent selector
        agents = df['agent_name'].unique()
        selected_agent = st.selectbox("Select Agent", ["All Agents"] + list(agents))

        # Filter data
        if selected_agent == "All Agents":
            filtered_df = df
        else:
            filtered_df = df[df['agent_name'] == selected_agent]

        # Performance metrics
        left, right = st.columns(2)
        with left:
            st.subheader("π― Score Breakdown")
            score_cols = ['relevance_score', 'accuracy_score', 'completeness_score', 'coherence_score']
            available_scores = [col for col in score_cols if col in filtered_df.columns]
            if available_scores:
                avg_scores = filtered_df[available_scores].mean()
                fig = px.bar(
                    x=avg_scores.index,
                    y=avg_scores.values,
                    title=f"Average Scores - {selected_agent}",
                    labels={'x': 'Metric', 'y': 'Score'}
                )
                st.plotly_chart(fig, use_container_width=True)
        with right:
            st.subheader("β±οΈ Response Time Analysis")
            fig = px.box(
                filtered_df,
                x='agent_name',
                y='execution_time_ms',
                title="Response Time Distribution"
            )
            st.plotly_chart(fig, use_container_width=True)

        # Recent evaluations table (data arrives newest first from load_data)
        st.subheader("π Recent Evaluations")
        display_cols = ['agent_name', 'query', 'overall_score', 'execution_time_ms', 'timestamp']
        available_cols = [col for col in display_cols if col in filtered_df.columns]
        if available_cols:
            st.dataframe(filtered_df[available_cols].head(20), use_container_width=True)

    def show_response_analysis(self, df):
        """Show response analysis tab: length metrics plus keyword search."""
        st.header("π Response Analysis & Tracing")
        if df.empty:
            st.warning("No data available")
            return

        has_response = 'response' in df.columns

        # Response metrics
        length_col, words_col, rate_col = st.columns(3)
        with length_col:
            if has_response:
                avg_length = df['response'].str.len().mean()
                st.metric("Avg Response Length", f"{avg_length:.0f} chars")
            else:
                st.metric("Avg Response Length", "N/A")
        with words_col:
            if has_response:
                avg_words = df['response'].str.split().str.len().mean()
                st.metric("Avg Word Count", f"{avg_words:.0f} words")
            else:
                st.metric("Avg Word Count", "N/A")
        with rate_col:
            # Guarded like the two metrics above; previously a missing column
            # raised KeyError here.
            if has_response:
                response_rate = (df['response'].notna().sum() / len(df)) * 100
                st.metric("Response Rate", f"{response_rate:.1f}%")
            else:
                st.metric("Response Rate", "N/A")

        # Search functionality
        st.subheader("π Search Responses")
        search_term = st.text_input("Search in responses:", placeholder="Enter keywords...")
        if search_term and has_response:
            # regex=False: the box holds plain keywords; treating user input
            # as a pattern made terms such as "(" raise re.error.
            mask = df['response'].str.contains(search_term, case=False, na=False, regex=False)
            search_results = df[mask]
        else:
            search_results = df.head(10)

        # Display results
        if search_results.empty:
            return
        st.write(f"**Showing {len(search_results)} results**")
        for _, row in search_results.iterrows():
            with st.expander(f"π€ {row['agent_name']} - Score: {row['overall_score']:.1f}"):
                text_col, metrics_col = st.columns([2, 1])
                with text_col:
                    st.write("**Query:**")
                    st.write(row['query'])
                    if 'response' in row and pd.notna(row['response']):
                        st.write("**Response:**")
                        st.write(row['response'])
                with metrics_col:
                    st.write("**Metrics:**")
                    st.write(f"Overall Score: {row['overall_score']:.1f}/10")
                    if 'execution_time_ms' in row:
                        st.write(f"Response Time: {row['execution_time_ms']:.0f}ms")
                    if 'timestamp' in row:
                        st.write(f"Timestamp: {row['timestamp']}")

    def show_workflow_visualization(self, df):
        """Show workflow visualization tab for one selected session."""
        st.header("π Workflow Visualization")
        if df.empty:
            st.warning("No data available for workflow visualization.")
            return

        # Session selection
        sessions = df['session_id'].unique()
        selected_session = st.selectbox("Select Session", sessions, key="workflow_session")

        # Filter data for selected session
        session_data = df[df['session_id'] == selected_session]
        if session_data.empty:
            st.warning("No data found for selected session.")
            return
        if 'timestamp' in session_data.columns:
            # load_data returns newest first; steps are numbered from 1, so
            # present them oldest first.
            session_data = session_data.sort_values('timestamp', kind='stable')

        # Session metrics overview
        st.subheader("π Session Metrics Overview")
        score_col, latency_col, halluc_col, cost_col = st.columns(4)
        with score_col:
            avg_score = session_data['overall_score'].mean()
            st.metric("Avg Overall Score", f"{avg_score:.2f}/10")
        with latency_col:
            avg_latency = session_data['execution_time_ms'].mean()
            st.metric("Avg Response Time", f"{avg_latency:.0f}ms")
        with halluc_col:
            if 'hallucination_score' in session_data.columns:
                avg_hallucination = session_data['hallucination_score'].mean()
                st.metric("Avg Hallucination", f"{avg_hallucination:.2f}/10")
            else:
                st.metric("Avg Hallucination", "N/A")
        with cost_col:
            if 'total_tokens' in session_data.columns:
                total_tokens = session_data['total_tokens'].sum()
                total_cost = session_data['cost_usd'].sum() if 'cost_usd' in session_data.columns else 0
                st.metric("Total Cost", f"${total_cost:.4f}", f"{total_tokens:,} tokens")
            else:
                st.metric("Total Cost", "N/A")

        # Workflow steps
        st.subheader("π Workflow Steps")
        for step_number, (_, row) in enumerate(session_data.iterrows(), start=1):
            self._render_workflow_step(step_number, row)

        self._render_session_summary(session_data)

        # Export functionality
        st.subheader("π€ Export Workflow Data")
        if st.button("Export Session Data to CSV", key="export_workflow"):
            csv_data = session_data.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv_data,
                file_name=f"workflow_session_{selected_session}.csv",
                mime="text/csv"
            )

    def _render_workflow_step(self, step_number, row):
        """Render one evaluation record as an expandable workflow step."""
        with st.expander(f"Step {step_number}: {row['agent_name']} - Score: {row['overall_score']:.2f}/10"):
            left, right = st.columns([1, 1])
            with left:
                st.markdown("**Query:**")
                st.write(row['query'])

                # Performance metrics chart
                st.markdown("**Performance Metrics:**")
                metrics_data = {
                    'Overall': row['overall_score'],
                    'Relevance': row.get('relevance_score', 0),
                    'Accuracy': row.get('accuracy_score', 0),
                    'Completeness': row.get('completeness_score', 0),
                    'Coherence': row.get('coherence_score', 0)
                }
                if 'hallucination_score' in row:
                    metrics_data['Hallucination'] = row['hallucination_score']
                fig = px.bar(
                    x=list(metrics_data.keys()),
                    y=list(metrics_data.values()),
                    title="Score Breakdown",
                    labels={'x': 'Metric', 'y': 'Score (0-10)'}
                )
                fig.update_layout(height=300, showlegend=False)
                st.plotly_chart(fig, use_container_width=True)
            with right:
                st.markdown("**Response:**")
                if pd.notna(row.get('response')):
                    st.write(row['response'])
                else:
                    st.write("No response available")

                # Resource consumption
                st.markdown("**Resource Consumption:**")
                if 'input_tokens' in row and pd.notna(row['input_tokens']):
                    token_col1, token_col2 = st.columns(2)
                    with token_col1:
                        st.metric("Input Tokens", self._format_count(row['input_tokens']))
                        st.metric("Output Tokens", self._format_count(row.get('output_tokens', 0)))
                    with token_col2:
                        st.metric("Total Tokens", self._format_count(row.get('total_tokens', 0)))
                        st.metric("Cost", f"${row.get('cost_usd', 0):.4f}")

                # Execution details
                st.markdown("**Execution Details:**")
                st.write(f"β±οΈ **Execution Time:** {row['execution_time_ms']:.0f}ms")
                if 'llm_provider' in row:
                    st.write(f"π€ **LLM Provider:** {row['llm_provider']}")
                if 'model_name' in row:
                    st.write(f"π§ **Model:** {row['model_name']}")
                st.write(f"π‘οΈ **Safety Passed:** {'β ' if row['guardrails_passed'] else 'β'}")

    def _render_session_summary(self, session_data):
        """Render quality, performance and resource totals for a session."""
        st.subheader("π Session Summary")
        quality_col, perf_col, resource_col = st.columns(3)
        with quality_col:
            st.markdown("**Quality Metrics:**")
            st.write(f"β’ Average Overall Score: {session_data['overall_score'].mean():.2f}/10")
            best_step = session_data.loc[session_data['overall_score'].idxmax()]
            st.write(f"β’ Best Performing Step: {best_step['agent_name']}")
            # The sample standard deviation is NaN for a single-step session;
            # show N/A instead of "nan".
            score_std = session_data['overall_score'].std()
            std_text = "N/A" if pd.isna(score_std) else f"{score_std:.2f}"
            st.write(f"β’ Consistency (Std Dev): {std_text}")
        with perf_col:
            st.markdown("**Performance Metrics:**")
            st.write(f"β’ Total Execution Time: {session_data['execution_time_ms'].sum():.0f}ms")
            st.write(f"β’ Average Response Time: {session_data['execution_time_ms'].mean():.0f}ms")
            st.write(f"β’ Fastest Step: {session_data['execution_time_ms'].min():.0f}ms")
        with resource_col:
            st.markdown("**Resource Usage:**")
            if 'total_tokens' in session_data.columns:
                st.write(f"β’ Total Tokens Used: {session_data['total_tokens'].sum():,}")
                if 'cost_usd' in session_data.columns:
                    st.write(f"β’ Total Cost: ${session_data['cost_usd'].sum():.4f}")
                    st.write(f"β’ Avg Cost per Query: ${session_data['cost_usd'].mean():.4f}")
            else:
                st.write("β’ Token data not available")

    def run(self):
        """Run the dashboard: title, data load, four tabs and footer."""
        st.title("π€ Multi-Agent System Dashboard")
        st.markdown("---")
        st.info("π **Welcome!** This dashboard showcases evaluation metrics for Diet, Support, and Queries agents.")

        # Load data once and share it across all tabs
        df = self.load_data()

        # Create tabs
        overview_tab, agents_tab, responses_tab, workflow_tab = st.tabs([
            "π Overview",
            "π€ Agent Performance",
            "π Response Analysis",
            "π Workflow Visualization"
        ])
        with overview_tab:
            self.show_overview(df)
        with agents_tab:
            self.show_agent_performance(df)
        with responses_tab:
            self.show_response_analysis(df)
        with workflow_tab:
            self.show_workflow_visualization(df)

        # Footer
        st.markdown("---")
        st.markdown("π **Multi-Agent System Dashboard** | Built with Streamlit & Plotly")
# Run the dashboard
# Anything that escapes the dashboard is reported in the page itself, together
# with interpreter and directory details to help diagnose the failure.
try:
    dashboard = SimpleDashboard()
    dashboard.run()
except Exception as error:
    st.error(f"Application Error: {error}")
    st.info("Please refresh the page.")
    with st.expander("Debug Information"):
        debug_report = f"""
Error: {error}
Type: {type(error).__name__}
Python: {sys.version}
Working Dir: {os.getcwd()}
Temp Dir: {tempfile.gettempdir()}
"""
        st.code(debug_report)