saptyfun committed on
Commit
dd34aed
·
verified ·
1 Parent(s): 38e0063

Upload 2 files

Browse files
Files changed (1) hide show
  1. src/app.py +334 -6
src/app.py CHANGED
@@ -93,7 +93,7 @@ class HuggingFaceDashboard:
93
  )
94
  ''')
95
 
96
- # Create workflow_traces table
97
  cursor.execute('''
98
  CREATE TABLE IF NOT EXISTS workflow_traces (
99
  id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -103,6 +103,9 @@ class HuggingFaceDashboard:
103
  step_type TEXT,
104
  input_data TEXT,
105
  output_data TEXT,
 
 
 
106
  execution_time_ms REAL,
107
  error_occurred BOOLEAN DEFAULT FALSE,
108
  error_details TEXT,
@@ -110,6 +113,29 @@ class HuggingFaceDashboard:
110
  )
111
  ''')
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  # Insert demo data
114
  self.insert_demo_data(cursor)
115
 
@@ -249,7 +275,33 @@ class HuggingFaceDashboard:
249
  ]
250
  }
251
 
252
- response = random.choice(response_templates[agent])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
  # Generate correlated scores (realistic relationships)
255
  relevance_score = max(0, min(10, base_score + random.uniform(-0.3, 0.3)))
@@ -305,6 +357,85 @@ class HuggingFaceDashboard:
305
  llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
306
  ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
307
  ''', eval_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
  def safe_column_access(self, df: pd.DataFrame, column: str, default_value=None):
310
  """Safely access DataFrame columns"""
@@ -324,6 +455,7 @@ class HuggingFaceDashboard:
324
  # Base queries
325
  eval_query = "SELECT * FROM evaluation_logs"
326
  trace_query = "SELECT * FROM workflow_traces"
 
327
 
328
  # Apply filters
329
  conditions = []
@@ -345,11 +477,18 @@ class HuggingFaceDashboard:
345
  if conditions:
346
  eval_query += " WHERE " + " AND ".join(conditions)
347
  trace_query += " WHERE " + " AND ".join(conditions)
 
348
 
349
  # Load data
350
  evaluations = pd.read_sql_query(eval_query, conn, params=params)
351
  traces = pd.read_sql_query(trace_query, conn, params=params)
352
 
 
 
 
 
 
 
353
  conn.close()
354
 
355
  # Convert timestamp columns
@@ -357,15 +496,18 @@ class HuggingFaceDashboard:
357
  evaluations['timestamp'] = pd.to_datetime(evaluations['timestamp'])
358
  if not traces.empty:
359
  traces['timestamp'] = pd.to_datetime(traces['timestamp'])
 
 
360
 
361
  return {
362
  'evaluations': evaluations,
363
- 'traces': traces
 
364
  }
365
 
366
  except Exception as e:
367
  st.error(f"Error loading data: {str(e)}")
368
- return {'evaluations': pd.DataFrame(), 'traces': pd.DataFrame()}
369
 
370
  def create_sidebar_filters(self, data: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
371
  """Create sidebar filters"""
@@ -645,6 +787,188 @@ class HuggingFaceDashboard:
645
  annotation_text="95% Target")
646
  st.plotly_chart(fig, use_container_width=True)
647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
  def run(self):
649
  """Run the dashboard"""
650
  st.title("πŸ€– Multi-Agent System Dashboard - Demo")
@@ -682,10 +1006,11 @@ class HuggingFaceDashboard:
682
  filtered_data['evaluations'] = df
683
 
684
  # Create tabs
685
- tab1, tab2, tab3 = st.tabs([
686
  "πŸ“ˆ Executive Summary",
687
  "πŸ€– Agent Performance",
688
- "πŸ›‘οΈ Safety Analysis"
 
689
  ])
690
 
691
  with tab1:
@@ -697,6 +1022,9 @@ class HuggingFaceDashboard:
697
  with tab3:
698
  self.show_safety_analysis(filtered_data)
699
 
 
 
 
700
  # Footer
701
  st.markdown("---")
702
  st.markdown("πŸš€ **Multi-Agent System Dashboard** | Built with Streamlit & Plotly | Demo hosted on Hugging Face Spaces")
 
93
  )
94
  ''')
95
 
96
+ # Create workflow_traces table with enhanced response tracking
97
  cursor.execute('''
98
  CREATE TABLE IF NOT EXISTS workflow_traces (
99
  id INTEGER PRIMARY KEY AUTOINCREMENT,
 
103
  step_type TEXT,
104
  input_data TEXT,
105
  output_data TEXT,
106
+ response_metadata TEXT,
107
+ token_count INTEGER,
108
+ response_length INTEGER,
109
  execution_time_ms REAL,
110
  error_occurred BOOLEAN DEFAULT FALSE,
111
  error_details TEXT,
 
113
  )
114
  ''')
115
 
116
+ # Create response_analysis table for detailed response tracking
117
+ cursor.execute('''
118
+ CREATE TABLE IF NOT EXISTS response_analysis (
119
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
120
+ evaluation_id INTEGER,
121
+ session_id TEXT NOT NULL,
122
+ agent_name TEXT NOT NULL,
123
+ response_text TEXT NOT NULL,
124
+ response_length INTEGER,
125
+ word_count INTEGER,
126
+ sentence_count INTEGER,
127
+ readability_score REAL,
128
+ sentiment_score REAL,
129
+ key_topics TEXT,
130
+ response_type TEXT,
131
+ contains_code BOOLEAN DEFAULT FALSE,
132
+ contains_links BOOLEAN DEFAULT FALSE,
133
+ language_detected TEXT DEFAULT 'en',
134
+ timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
135
+ FOREIGN KEY (evaluation_id) REFERENCES evaluation_logs (id)
136
+ )
137
+ ''')
138
+
139
  # Insert demo data
140
  self.insert_demo_data(cursor)
141
 
 
275
  ]
276
  }
277
 
278
+ # Generate more detailed response based on agent type
279
+ base_response = random.choice(response_templates[agent])
280
+
281
+ # Add specific details based on agent type
282
+ if agent == "Diet Agent":
283
+ details = [
284
+ "Key recommendations: 1) Focus on whole foods, 2) Control portions, 3) Stay hydrated",
285
+ "Nutritional guidelines: Aim for 50% vegetables, 25% lean protein, 25% complex carbs",
286
+ "Meal timing: Consider eating every 3-4 hours to maintain stable blood sugar",
287
+ "Sample foods: Quinoa, salmon, leafy greens, berries, nuts, and legumes"
288
+ ]
289
+ elif agent == "Support Agent":
290
+ details = [
291
+ "Action steps: 1) Identify triggers, 2) Develop coping strategies, 3) Practice regularly",
292
+ "Techniques to try: Deep breathing, progressive muscle relaxation, mindfulness meditation",
293
+ "Timeline: Start with 5-10 minutes daily, gradually increase as comfort grows",
294
+ "Resources: Consider apps like Headspace, Calm, or consulting a professional"
295
+ ]
296
+ else: # Queries Agent
297
+ details = [
298
+ "Technical overview: This involves complex algorithms and data processing methods",
299
+ "Current applications: Used in healthcare, finance, transportation, and entertainment",
300
+ "Future implications: Expected to revolutionize how we work and interact with technology",
301
+ "Key considerations: Privacy, security, ethical implications, and regulatory frameworks"
302
+ ]
303
+
304
+ response = f"{base_response}\n\n{random.choice(details)}"
305
 
306
  # Generate correlated scores (realistic relationships)
307
  relevance_score = max(0, min(10, base_score + random.uniform(-0.3, 0.3)))
 
357
  llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
358
  ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
359
  ''', eval_data)
360
+
361
+ # Get the evaluation ID for response analysis
362
+ evaluation_id = cursor.lastrowid
363
+
364
+ # Insert detailed response analysis
365
+ self.insert_response_analysis(cursor, evaluation_id, session_id, agent, response, timestamp)
366
+
367
def insert_response_analysis(self, cursor, evaluation_id, session_id, agent_name, response_text, timestamp):
    """Compute lightweight text metrics for an agent response and persist them.

    Inserts one row into the ``response_analysis`` table linked to the parent
    evaluation row. All metrics are cheap heuristics (no NLP dependencies).

    Args:
        cursor: Open sqlite3 cursor; the caller owns commit/close.
        evaluation_id: Row id of the parent record in ``evaluation_logs``.
        session_id: Conversation/session identifier.
        agent_name: Name of the agent that produced the response.
        response_text: Raw response text to analyze.
        timestamp: datetime of the response; stored as ISO-8601 text.
    """
    import re

    def _has_word(word, text):
        # Whole-word match. Plain substring tests wrongly matched e.g. 'is'
        # inside 'this'/'analysis' and 'ai' inside 'available', which skewed
        # response_type and key_topics for almost every response.
        return re.search(r'\b' + re.escape(word) + r'\b', text) is not None

    # Basic size metrics.
    response_length = len(response_text)
    word_count = len(response_text.split())
    # Count non-empty sentence segments; the previous `len(split) - 1` form
    # reported 0 sentences for text without terminal punctuation.
    sentence_count = len([s for s in re.split(r'[.!?]+', response_text) if s.strip()])

    # Simple readability score (Flesch-like approximation, clamped to 0-10).
    if sentence_count > 0 and word_count > 0:
        avg_sentence_length = word_count / sentence_count
        readability_score = max(0, min(10, 10 - (avg_sentence_length - 15) * 0.1))
    else:
        readability_score = 5.0

    # Simple lexicon-based sentiment on a 0-10 scale (5 = neutral).
    positive_words = ['good', 'great', 'excellent', 'helpful', 'recommend', 'beneficial', 'effective', 'important', 'valuable', 'useful']
    negative_words = ['bad', 'poor', 'difficult', 'problem', 'issue', 'concern', 'warning', 'avoid', 'risk', 'danger']

    text_lower = response_text.lower()
    positive_count = sum(1 for word in positive_words if _has_word(word, text_lower))
    negative_count = sum(1 for word in negative_words if _has_word(word, text_lower))

    if positive_count + negative_count > 0:
        sentiment_score = (positive_count - negative_count) / (positive_count + negative_count) * 5 + 5
    else:
        sentiment_score = 5.0  # Neutral

    # Naive keyword-based topic tagging (insertion order preserved).
    topic_keywords = {
        'nutrition': ['diet', 'food', 'nutrition'],
        'fitness': ['exercise', 'workout', 'fitness'],
        'mental_health': ['stress', 'anxiety', 'mental'],
        'technology': ['technology', 'ai', 'algorithm'],
        'health': ['health', 'medical'],
    }
    keywords = [topic for topic, words in topic_keywords.items()
                if any(_has_word(w, text_lower) for w in words)]
    key_topics = ','.join(keywords) if keywords else 'general'

    # Coarse response-type classification; first matching rule wins.
    if '?' in response_text:
        response_type = 'question'
    elif any(_has_word(w, text_lower) for w in ['recommend', 'suggest', 'try', 'consider']):
        response_type = 'recommendation'
    elif any(_has_word(w, text_lower) for w in ['explain', 'definition', 'means', 'is']):
        response_type = 'explanation'
    else:
        response_type = 'general'

    # Heuristic flags for embedded code snippets and hyperlinks.
    contains_code = bool(re.search(r'```|`.*`|\bcode\b|\bfunction\b|\bclass\b', response_text))
    contains_links = bool(re.search(r'http[s]?://|www\.|\.com|\.org', response_text))

    # Persist one analysis row linked to the parent evaluation.
    cursor.execute('''
        INSERT INTO response_analysis (
            evaluation_id, session_id, agent_name, response_text, response_length,
            word_count, sentence_count, readability_score, sentiment_score,
            key_topics, response_type, contains_code, contains_links,
            language_detected, timestamp
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', (
        evaluation_id, session_id, agent_name, response_text, response_length,
        word_count, sentence_count, readability_score, sentiment_score,
        key_topics, response_type, contains_code, contains_links,
        'en', timestamp.isoformat()
    ))
439
 
440
  def safe_column_access(self, df: pd.DataFrame, column: str, default_value=None):
441
  """Safely access DataFrame columns"""
 
455
  # Base queries
456
  eval_query = "SELECT * FROM evaluation_logs"
457
  trace_query = "SELECT * FROM workflow_traces"
458
+ response_analysis_query = "SELECT * FROM response_analysis"
459
 
460
  # Apply filters
461
  conditions = []
 
477
  if conditions:
478
  eval_query += " WHERE " + " AND ".join(conditions)
479
  trace_query += " WHERE " + " AND ".join(conditions)
480
+ response_analysis_query += " WHERE " + " AND ".join(conditions)
481
 
482
  # Load data
483
  evaluations = pd.read_sql_query(eval_query, conn, params=params)
484
  traces = pd.read_sql_query(trace_query, conn, params=params)
485
 
486
+ # Load response analysis data (handle if table doesn't exist yet)
487
+ try:
488
+ response_analysis = pd.read_sql_query(response_analysis_query, conn, params=params)
489
+ except Exception:
490
+ response_analysis = pd.DataFrame()
491
+
492
  conn.close()
493
 
494
  # Convert timestamp columns
 
496
  evaluations['timestamp'] = pd.to_datetime(evaluations['timestamp'])
497
  if not traces.empty:
498
  traces['timestamp'] = pd.to_datetime(traces['timestamp'])
499
+ if not response_analysis.empty:
500
+ response_analysis['timestamp'] = pd.to_datetime(response_analysis['timestamp'])
501
 
502
  return {
503
  'evaluations': evaluations,
504
+ 'traces': traces,
505
+ 'response_analysis': response_analysis
506
  }
507
 
508
  except Exception as e:
509
  st.error(f"Error loading data: {str(e)}")
510
+ return {'evaluations': pd.DataFrame(), 'traces': pd.DataFrame(), 'response_analysis': pd.DataFrame()}
511
 
512
  def create_sidebar_filters(self, data: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
513
  """Create sidebar filters"""
 
787
  annotation_text="95% Target")
788
  st.plotly_chart(fig, use_container_width=True)
789
 
790
def show_response_analysis(self, data: Dict[str, pd.DataFrame]):
    """Render the "Response Analysis & Tracing" tab.

    Shows aggregate response metrics, distribution charts from the
    ``response_analysis`` table (when present), a keyword search over raw
    responses, per-response detail expanders, and CSV export buttons.

    Args:
        data: Dict with an 'evaluations' DataFrame (required) and an optional
            'response_analysis' DataFrame, as produced by the data loader.
    """
    st.header("📝 Response Analysis & Tracing")

    if data['evaluations'].empty:
        st.warning("No evaluation data available")
        return

    df_eval = data['evaluations']
    # response_analysis may be absent (older DB files) — fall back to empty.
    df_analysis = data.get('response_analysis', pd.DataFrame())

    # Response overview metrics
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        avg_response_length = df_eval['response'].str.len().mean() if 'response' in df_eval.columns else 0
        st.metric("Avg Response Length", f"{avg_response_length:.0f} chars")

    with col2:
        if not df_analysis.empty:
            avg_word_count = df_analysis['word_count'].mean()
            st.metric("Avg Word Count", f"{avg_word_count:.0f} words")
        else:
            st.metric("Avg Word Count", "N/A")

    with col3:
        if not df_analysis.empty:
            avg_readability = df_analysis['readability_score'].mean()
            st.metric("Avg Readability", f"{avg_readability:.1f}/10")
        else:
            st.metric("Avg Readability", "N/A")

    with col4:
        if not df_analysis.empty:
            avg_sentiment = df_analysis['sentiment_score'].mean()
            st.metric("Avg Sentiment", f"{avg_sentiment:.1f}/10")
        else:
            st.metric("Avg Sentiment", "N/A")

    # Response analysis charts (only when analysis rows exist)
    if not df_analysis.empty:
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("📊 Response Length Distribution")
            fig = px.histogram(
                df_analysis,
                x='response_length',
                nbins=20,
                title="Response Length Distribution",
                labels={'response_length': 'Response Length (characters)', 'count': 'Frequency'}
            )
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            st.subheader("📈 Readability vs Sentiment")
            fig = px.scatter(
                df_analysis,
                x='readability_score',
                y='sentiment_score',
                color='agent_name',
                title="Readability vs Sentiment by Agent",
                labels={'readability_score': 'Readability Score', 'sentiment_score': 'Sentiment Score'}
            )
            st.plotly_chart(fig, use_container_width=True)

        # Response type analysis
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("🏷️ Response Types")
            response_types = df_analysis['response_type'].value_counts()
            fig = px.pie(
                values=response_types.values,
                names=response_types.index,
                title="Distribution of Response Types"
            )
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            st.subheader("🔍 Key Topics")
            # Flatten comma-separated topic strings into a single list.
            all_topics = []
            for topics in df_analysis['key_topics'].dropna():
                all_topics.extend(topics.split(','))

            if all_topics:
                topic_counts = pd.Series(all_topics).value_counts().head(10)
                fig = px.bar(
                    x=topic_counts.values,
                    y=topic_counts.index,
                    orientation='h',
                    title="Top 10 Key Topics",
                    labels={'x': 'Frequency', 'y': 'Topics'}
                )
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.info("No topic data available")

    # Response tracing section
    st.subheader("🔍 Response Tracing")

    # Search functionality
    search_term = st.text_input("🔍 Search in responses:", placeholder="Enter keywords to search...")

    # Guard the 'response' column here too — the length metric above already
    # tolerates its absence, and an unguarded lookup raised KeyError.
    if search_term and 'response' in df_eval.columns:
        mask = df_eval['response'].str.contains(search_term, case=False, na=False)
        filtered_responses = df_eval[mask]
    else:
        filtered_responses = df_eval.head(10)  # Show first 10 by default

    # Display responses with details
    if not filtered_responses.empty:
        st.write(f"**Found {len(filtered_responses)} responses**")

        for idx, row in filtered_responses.iterrows():
            with st.expander(f"🤖 {row['agent_name']} - Session: {row['session_id'][:8]}... - Score: {row['overall_score']:.1f}"):
                col1, col2 = st.columns([2, 1])

                with col1:
                    st.write("**Query:**")
                    st.write(row['query'])

                    st.write("**Response:**")
                    st.write(row['response'])

                with col2:
                    st.write("**Evaluation Scores:**")
                    st.write(f"Overall: {row['overall_score']:.1f}/10")
                    if 'relevance_score' in row:
                        st.write(f"Relevance: {row['relevance_score']:.1f}/10")
                    if 'accuracy_score' in row:
                        st.write(f"Accuracy: {row['accuracy_score']:.1f}/10")
                    if 'completeness_score' in row:
                        st.write(f"Completeness: {row['completeness_score']:.1f}/10")
                    if 'coherence_score' in row:
                        st.write(f"Coherence: {row['coherence_score']:.1f}/10")

                    st.write("**Metadata:**")
                    st.write(f"Timestamp: {row['timestamp']}")
                    # Guard optional columns the same way as the scores above.
                    if 'execution_time_ms' in row:
                        st.write(f"Response Time: {row['execution_time_ms']:.0f}ms")
                    if 'guardrails_passed' in row:
                        st.write(f"Safety: {'✅ Passed' if row['guardrails_passed'] else '❌ Failed'}")

                    # Show response analysis if available
                    if not df_analysis.empty and 'id' in row:
                        analysis_row = df_analysis[df_analysis['evaluation_id'] == row['id']]
                        if not analysis_row.empty:
                            analysis = analysis_row.iloc[0]
                            st.write("**Response Analysis:**")
                            st.write(f"Length: {analysis['response_length']} chars")
                            st.write(f"Words: {analysis['word_count']}")
                            st.write(f"Readability: {analysis['readability_score']:.1f}/10")
                            st.write(f"Sentiment: {analysis['sentiment_score']:.1f}/10")
                            st.write(f"Type: {analysis['response_type']}")
                            st.write(f"Topics: {analysis['key_topics']}")
    else:
        st.info("No responses found matching your search criteria.")

    # Export response data
    st.subheader("📤 Export Response Data")
    col1, col2 = st.columns(2)

    with col1:
        if st.button("📊 Export Evaluation Data"):
            csv = df_eval.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name="evaluation_responses.csv",
                mime="text/csv"
            )

    with col2:
        if not df_analysis.empty and st.button("📈 Export Analysis Data"):
            csv = df_analysis.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name="response_analysis.csv",
                mime="text/csv"
            )
972
  def run(self):
973
  """Run the dashboard"""
974
  st.title("πŸ€– Multi-Agent System Dashboard - Demo")
 
1006
  filtered_data['evaluations'] = df
1007
 
1008
  # Create tabs
1009
+ tab1, tab2, tab3, tab4 = st.tabs([
1010
  "πŸ“ˆ Executive Summary",
1011
  "πŸ€– Agent Performance",
1012
+ "πŸ›‘οΈ Safety Analysis",
1013
+ "πŸ“ Response Analysis"
1014
  ])
1015
 
1016
  with tab1:
 
1022
  with tab3:
1023
  self.show_safety_analysis(filtered_data)
1024
 
1025
+ with tab4:
1026
+ self.show_response_analysis(filtered_data)
1027
+
1028
  # Footer
1029
  st.markdown("---")
1030
  st.markdown("πŸš€ **Multi-Agent System Dashboard** | Built with Streamlit & Plotly | Demo hosted on Hugging Face Spaces")