saptyfun committed on
Commit
dd34aed
·
verified ·
1 Parent(s): 38e0063

Upload 2 files

Browse files
Files changed (1) hide show
  1. src/app.py +334 -6
src/app.py CHANGED
@@ -93,7 +93,7 @@ class HuggingFaceDashboard:
93
  )
94
  ''')
95
 
96
- # Create workflow_traces table
97
  cursor.execute('''
98
  CREATE TABLE IF NOT EXISTS workflow_traces (
99
  id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -103,6 +103,9 @@ class HuggingFaceDashboard:
103
  step_type TEXT,
104
  input_data TEXT,
105
  output_data TEXT,
 
 
 
106
  execution_time_ms REAL,
107
  error_occurred BOOLEAN DEFAULT FALSE,
108
  error_details TEXT,
@@ -110,6 +113,29 @@ class HuggingFaceDashboard:
110
  )
111
  ''')
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  # Insert demo data
114
  self.insert_demo_data(cursor)
115
 
@@ -249,7 +275,33 @@ class HuggingFaceDashboard:
249
  ]
250
  }
251
 
252
- response = random.choice(response_templates[agent])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
  # Generate correlated scores (realistic relationships)
255
  relevance_score = max(0, min(10, base_score + random.uniform(-0.3, 0.3)))
@@ -305,6 +357,85 @@ class HuggingFaceDashboard:
305
  llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
306
  ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
307
  ''', eval_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
  def safe_column_access(self, df: pd.DataFrame, column: str, default_value=None):
310
  """Safely access DataFrame columns"""
@@ -324,6 +455,7 @@ class HuggingFaceDashboard:
324
  # Base queries
325
  eval_query = "SELECT * FROM evaluation_logs"
326
  trace_query = "SELECT * FROM workflow_traces"
 
327
 
328
  # Apply filters
329
  conditions = []
@@ -345,11 +477,18 @@ class HuggingFaceDashboard:
345
  if conditions:
346
  eval_query += " WHERE " + " AND ".join(conditions)
347
  trace_query += " WHERE " + " AND ".join(conditions)
 
348
 
349
  # Load data
350
  evaluations = pd.read_sql_query(eval_query, conn, params=params)
351
  traces = pd.read_sql_query(trace_query, conn, params=params)
352
 
 
 
 
 
 
 
353
  conn.close()
354
 
355
  # Convert timestamp columns
@@ -357,15 +496,18 @@ class HuggingFaceDashboard:
357
  evaluations['timestamp'] = pd.to_datetime(evaluations['timestamp'])
358
  if not traces.empty:
359
  traces['timestamp'] = pd.to_datetime(traces['timestamp'])
 
 
360
 
361
  return {
362
  'evaluations': evaluations,
363
- 'traces': traces
 
364
  }
365
 
366
  except Exception as e:
367
  st.error(f"Error loading data: {str(e)}")
368
- return {'evaluations': pd.DataFrame(), 'traces': pd.DataFrame()}
369
 
370
  def create_sidebar_filters(self, data: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
371
  """Create sidebar filters"""
@@ -645,6 +787,188 @@ class HuggingFaceDashboard:
645
  annotation_text="95% Target")
646
  st.plotly_chart(fig, use_container_width=True)
647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
  def run(self):
649
  """Run the dashboard"""
650
  st.title("πŸ€– Multi-Agent System Dashboard - Demo")
@@ -682,10 +1006,11 @@ class HuggingFaceDashboard:
682
  filtered_data['evaluations'] = df
683
 
684
  # Create tabs
685
- tab1, tab2, tab3 = st.tabs([
686
  "πŸ“ˆ Executive Summary",
687
  "πŸ€– Agent Performance",
688
- "πŸ›‘οΈ Safety Analysis"
 
689
  ])
690
 
691
  with tab1:
@@ -697,6 +1022,9 @@ class HuggingFaceDashboard:
697
  with tab3:
698
  self.show_safety_analysis(filtered_data)
699
 
 
 
 
700
  # Footer
701
  st.markdown("---")
702
  st.markdown("πŸš€ **Multi-Agent System Dashboard** | Built with Streamlit & Plotly | Demo hosted on Hugging Face Spaces")
 
93
  )
94
  ''')
95
 
96
+ # Create workflow_traces table with enhanced response tracking
97
  cursor.execute('''
98
  CREATE TABLE IF NOT EXISTS workflow_traces (
99
  id INTEGER PRIMARY KEY AUTOINCREMENT,
 
103
  step_type TEXT,
104
  input_data TEXT,
105
  output_data TEXT,
106
+ response_metadata TEXT,
107
+ token_count INTEGER,
108
+ response_length INTEGER,
109
  execution_time_ms REAL,
110
  error_occurred BOOLEAN DEFAULT FALSE,
111
  error_details TEXT,
 
113
  )
114
  ''')
115
 
116
+ # Create response_analysis table for detailed response tracking
117
+ cursor.execute('''
118
+ CREATE TABLE IF NOT EXISTS response_analysis (
119
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
120
+ evaluation_id INTEGER,
121
+ session_id TEXT NOT NULL,
122
+ agent_name TEXT NOT NULL,
123
+ response_text TEXT NOT NULL,
124
+ response_length INTEGER,
125
+ word_count INTEGER,
126
+ sentence_count INTEGER,
127
+ readability_score REAL,
128
+ sentiment_score REAL,
129
+ key_topics TEXT,
130
+ response_type TEXT,
131
+ contains_code BOOLEAN DEFAULT FALSE,
132
+ contains_links BOOLEAN DEFAULT FALSE,
133
+ language_detected TEXT DEFAULT 'en',
134
+ timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
135
+ FOREIGN KEY (evaluation_id) REFERENCES evaluation_logs (id)
136
+ )
137
+ ''')
138
+
139
  # Insert demo data
140
  self.insert_demo_data(cursor)
141
 
 
275
  ]
276
  }
277
 
278
+ # Generate more detailed response based on agent type
279
+ base_response = random.choice(response_templates[agent])
280
+
281
+ # Add specific details based on agent type
282
+ if agent == "Diet Agent":
283
+ details = [
284
+ "Key recommendations: 1) Focus on whole foods, 2) Control portions, 3) Stay hydrated",
285
+ "Nutritional guidelines: Aim for 50% vegetables, 25% lean protein, 25% complex carbs",
286
+ "Meal timing: Consider eating every 3-4 hours to maintain stable blood sugar",
287
+ "Sample foods: Quinoa, salmon, leafy greens, berries, nuts, and legumes"
288
+ ]
289
+ elif agent == "Support Agent":
290
+ details = [
291
+ "Action steps: 1) Identify triggers, 2) Develop coping strategies, 3) Practice regularly",
292
+ "Techniques to try: Deep breathing, progressive muscle relaxation, mindfulness meditation",
293
+ "Timeline: Start with 5-10 minutes daily, gradually increase as comfort grows",
294
+ "Resources: Consider apps like Headspace, Calm, or consulting a professional"
295
+ ]
296
+ else: # Queries Agent
297
+ details = [
298
+ "Technical overview: This involves complex algorithms and data processing methods",
299
+ "Current applications: Used in healthcare, finance, transportation, and entertainment",
300
+ "Future implications: Expected to revolutionize how we work and interact with technology",
301
+ "Key considerations: Privacy, security, ethical implications, and regulatory frameworks"
302
+ ]
303
+
304
+ response = f"{base_response}\n\n{random.choice(details)}"
305
 
306
  # Generate correlated scores (realistic relationships)
307
  relevance_score = max(0, min(10, base_score + random.uniform(-0.3, 0.3)))
 
357
  llm_provider, model_name, judge_reasoning, guardrails_failures, timestamp
358
  ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
359
  ''', eval_data)
360
+
361
+ # Get the evaluation ID for response analysis
362
+ evaluation_id = cursor.lastrowid
363
+
364
+ # Insert detailed response analysis
365
+ self.insert_response_analysis(cursor, evaluation_id, session_id, agent, response, timestamp)
366
+
367
def insert_response_analysis(self, cursor, evaluation_id, session_id, agent_name, response_text, timestamp):
    """Compute lightweight text metrics for an agent response and persist them.

    Inserts one row into the ``response_analysis`` table linked to the parent
    evaluation row. All metrics are cheap heuristics (no NLP dependencies).

    Args:
        cursor: Open sqlite3 cursor; the caller owns commit/close.
        evaluation_id: Row id of the parent record in ``evaluation_logs``.
        session_id: Conversation/session identifier.
        agent_name: Name of the agent that produced the response.
        response_text: Raw response text to analyze.
        timestamp: datetime of the response; stored as ISO-8601 text.
    """
    import re

    def _has_word(word, text):
        # Whole-word match. Plain substring tests wrongly matched e.g. 'is'
        # inside 'this'/'analysis' and 'ai' inside 'available', which skewed
        # response_type and key_topics for almost every response.
        return re.search(r'\b' + re.escape(word) + r'\b', text) is not None

    # Basic size metrics.
    response_length = len(response_text)
    word_count = len(response_text.split())
    # Count non-empty sentence segments; the previous `len(split) - 1` form
    # reported 0 sentences for text without terminal punctuation.
    sentence_count = len([s for s in re.split(r'[.!?]+', response_text) if s.strip()])

    # Simple readability score (Flesch-like approximation, clamped to 0-10).
    if sentence_count > 0 and word_count > 0:
        avg_sentence_length = word_count / sentence_count
        readability_score = max(0, min(10, 10 - (avg_sentence_length - 15) * 0.1))
    else:
        readability_score = 5.0

    # Simple lexicon-based sentiment on a 0-10 scale (5 = neutral).
    positive_words = ['good', 'great', 'excellent', 'helpful', 'recommend', 'beneficial', 'effective', 'important', 'valuable', 'useful']
    negative_words = ['bad', 'poor', 'difficult', 'problem', 'issue', 'concern', 'warning', 'avoid', 'risk', 'danger']

    text_lower = response_text.lower()
    positive_count = sum(1 for word in positive_words if _has_word(word, text_lower))
    negative_count = sum(1 for word in negative_words if _has_word(word, text_lower))

    if positive_count + negative_count > 0:
        sentiment_score = (positive_count - negative_count) / (positive_count + negative_count) * 5 + 5
    else:
        sentiment_score = 5.0  # Neutral

    # Naive keyword-based topic tagging (insertion order preserved).
    topic_keywords = {
        'nutrition': ['diet', 'food', 'nutrition'],
        'fitness': ['exercise', 'workout', 'fitness'],
        'mental_health': ['stress', 'anxiety', 'mental'],
        'technology': ['technology', 'ai', 'algorithm'],
        'health': ['health', 'medical'],
    }
    keywords = [topic for topic, words in topic_keywords.items()
                if any(_has_word(w, text_lower) for w in words)]
    key_topics = ','.join(keywords) if keywords else 'general'

    # Coarse response-type classification; first matching rule wins.
    if '?' in response_text:
        response_type = 'question'
    elif any(_has_word(w, text_lower) for w in ['recommend', 'suggest', 'try', 'consider']):
        response_type = 'recommendation'
    elif any(_has_word(w, text_lower) for w in ['explain', 'definition', 'means', 'is']):
        response_type = 'explanation'
    else:
        response_type = 'general'

    # Heuristic flags for embedded code snippets and hyperlinks.
    contains_code = bool(re.search(r'```|`.*`|\bcode\b|\bfunction\b|\bclass\b', response_text))
    contains_links = bool(re.search(r'http[s]?://|www\.|\.com|\.org', response_text))

    # Persist one analysis row linked to the parent evaluation.
    cursor.execute('''
        INSERT INTO response_analysis (
            evaluation_id, session_id, agent_name, response_text, response_length,
            word_count, sentence_count, readability_score, sentiment_score,
            key_topics, response_type, contains_code, contains_links,
            language_detected, timestamp
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', (
        evaluation_id, session_id, agent_name, response_text, response_length,
        word_count, sentence_count, readability_score, sentiment_score,
        key_topics, response_type, contains_code, contains_links,
        'en', timestamp.isoformat()
    ))
439
 
440
  def safe_column_access(self, df: pd.DataFrame, column: str, default_value=None):
441
  """Safely access DataFrame columns"""
 
455
  # Base queries
456
  eval_query = "SELECT * FROM evaluation_logs"
457
  trace_query = "SELECT * FROM workflow_traces"
458
+ response_analysis_query = "SELECT * FROM response_analysis"
459
 
460
  # Apply filters
461
  conditions = []
 
477
  if conditions:
478
  eval_query += " WHERE " + " AND ".join(conditions)
479
  trace_query += " WHERE " + " AND ".join(conditions)
480
+ response_analysis_query += " WHERE " + " AND ".join(conditions)
481
 
482
  # Load data
483
  evaluations = pd.read_sql_query(eval_query, conn, params=params)
484
  traces = pd.read_sql_query(trace_query, conn, params=params)
485
 
486
+ # Load response analysis data (handle if table doesn't exist yet)
487
+ try:
488
+ response_analysis = pd.read_sql_query(response_analysis_query, conn, params=params)
489
+ except Exception:
490
+ response_analysis = pd.DataFrame()
491
+
492
  conn.close()
493
 
494
  # Convert timestamp columns
 
496
  evaluations['timestamp'] = pd.to_datetime(evaluations['timestamp'])
497
  if not traces.empty:
498
  traces['timestamp'] = pd.to_datetime(traces['timestamp'])
499
+ if not response_analysis.empty:
500
+ response_analysis['timestamp'] = pd.to_datetime(response_analysis['timestamp'])
501
 
502
  return {
503
  'evaluations': evaluations,
504
+ 'traces': traces,
505
+ 'response_analysis': response_analysis
506
  }
507
 
508
  except Exception as e:
509
  st.error(f"Error loading data: {str(e)}")
510
+ return {'evaluations': pd.DataFrame(), 'traces': pd.DataFrame(), 'response_analysis': pd.DataFrame()}
511
 
512
  def create_sidebar_filters(self, data: Dict[str, pd.DataFrame]) -> Dict[str, Any]:
513
  """Create sidebar filters"""
 
787
  annotation_text="95% Target")
788
  st.plotly_chart(fig, use_container_width=True)
789
 
790
def show_response_analysis(self, data: Dict[str, pd.DataFrame]):
    """Render the "Response Analysis & Tracing" tab.

    Shows aggregate response metrics, distribution charts from the
    ``response_analysis`` table (when present), a keyword search over raw
    responses, per-response detail expanders, and CSV export buttons.

    Args:
        data: Dict with an 'evaluations' DataFrame (required) and an optional
            'response_analysis' DataFrame, as produced by the data loader.
    """
    st.header("📝 Response Analysis & Tracing")

    if data['evaluations'].empty:
        st.warning("No evaluation data available")
        return

    df_eval = data['evaluations']
    # response_analysis may be absent (older DB files) — fall back to empty.
    df_analysis = data.get('response_analysis', pd.DataFrame())

    # Response overview metrics
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        avg_response_length = df_eval['response'].str.len().mean() if 'response' in df_eval.columns else 0
        st.metric("Avg Response Length", f"{avg_response_length:.0f} chars")

    with col2:
        if not df_analysis.empty:
            avg_word_count = df_analysis['word_count'].mean()
            st.metric("Avg Word Count", f"{avg_word_count:.0f} words")
        else:
            st.metric("Avg Word Count", "N/A")

    with col3:
        if not df_analysis.empty:
            avg_readability = df_analysis['readability_score'].mean()
            st.metric("Avg Readability", f"{avg_readability:.1f}/10")
        else:
            st.metric("Avg Readability", "N/A")

    with col4:
        if not df_analysis.empty:
            avg_sentiment = df_analysis['sentiment_score'].mean()
            st.metric("Avg Sentiment", f"{avg_sentiment:.1f}/10")
        else:
            st.metric("Avg Sentiment", "N/A")

    # Response analysis charts (only when analysis rows exist)
    if not df_analysis.empty:
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("📊 Response Length Distribution")
            fig = px.histogram(
                df_analysis,
                x='response_length',
                nbins=20,
                title="Response Length Distribution",
                labels={'response_length': 'Response Length (characters)', 'count': 'Frequency'}
            )
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            st.subheader("📈 Readability vs Sentiment")
            fig = px.scatter(
                df_analysis,
                x='readability_score',
                y='sentiment_score',
                color='agent_name',
                title="Readability vs Sentiment by Agent",
                labels={'readability_score': 'Readability Score', 'sentiment_score': 'Sentiment Score'}
            )
            st.plotly_chart(fig, use_container_width=True)

        # Response type analysis
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("🏷️ Response Types")
            response_types = df_analysis['response_type'].value_counts()
            fig = px.pie(
                values=response_types.values,
                names=response_types.index,
                title="Distribution of Response Types"
            )
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            st.subheader("🔍 Key Topics")
            # Flatten comma-separated topic strings into a single list.
            all_topics = []
            for topics in df_analysis['key_topics'].dropna():
                all_topics.extend(topics.split(','))

            if all_topics:
                topic_counts = pd.Series(all_topics).value_counts().head(10)
                fig = px.bar(
                    x=topic_counts.values,
                    y=topic_counts.index,
                    orientation='h',
                    title="Top 10 Key Topics",
                    labels={'x': 'Frequency', 'y': 'Topics'}
                )
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.info("No topic data available")

    # Response tracing section
    st.subheader("🔍 Response Tracing")

    # Search functionality
    search_term = st.text_input("🔍 Search in responses:", placeholder="Enter keywords to search...")

    # Guard the 'response' column here too — the length metric above already
    # tolerates its absence, and an unguarded lookup raised KeyError.
    if search_term and 'response' in df_eval.columns:
        mask = df_eval['response'].str.contains(search_term, case=False, na=False)
        filtered_responses = df_eval[mask]
    else:
        filtered_responses = df_eval.head(10)  # Show first 10 by default

    # Display responses with details
    if not filtered_responses.empty:
        st.write(f"**Found {len(filtered_responses)} responses**")

        for idx, row in filtered_responses.iterrows():
            with st.expander(f"🤖 {row['agent_name']} - Session: {row['session_id'][:8]}... - Score: {row['overall_score']:.1f}"):
                col1, col2 = st.columns([2, 1])

                with col1:
                    st.write("**Query:**")
                    st.write(row['query'])

                    st.write("**Response:**")
                    st.write(row['response'])

                with col2:
                    st.write("**Evaluation Scores:**")
                    st.write(f"Overall: {row['overall_score']:.1f}/10")
                    if 'relevance_score' in row:
                        st.write(f"Relevance: {row['relevance_score']:.1f}/10")
                    if 'accuracy_score' in row:
                        st.write(f"Accuracy: {row['accuracy_score']:.1f}/10")
                    if 'completeness_score' in row:
                        st.write(f"Completeness: {row['completeness_score']:.1f}/10")
                    if 'coherence_score' in row:
                        st.write(f"Coherence: {row['coherence_score']:.1f}/10")

                    st.write("**Metadata:**")
                    st.write(f"Timestamp: {row['timestamp']}")
                    # Guard optional columns the same way as the scores above.
                    if 'execution_time_ms' in row:
                        st.write(f"Response Time: {row['execution_time_ms']:.0f}ms")
                    if 'guardrails_passed' in row:
                        st.write(f"Safety: {'✅ Passed' if row['guardrails_passed'] else '❌ Failed'}")

                    # Show response analysis if available
                    if not df_analysis.empty and 'id' in row:
                        analysis_row = df_analysis[df_analysis['evaluation_id'] == row['id']]
                        if not analysis_row.empty:
                            analysis = analysis_row.iloc[0]
                            st.write("**Response Analysis:**")
                            st.write(f"Length: {analysis['response_length']} chars")
                            st.write(f"Words: {analysis['word_count']}")
                            st.write(f"Readability: {analysis['readability_score']:.1f}/10")
                            st.write(f"Sentiment: {analysis['sentiment_score']:.1f}/10")
                            st.write(f"Type: {analysis['response_type']}")
                            st.write(f"Topics: {analysis['key_topics']}")
    else:
        st.info("No responses found matching your search criteria.")

    # Export response data
    st.subheader("📤 Export Response Data")
    col1, col2 = st.columns(2)

    with col1:
        if st.button("📊 Export Evaluation Data"):
            csv = df_eval.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name="evaluation_responses.csv",
                mime="text/csv"
            )

    with col2:
        if not df_analysis.empty and st.button("📈 Export Analysis Data"):
            csv = df_analysis.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name="response_analysis.csv",
                mime="text/csv"
            )
972
  def run(self):
973
  """Run the dashboard"""
974
  st.title("πŸ€– Multi-Agent System Dashboard - Demo")
 
1006
  filtered_data['evaluations'] = df
1007
 
1008
  # Create tabs
1009
+ tab1, tab2, tab3, tab4 = st.tabs([
1010
  "πŸ“ˆ Executive Summary",
1011
  "πŸ€– Agent Performance",
1012
+ "πŸ›‘οΈ Safety Analysis",
1013
+ "πŸ“ Response Analysis"
1014
  ])
1015
 
1016
  with tab1:
 
1022
  with tab3:
1023
  self.show_safety_analysis(filtered_data)
1024
 
1025
+ with tab4:
1026
+ self.show_response_analysis(filtered_data)
1027
+
1028
  # Footer
1029
  st.markdown("---")
1030
  st.markdown("πŸš€ **Multi-Agent System Dashboard** | Built with Streamlit & Plotly | Demo hosted on Hugging Face Spaces")