jmisak committed on
Commit
8056e83
·
verified ·
1 Parent(s): db39ccf

Upload 5 files

Browse files
conversation_analytics.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Conversation Analytics - Multi-session analysis and insights
3
+ """
4
+ import json
5
+ import sys
6
+ import os
7
+ from typing import List, Dict, Optional
8
+ from collections import Counter
9
+ from datetime import datetime
10
+
11
+ # Add parent directory to path for imports
12
+ sys.path.insert(0, os.path.dirname(__file__))
13
+
14
+ from conversation_session import ConversationSession
15
+ from llm_backend import LLMBackend
16
+
17
+
18
class ConversationAnalytics:
    """
    Analyze multiple conversation sessions to identify patterns,
    themes, and insights across interviews.

    An LLM backend is optional: statistical summaries and keyword
    extraction always work; AI-generated cross-session insights are
    produced only when a backend is supplied.
    """

    def __init__(self, llm_backend: "Optional[LLMBackend]" = None):
        # Forward-reference annotations so the class can be defined even if
        # the backend type is imported lazily elsewhere.
        self.llm = llm_backend
        self.sessions: "List[ConversationSession]" = []

    def load_sessions(self, session_data_list: List[Dict]) -> int:
        """
        Load multiple sessions from dictionaries.

        Sessions that fail to deserialize are skipped (an error is
        printed) rather than aborting the whole load.

        Args:
            session_data_list: List of session dictionaries

        Returns:
            Number of sessions successfully loaded
        """
        self.sessions = []
        for session_data in session_data_list:
            try:
                session = ConversationSession.from_dict(session_data)
                self.sessions.append(session)
            except Exception as e:
                # Best-effort load: a single corrupt session must not
                # prevent analysis of the rest.
                print(f"Error loading session: {e}")
                continue

        return len(self.sessions)

    def get_aggregate_stats(self) -> Dict:
        """Get aggregate statistics across all sessions.

        Returns:
            Dictionary of totals/averages, or an empty dict when no
            sessions are loaded.
        """
        if not self.sessions:
            return {}

        total_turns = sum(s.get_turn_count() for s in self.sessions)
        total_user_turns = sum(
            sum(1 for t in s.conversation_history if t.role == "user")
            for s in self.sessions
        )
        total_ai_turns = sum(
            sum(1 for t in s.conversation_history if t.role == "ai")
            for s in self.sessions
        )

        # Response lengths (in characters) for user turns only.
        all_user_responses = [
            len(t.content)
            for session in self.sessions
            for t in session.conversation_history
            if t.role == "user"
        ]
        avg_response_length = (
            sum(all_user_responses) / len(all_user_responses)
            if all_user_responses else 0
        )

        # Session durations in minutes.
        all_durations = [s._calculate_duration_minutes() for s in self.sessions]
        avg_duration = sum(all_durations) / len(all_durations) if all_durations else 0

        # Breakdown by session lifecycle status.
        status_counts = Counter(s.status for s in self.sessions)

        return {
            "total_sessions": len(self.sessions),
            "total_turns": total_turns,
            "total_user_turns": total_user_turns,
            "total_ai_turns": total_ai_turns,
            "avg_turns_per_session": total_turns / len(self.sessions),
            "avg_response_length": avg_response_length,
            "avg_duration_minutes": avg_duration,
            "total_duration_minutes": sum(all_durations),
            "status_breakdown": dict(status_counts),
            "completed_sessions": status_counts.get("completed", 0),
            "active_sessions": status_counts.get("active", 0),
            "abandoned_sessions": status_counts.get("abandoned", 0)
        }

    def extract_all_responses(self) -> List[str]:
        """Extract all user responses from all sessions, in session order."""
        return [
            turn.content
            for session in self.sessions
            for turn in session.conversation_history
            if turn.role == "user"
        ]

    def identify_common_keywords(self, top_n: int = 20) -> List[tuple]:
        """
        Identify most common keywords across all user responses.

        Tokens are lowercased, stripped of surrounding punctuation, and
        filtered against a small stop-word list; only tokens longer than
        three characters are counted.

        Args:
            top_n: Number of top keywords to return

        Returns:
            List of (keyword, count) tuples, most frequent first
        """
        responses = self.extract_all_responses()

        # Simple keyword extraction (filter common words).
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                      'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
                      'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
                      'should', 'may', 'might', 'can', 'it', 'this', 'that', 'these', 'those',
                      'i', 'you', 'he', 'she', 'we', 'they', 'my', 'your', 'his', 'her', 'our',
                      'their', 'me', 'him', 'us', 'them'}

        all_words = []
        for response in responses:
            for word in response.lower().split():
                # Strip punctuation BEFORE filtering: otherwise a token
                # like "the." slips past the stop-word check and is then
                # counted as the stop word "the".
                cleaned = word.strip('.,!?;:"()[]{}')
                if len(cleaned) > 3 and cleaned not in stop_words:
                    all_words.append(cleaned)

        word_counts = Counter(all_words)
        return word_counts.most_common(top_n)

    def generate_cross_session_insights(self) -> str:
        """
        Generate AI-powered insights across all sessions.

        Requires a configured LLM backend and at least 10 user responses;
        responses are randomly sampled (max 50) to stay within token limits.

        Returns:
            Markdown formatted insights report, or an "❌ ..." error string
        """
        if not self.llm:
            return "❌ LLM backend required for cross-session insights"

        if not self.sessions:
            return "❌ No sessions to analyze"

        # Collect all user responses.
        all_responses = self.extract_all_responses()

        if len(all_responses) < 10:
            return "❌ Need at least 10 responses across sessions for meaningful analysis"

        # Sample responses (to avoid token limits).
        sample_size = min(50, len(all_responses))
        import random
        sampled_responses = (
            random.sample(all_responses, sample_size)
            if len(all_responses) > sample_size else all_responses
        )

        responses_text = "\n\n".join(
            f"Response {i+1}: {r}" for i, r in enumerate(sampled_responses)
        )

        system_prompt = """You are analyzing multiple qualitative research interview sessions.

Identify patterns, themes, and insights across all the responses provided. Focus on:
1. **Common Themes**: What topics come up repeatedly?
2. **Sentiment Patterns**: Overall sentiment and emotional tone
3. **Key Insights**: Important discoveries or patterns
4. **Notable Quotes**: Particularly insightful or representative responses
5. **Recommendations**: What actions should researchers take based on these findings?

Provide a comprehensive analysis in a professional report format."""

        user_prompt = f"""Analyze these {len(sampled_responses)} interview responses from {len(self.sessions)} different sessions:

{responses_text}

Generate a comprehensive cross-session analysis report."""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        try:
            insights = self.llm.generate(messages, max_tokens=1000, temperature=0.5)
            return insights.strip()
        except Exception as e:
            # Surface backend failures as a readable report line rather
            # than crashing report generation.
            return f"❌ Error generating insights: {str(e)}"

    def generate_comprehensive_report(self) -> str:
        """
        Generate a comprehensive markdown report of multi-session analysis.

        Returns:
            Markdown formatted report
        """
        if not self.sessions:
            return "# Multi-Session Analysis Report\n\n❌ No sessions loaded for analysis."

        stats = self.get_aggregate_stats()
        keywords = self.identify_common_keywords(15)

        report = f"""# Multi-Session Conversation Analysis Report

**Generated:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}

---

## 📊 Aggregate Statistics

**Session Overview:**
- Total Sessions Analyzed: **{stats['total_sessions']}**
- Completed Sessions: **{stats['completed_sessions']}**
- Active Sessions: **{stats['active_sessions']}**
- Abandoned Sessions: **{stats['abandoned_sessions']}**

**Conversation Metrics:**
- Total Conversation Turns: **{stats['total_turns']}**
- User Responses: **{stats['total_user_turns']}**
- AI Questions: **{stats['total_ai_turns']}**
- Average Turns per Session: **{stats['avg_turns_per_session']:.1f}**

**Quality Indicators:**
- Average Response Length: **{stats['avg_response_length']:.0f} characters**
- Average Session Duration: **{stats['avg_duration_minutes']:.1f} minutes**
- Total Interview Time: **{stats['total_duration_minutes']:.1f} minutes** ({stats['total_duration_minutes']/60:.1f} hours)

---

## 🔑 Common Keywords & Topics

Top keywords mentioned across all sessions:

"""

        for i, (keyword, count) in enumerate(keywords, 1):
            report += f"{i}. **{keyword}** - mentioned {count} times\n"

        report += "\n---\n\n## 💡 Cross-Session Insights\n\n"

        if self.llm:
            report += "*Generating AI-powered insights...*\n\n"
            insights = self.generate_cross_session_insights()
            report += insights
        else:
            report += "*AI insights unavailable (LLM backend not configured)*\n\n"
            report += "**Manual Analysis Recommended:**\n"
            report += "- Review individual session transcripts\n"
            report += "- Look for patterns in the common keywords above\n"
            report += "- Compare responses across different respondent demographics\n"

        report += "\n\n---\n\n## 📋 Session Details\n\n"

        for i, session in enumerate(self.sessions, 1):
            # NOTE: use a distinct name here — the original code rebound
            # `stats` inside this loop, which made the recommendations
            # section below raise KeyError on the per-session dict.
            session_stats = session.get_summary_stats()
            report += f"""### Session {i}: {session.flow_name}
- **Session ID:** `{session.id}`
- **Status:** {session.status}
- **Duration:** {session_stats['duration_minutes']:.1f} minutes
- **Turns:** {session_stats['total_turns']} ({session_stats['user_turns']} user, {session_stats['ai_turns']} AI)
- **Avg Response Length:** {session_stats['avg_user_response_length']:.0f} characters

"""

        report += "\n---\n\n## 🎯 Research Recommendations\n\n"
        report += f"""Based on analysis of {stats['total_sessions']} sessions:

1. **Data Quality:** {"✅ Good" if stats['completed_sessions'] / stats['total_sessions'] > 0.8 else "⚠️ Review incomplete sessions"}
2. **Sample Size:** {"✅ Sufficient" if stats['total_sessions'] >= 10 else "⚠️ Consider conducting more interviews"}
3. **Engagement:** {"✅ High" if stats['avg_response_length'] > 100 else "⚠️ Consider probing strategies"}
4. **Duration:** {"✅ Appropriate" if 10 <= stats['avg_duration_minutes'] <= 30 else "⚠️ Review interview length"}

**Next Steps:**
- Export this report for team review
- Identify 2-3 key themes for deep-dive analysis
- Plan follow-up questions based on insights
- Consider additional interviews to explore emerging themes

---

*This report was generated by Project Echo Multi-Session Analytics*
"""

        return report

    def export_aggregated_data(self) -> Dict:
        """
        Export aggregated data in JSON format for further analysis.

        Returns:
            Dictionary with all aggregated data
        """
        return {
            "generated_at": datetime.now().isoformat(),
            "statistics": self.get_aggregate_stats(),
            "keywords": self.identify_common_keywords(30),
            "sessions": [
                {
                    "id": s.id,
                    "flow_name": s.flow_name,
                    "status": s.status,
                    "started_at": s.started_at,
                    "ended_at": s.ended_at,
                    "turn_count": s.get_turn_count(),
                    "summary_stats": s.get_summary_stats()
                }
                for s in self.sessions
            ]
        }
conversation_flow.py CHANGED
@@ -5,6 +5,11 @@ import json
5
  import uuid
6
  from typing import Dict, List, Optional
7
  from datetime import datetime
 
 
 
 
 
8
 
9
 
10
  class ConversationNode:
 
5
  import uuid
6
  from typing import Dict, List, Optional
7
  from datetime import datetime
8
+ import sys
9
+ import os
10
+
11
+ # Add parent directory to path for imports
12
+ sys.path.insert(0, os.path.dirname(__file__))
13
 
14
 
15
  class ConversationNode:
conversation_moderator.py CHANGED
@@ -1,7 +1,13 @@
1
  """
2
  Conversation Moderator - AI-powered interview moderator
3
  """
 
 
4
  from typing import Dict, List, Optional, Tuple
 
 
 
 
5
  from llm_backend import LLMBackend
6
  from conversation_flow import ConversationFlow, ConversationNode
7
  from conversation_session import ConversationSession
 
1
  """
2
  Conversation Moderator - AI-powered interview moderator
3
  """
4
+ import sys
5
+ import os
6
  from typing import Dict, List, Optional, Tuple
7
+
8
+ # Add parent directory to path for imports
9
+ sys.path.insert(0, os.path.dirname(__file__))
10
+
11
  from llm_backend import LLMBackend
12
  from conversation_flow import ConversationFlow, ConversationNode
13
  from conversation_session import ConversationSession
data_analyzer.py CHANGED
@@ -2,8 +2,14 @@
2
  Data Analysis Module - AI-assisted analysis of survey responses
3
  """
4
  import json
 
 
5
  from typing import Dict, List, Optional
6
  from collections import Counter
 
 
 
 
7
  from llm_backend import LLMBackend
8
 
9
 
 
2
  Data Analysis Module - AI-assisted analysis of survey responses
3
  """
4
  import json
5
+ import sys
6
+ import os
7
  from typing import Dict, List, Optional
8
  from collections import Counter
9
+
10
+ # Add parent directory to path for imports
11
+ sys.path.insert(0, os.path.dirname(__file__))
12
+
13
  from llm_backend import LLMBackend
14
 
15
 
survey_generator.py CHANGED
@@ -2,7 +2,13 @@
2
  Survey Generation Module - Generate AI-powered surveys from outlines
3
  """
4
  import json
 
 
5
  from typing import List, Dict, Optional
 
 
 
 
6
  from llm_backend import LLMBackend
7
 
8
 
 
2
  Survey Generation Module - Generate AI-powered surveys from outlines
3
  """
4
  import json
5
+ import sys
6
+ import os
7
  from typing import List, Dict, Optional
8
+
9
+ # Add parent directory to path for imports
10
+ sys.path.insert(0, os.path.dirname(__file__))
11
+
12
  from llm_backend import LLMBackend
13
 
14