# ProjectEcho / conversation_analytics.py
"""
Conversation Analytics - Multi-session analysis and insights
"""
import json
import sys
import os
from typing import List, Dict, Optional
from collections import Counter
from datetime import datetime
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(__file__))
from conversation_session import ConversationSession
from llm_backend import LLMBackend
class ConversationAnalytics:
    """
    Analyze multiple conversation sessions to identify patterns,
    themes, and insights across interviews.
    """

    def __init__(self, llm_backend: Optional[LLMBackend] = None):
        """
        Args:
            llm_backend: Optional LLM backend; required only for the
                AI-powered insight methods.
        """
        self.llm = llm_backend
        # Sessions under analysis; populated via load_sessions().
        self.sessions: List[ConversationSession] = []

    def load_sessions(self, session_data_list: List[Dict]) -> int:
        """
        Load multiple sessions from dictionaries, replacing any
        previously loaded sessions.

        Malformed entries are reported and skipped (best-effort load).

        Args:
            session_data_list: List of session dictionaries

        Returns:
            Number of sessions loaded
        """
        self.sessions = []
        for session_data in session_data_list:
            try:
                self.sessions.append(ConversationSession.from_dict(session_data))
            except Exception as e:
                # Best-effort: report and skip malformed session records.
                print(f"Error loading session: {e}")
        return len(self.sessions)

    def get_aggregate_stats(self) -> Dict:
        """Get aggregate statistics across all sessions.

        Returns:
            Dict of totals/averages and a status breakdown; empty dict
            when no sessions are loaded.
        """
        if not self.sessions:
            return {}

        total_turns = sum(s.get_turn_count() for s in self.sessions)
        total_user_turns = sum(
            sum(1 for t in s.conversation_history if t.role == "user")
            for s in self.sessions
        )
        total_ai_turns = sum(
            sum(1 for t in s.conversation_history if t.role == "ai")
            for s in self.sessions
        )

        # Average user-response length, in characters, across every session.
        all_user_responses = [
            len(t.content)
            for session in self.sessions
            for t in session.conversation_history
            if t.role == "user"
        ]
        avg_response_length = (
            sum(all_user_responses) / len(all_user_responses)
            if all_user_responses else 0
        )

        # Session durations in minutes.
        # NOTE(review): relies on ConversationSession's private helper;
        # consider exposing a public duration accessor on the session class.
        all_durations = [s._calculate_duration_minutes() for s in self.sessions]
        avg_duration = sum(all_durations) / len(all_durations) if all_durations else 0

        status_counts = Counter(s.status for s in self.sessions)

        return {
            "total_sessions": len(self.sessions),
            "total_turns": total_turns,
            "total_user_turns": total_user_turns,
            "total_ai_turns": total_ai_turns,
            "avg_turns_per_session": total_turns / len(self.sessions),
            "avg_response_length": avg_response_length,
            "avg_duration_minutes": avg_duration,
            "total_duration_minutes": sum(all_durations),
            "status_breakdown": dict(status_counts),
            "completed_sessions": status_counts.get("completed", 0),
            "active_sessions": status_counts.get("active", 0),
            "abandoned_sessions": status_counts.get("abandoned", 0)
        }

    def extract_all_responses(self) -> List[str]:
        """Extract all user responses from all sessions, in session order."""
        return [
            turn.content
            for session in self.sessions
            for turn in session.conversation_history
            if turn.role == "user"
        ]

    def identify_common_keywords(self, top_n: int = 20) -> List[tuple]:
        """
        Identify most common keywords across all user responses.

        Args:
            top_n: Number of top keywords to return

        Returns:
            List of (keyword, count) tuples, most frequent first
        """
        # Simple keyword extraction (filter common words).
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                      'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
                      'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
                      'should', 'may', 'might', 'can', 'it', 'this', 'that', 'these', 'those',
                      'i', 'you', 'he', 'she', 'we', 'they', 'my', 'your', 'his', 'her', 'our',
                      'their', 'me', 'him', 'us', 'them'}

        all_words = []
        for response in self.extract_all_responses():
            for token in response.lower().split():
                # Strip surrounding punctuation BEFORE filtering so that
                # punctuated tokens like "this," are recognized as stop
                # words and length is measured on the actual word.
                word = token.strip('.,!?;:"()[]{}')
                if len(word) > 3 and word not in stop_words:
                    all_words.append(word)

        return Counter(all_words).most_common(top_n)

    def generate_cross_session_insights(self) -> str:
        """
        Generate AI-powered insights across all sessions.

        Returns:
            Markdown formatted insights report, or an "❌ ..." error string
            when the LLM backend is missing, data is insufficient, or
            generation fails.
        """
        if not self.llm:
            return "❌ LLM backend required for cross-session insights"
        if not self.sessions:
            return "❌ No sessions to analyze"

        all_responses = self.extract_all_responses()
        if len(all_responses) < 10:
            return "❌ Need at least 10 responses across sessions for meaningful analysis"

        # Sample responses to stay within the model's token limits.
        import random
        sample_size = min(50, len(all_responses))
        if len(all_responses) > sample_size:
            sampled_responses = random.sample(all_responses, sample_size)
        else:
            sampled_responses = all_responses

        responses_text = "\n\n".join(
            f"Response {i+1}: {r}" for i, r in enumerate(sampled_responses)
        )

        system_prompt = """You are analyzing multiple qualitative research interview sessions.
Identify patterns, themes, and insights across all the responses provided. Focus on:
1. **Common Themes**: What topics come up repeatedly?
2. **Sentiment Patterns**: Overall sentiment and emotional tone
3. **Key Insights**: Important discoveries or patterns
4. **Notable Quotes**: Particularly insightful or representative responses
5. **Recommendations**: What actions should researchers take based on these findings?
Provide a comprehensive analysis in a professional report format."""

        user_prompt = f"""Analyze these {len(sampled_responses)} interview responses from {len(self.sessions)} different sessions:
{responses_text}
Generate a comprehensive cross-session analysis report."""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
        try:
            insights = self.llm.generate(messages, max_tokens=1000, temperature=0.5)
            return insights.strip()
        except Exception as e:
            return f"❌ Error generating insights: {str(e)}"

    def generate_comprehensive_report(self) -> str:
        """
        Generate a comprehensive markdown report of multi-session analysis.

        Returns:
            Markdown formatted report
        """
        if not self.sessions:
            return "# Multi-Session Analysis Report\n\n❌ No sessions loaded for analysis."

        stats = self.get_aggregate_stats()
        keywords = self.identify_common_keywords(15)

        report = f"""# Multi-Session Conversation Analysis Report
**Generated:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
---
## πŸ“Š Aggregate Statistics
**Session Overview:**
- Total Sessions Analyzed: **{stats['total_sessions']}**
- Completed Sessions: **{stats['completed_sessions']}**
- Active Sessions: **{stats['active_sessions']}**
- Abandoned Sessions: **{stats['abandoned_sessions']}**
**Conversation Metrics:**
- Total Conversation Turns: **{stats['total_turns']}**
- User Responses: **{stats['total_user_turns']}**
- AI Questions: **{stats['total_ai_turns']}**
- Average Turns per Session: **{stats['avg_turns_per_session']:.1f}**
**Quality Indicators:**
- Average Response Length: **{stats['avg_response_length']:.0f} characters**
- Average Session Duration: **{stats['avg_duration_minutes']:.1f} minutes**
- Total Interview Time: **{stats['total_duration_minutes']:.1f} minutes** ({stats['total_duration_minutes']/60:.1f} hours)
---
## πŸ”‘ Common Keywords & Topics
Top keywords mentioned across all sessions:
"""
        for i, (keyword, count) in enumerate(keywords, 1):
            report += f"{i}. **{keyword}** - mentioned {count} times\n"

        report += "\n---\n\n## πŸ’‘ Cross-Session Insights\n\n"
        if self.llm:
            report += "*Generating AI-powered insights...*\n\n"
            report += self.generate_cross_session_insights()
        else:
            report += "*AI insights unavailable (LLM backend not configured)*\n\n"
            report += "**Manual Analysis Recommended:**\n"
            report += "- Review individual session transcripts\n"
            report += "- Look for patterns in the common keywords above\n"
            report += "- Compare responses across different respondent demographics\n"

        report += "\n\n---\n\n## πŸ“‹ Session Details\n\n"
        for i, session in enumerate(self.sessions, 1):
            # Use a distinct name for the per-session summary: rebinding
            # `stats` here would clobber the aggregate dict whose keys
            # (total_sessions, completed_sessions, ...) the recommendations
            # section below still needs.
            session_stats = session.get_summary_stats()
            report += f"""### Session {i}: {session.flow_name}
- **Session ID:** `{session.id}`
- **Status:** {session.status}
- **Duration:** {session_stats['duration_minutes']:.1f} minutes
- **Turns:** {session_stats['total_turns']} ({session_stats['user_turns']} user, {session_stats['ai_turns']} AI)
- **Avg Response Length:** {session_stats['avg_user_response_length']:.0f} characters
"""

        report += "\n---\n\n## 🎯 Research Recommendations\n\n"
        report += f"""Based on analysis of {stats['total_sessions']} sessions:
1. **Data Quality:** {"βœ… Good" if stats['completed_sessions'] / stats['total_sessions'] > 0.8 else "⚠️ Review incomplete sessions"}
2. **Sample Size:** {"βœ… Sufficient" if stats['total_sessions'] >= 10 else "⚠️ Consider conducting more interviews"}
3. **Engagement:** {"βœ… High" if stats['avg_response_length'] > 100 else "⚠️ Consider probing strategies"}
4. **Duration:** {"βœ… Appropriate" if 10 <= stats['avg_duration_minutes'] <= 30 else "⚠️ Review interview length"}
**Next Steps:**
- Export this report for team review
- Identify 2-3 key themes for deep-dive analysis
- Plan follow-up questions based on insights
- Consider additional interviews to explore emerging themes
---
*This report was generated by Project Echo Multi-Session Analytics*
"""
        return report

    def export_aggregated_data(self) -> Dict:
        """
        Export aggregated data in JSON format for further analysis.

        Returns:
            Dictionary with all aggregated data (timestamp, aggregate
            statistics, top-30 keywords, and per-session summaries)
        """
        return {
            "generated_at": datetime.now().isoformat(),
            "statistics": self.get_aggregate_stats(),
            "keywords": self.identify_common_keywords(30),
            "sessions": [
                {
                    "id": s.id,
                    "flow_name": s.flow_name,
                    "status": s.status,
                    "started_at": s.started_at,
                    "ended_at": s.ended_at,
                    "turn_count": s.get_turn_count(),
                    "summary_stats": s.get_summary_stats()
                }
                for s in self.sessions
            ]
        }