# ProjectEcho / conversation_analytics.py
"""
Conversation Analytics - Multi-session analysis and insights
"""
import json
import sys
import os
from typing import List, Dict, Optional
from collections import Counter
from datetime import datetime
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(__file__))
from conversation_session import ConversationSession
from llm_backend import LLMBackend
class ConversationAnalytics:
    """
    Analyze multiple conversation sessions to identify patterns,
    themes, and insights across interviews.
    """

    def __init__(self, llm_backend: Optional[LLMBackend] = None):
        """
        Args:
            llm_backend: Optional LLM backend; required only for the
                AI-powered insight methods.
        """
        self.llm = llm_backend
        # Sessions under analysis; populated via load_sessions().
        self.sessions: List[ConversationSession] = []

    def load_sessions(self, session_data_list: List[Dict]) -> int:
        """
        Load multiple sessions from dictionaries, replacing any
        previously loaded sessions.

        Malformed entries are reported and skipped (best-effort load).

        Args:
            session_data_list: List of session dictionaries

        Returns:
            Number of sessions loaded
        """
        self.sessions = []
        for session_data in session_data_list:
            try:
                self.sessions.append(ConversationSession.from_dict(session_data))
            except Exception as e:
                # Best-effort: report and skip malformed session records.
                print(f"Error loading session: {e}")
        return len(self.sessions)

    def get_aggregate_stats(self) -> Dict:
        """Get aggregate statistics across all sessions.

        Returns:
            Dict of totals/averages and a status breakdown; empty dict
            when no sessions are loaded.
        """
        if not self.sessions:
            return {}

        total_turns = sum(s.get_turn_count() for s in self.sessions)
        total_user_turns = sum(
            sum(1 for t in s.conversation_history if t.role == "user")
            for s in self.sessions
        )
        total_ai_turns = sum(
            sum(1 for t in s.conversation_history if t.role == "ai")
            for s in self.sessions
        )

        # Average user-response length, in characters, across every session.
        all_user_responses = [
            len(t.content)
            for session in self.sessions
            for t in session.conversation_history
            if t.role == "user"
        ]
        avg_response_length = (
            sum(all_user_responses) / len(all_user_responses)
            if all_user_responses else 0
        )

        # Session durations in minutes.
        # NOTE(review): relies on ConversationSession's private helper;
        # consider exposing a public duration accessor on the session class.
        all_durations = [s._calculate_duration_minutes() for s in self.sessions]
        avg_duration = sum(all_durations) / len(all_durations) if all_durations else 0

        status_counts = Counter(s.status for s in self.sessions)

        return {
            "total_sessions": len(self.sessions),
            "total_turns": total_turns,
            "total_user_turns": total_user_turns,
            "total_ai_turns": total_ai_turns,
            "avg_turns_per_session": total_turns / len(self.sessions),
            "avg_response_length": avg_response_length,
            "avg_duration_minutes": avg_duration,
            "total_duration_minutes": sum(all_durations),
            "status_breakdown": dict(status_counts),
            "completed_sessions": status_counts.get("completed", 0),
            "active_sessions": status_counts.get("active", 0),
            "abandoned_sessions": status_counts.get("abandoned", 0)
        }

    def extract_all_responses(self) -> List[str]:
        """Extract all user responses from all sessions, in session order."""
        return [
            turn.content
            for session in self.sessions
            for turn in session.conversation_history
            if turn.role == "user"
        ]

    def identify_common_keywords(self, top_n: int = 20) -> List[tuple]:
        """
        Identify most common keywords across all user responses.

        Args:
            top_n: Number of top keywords to return

        Returns:
            List of (keyword, count) tuples, most frequent first
        """
        # Simple keyword extraction (filter common words).
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                      'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
                      'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
                      'should', 'may', 'might', 'can', 'it', 'this', 'that', 'these', 'those',
                      'i', 'you', 'he', 'she', 'we', 'they', 'my', 'your', 'his', 'her', 'our',
                      'their', 'me', 'him', 'us', 'them'}

        all_words = []
        for response in self.extract_all_responses():
            for token in response.lower().split():
                # Strip surrounding punctuation BEFORE filtering so that
                # punctuated tokens like "this," are recognized as stop
                # words and length is measured on the actual word.
                word = token.strip('.,!?;:"()[]{}')
                if len(word) > 3 and word not in stop_words:
                    all_words.append(word)

        return Counter(all_words).most_common(top_n)

    def generate_cross_session_insights(self) -> str:
        """
        Generate AI-powered insights across all sessions.

        Returns:
            Markdown formatted insights report, or an "❌ ..." error string
            when the LLM backend is missing, data is insufficient, or
            generation fails.
        """
        if not self.llm:
            return "❌ LLM backend required for cross-session insights"
        if not self.sessions:
            return "❌ No sessions to analyze"

        all_responses = self.extract_all_responses()
        if len(all_responses) < 10:
            return "❌ Need at least 10 responses across sessions for meaningful analysis"

        # Sample responses to stay within the model's token limits.
        import random
        sample_size = min(50, len(all_responses))
        if len(all_responses) > sample_size:
            sampled_responses = random.sample(all_responses, sample_size)
        else:
            sampled_responses = all_responses

        responses_text = "\n\n".join(
            f"Response {i+1}: {r}" for i, r in enumerate(sampled_responses)
        )

        system_prompt = """You are analyzing multiple qualitative research interview sessions.
Identify patterns, themes, and insights across all the responses provided. Focus on:
1. **Common Themes**: What topics come up repeatedly?
2. **Sentiment Patterns**: Overall sentiment and emotional tone
3. **Key Insights**: Important discoveries or patterns
4. **Notable Quotes**: Particularly insightful or representative responses
5. **Recommendations**: What actions should researchers take based on these findings?
Provide a comprehensive analysis in a professional report format."""

        user_prompt = f"""Analyze these {len(sampled_responses)} interview responses from {len(self.sessions)} different sessions:
{responses_text}
Generate a comprehensive cross-session analysis report."""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
        try:
            insights = self.llm.generate(messages, max_tokens=1000, temperature=0.5)
            return insights.strip()
        except Exception as e:
            return f"❌ Error generating insights: {str(e)}"

    def generate_comprehensive_report(self) -> str:
        """
        Generate a comprehensive markdown report of multi-session analysis.

        Returns:
            Markdown formatted report
        """
        if not self.sessions:
            return "# Multi-Session Analysis Report\n\n❌ No sessions loaded for analysis."

        stats = self.get_aggregate_stats()
        keywords = self.identify_common_keywords(15)

        report = f"""# Multi-Session Conversation Analysis Report
**Generated:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
---
## πŸ“Š Aggregate Statistics
**Session Overview:**
- Total Sessions Analyzed: **{stats['total_sessions']}**
- Completed Sessions: **{stats['completed_sessions']}**
- Active Sessions: **{stats['active_sessions']}**
- Abandoned Sessions: **{stats['abandoned_sessions']}**
**Conversation Metrics:**
- Total Conversation Turns: **{stats['total_turns']}**
- User Responses: **{stats['total_user_turns']}**
- AI Questions: **{stats['total_ai_turns']}**
- Average Turns per Session: **{stats['avg_turns_per_session']:.1f}**
**Quality Indicators:**
- Average Response Length: **{stats['avg_response_length']:.0f} characters**
- Average Session Duration: **{stats['avg_duration_minutes']:.1f} minutes**
- Total Interview Time: **{stats['total_duration_minutes']:.1f} minutes** ({stats['total_duration_minutes']/60:.1f} hours)
---
## πŸ”‘ Common Keywords & Topics
Top keywords mentioned across all sessions:
"""
        for i, (keyword, count) in enumerate(keywords, 1):
            report += f"{i}. **{keyword}** - mentioned {count} times\n"

        report += "\n---\n\n## πŸ’‘ Cross-Session Insights\n\n"
        if self.llm:
            report += "*Generating AI-powered insights...*\n\n"
            report += self.generate_cross_session_insights()
        else:
            report += "*AI insights unavailable (LLM backend not configured)*\n\n"
            report += "**Manual Analysis Recommended:**\n"
            report += "- Review individual session transcripts\n"
            report += "- Look for patterns in the common keywords above\n"
            report += "- Compare responses across different respondent demographics\n"

        report += "\n\n---\n\n## πŸ“‹ Session Details\n\n"
        for i, session in enumerate(self.sessions, 1):
            # Use a distinct name for the per-session summary: rebinding
            # `stats` here would clobber the aggregate dict whose keys
            # (total_sessions, completed_sessions, ...) the recommendations
            # section below still needs.
            session_stats = session.get_summary_stats()
            report += f"""### Session {i}: {session.flow_name}
- **Session ID:** `{session.id}`
- **Status:** {session.status}
- **Duration:** {session_stats['duration_minutes']:.1f} minutes
- **Turns:** {session_stats['total_turns']} ({session_stats['user_turns']} user, {session_stats['ai_turns']} AI)
- **Avg Response Length:** {session_stats['avg_user_response_length']:.0f} characters
"""

        report += "\n---\n\n## 🎯 Research Recommendations\n\n"
        report += f"""Based on analysis of {stats['total_sessions']} sessions:
1. **Data Quality:** {"βœ… Good" if stats['completed_sessions'] / stats['total_sessions'] > 0.8 else "⚠️ Review incomplete sessions"}
2. **Sample Size:** {"βœ… Sufficient" if stats['total_sessions'] >= 10 else "⚠️ Consider conducting more interviews"}
3. **Engagement:** {"βœ… High" if stats['avg_response_length'] > 100 else "⚠️ Consider probing strategies"}
4. **Duration:** {"βœ… Appropriate" if 10 <= stats['avg_duration_minutes'] <= 30 else "⚠️ Review interview length"}
**Next Steps:**
- Export this report for team review
- Identify 2-3 key themes for deep-dive analysis
- Plan follow-up questions based on insights
- Consider additional interviews to explore emerging themes
---
*This report was generated by Project Echo Multi-Session Analytics*
"""
        return report

    def export_aggregated_data(self) -> Dict:
        """
        Export aggregated data in JSON format for further analysis.

        Returns:
            Dictionary with all aggregated data (timestamp, aggregate
            statistics, top-30 keywords, and per-session summaries)
        """
        return {
            "generated_at": datetime.now().isoformat(),
            "statistics": self.get_aggregate_stats(),
            "keywords": self.identify_common_keywords(30),
            "sessions": [
                {
                    "id": s.id,
                    "flow_name": s.flow_name,
                    "status": s.status,
                    "started_at": s.started_at,
                    "ended_at": s.ended_at,
                    "turn_count": s.get_turn_count(),
                    "summary_stats": s.get_summary_stats()
                }
                for s in self.sessions
            ]
        }