Spaces:
Sleeping
Sleeping
| """ | |
| Conversation Analytics - Multi-session analysis and insights | |
| """ | |
| import json | |
| import sys | |
| import os | |
| from typing import List, Dict, Optional | |
| from collections import Counter | |
| from datetime import datetime | |
| # Add parent directory to path for imports | |
| sys.path.insert(0, os.path.dirname(__file__)) | |
| from conversation_session import ConversationSession | |
| from llm_backend import LLMBackend | |
class ConversationAnalytics:
    """
    Analyze multiple conversation sessions to identify patterns,
    themes, and insights across interviews.
    """

    def __init__(self, llm_backend: Optional[LLMBackend] = None):
        # Optional LLM used only for AI-generated cross-session insights;
        # every other analytic works without it.
        self.llm = llm_backend
        # Sessions under analysis; populated via load_sessions().
        self.sessions: List[ConversationSession] = []
| def load_sessions(self, session_data_list: List[Dict]) -> int: | |
| """ | |
| Load multiple sessions from dictionaries. | |
| Args: | |
| session_data_list: List of session dictionaries | |
| Returns: | |
| Number of sessions loaded | |
| """ | |
| self.sessions = [] | |
| for session_data in session_data_list: | |
| try: | |
| session = ConversationSession.from_dict(session_data) | |
| self.sessions.append(session) | |
| except Exception as e: | |
| print(f"Error loading session: {e}") | |
| continue | |
| return len(self.sessions) | |
| def get_aggregate_stats(self) -> Dict: | |
| """Get aggregate statistics across all sessions""" | |
| if not self.sessions: | |
| return {} | |
| total_turns = sum(s.get_turn_count() for s in self.sessions) | |
| total_user_turns = sum(len([t for t in s.conversation_history if t.role == "user"]) | |
| for s in self.sessions) | |
| total_ai_turns = sum(len([t for t in s.conversation_history if t.role == "ai"]) | |
| for s in self.sessions) | |
| # Calculate response lengths | |
| all_user_responses = [] | |
| for session in self.sessions: | |
| all_user_responses.extend([len(t.content) for t in session.conversation_history | |
| if t.role == "user"]) | |
| avg_response_length = sum(all_user_responses) / len(all_user_responses) if all_user_responses else 0 | |
| # Calculate durations | |
| all_durations = [s._calculate_duration_minutes() for s in self.sessions] | |
| avg_duration = sum(all_durations) / len(all_durations) if all_durations else 0 | |
| # Status breakdown | |
| status_counts = Counter(s.status for s in self.sessions) | |
| return { | |
| "total_sessions": len(self.sessions), | |
| "total_turns": total_turns, | |
| "total_user_turns": total_user_turns, | |
| "total_ai_turns": total_ai_turns, | |
| "avg_turns_per_session": total_turns / len(self.sessions), | |
| "avg_response_length": avg_response_length, | |
| "avg_duration_minutes": avg_duration, | |
| "total_duration_minutes": sum(all_durations), | |
| "status_breakdown": dict(status_counts), | |
| "completed_sessions": status_counts.get("completed", 0), | |
| "active_sessions": status_counts.get("active", 0), | |
| "abandoned_sessions": status_counts.get("abandoned", 0) | |
| } | |
| def extract_all_responses(self) -> List[str]: | |
| """Extract all user responses from all sessions""" | |
| responses = [] | |
| for session in self.sessions: | |
| for turn in session.conversation_history: | |
| if turn.role == "user": | |
| responses.append(turn.content) | |
| return responses | |
| def identify_common_keywords(self, top_n: int = 20) -> List[tuple]: | |
| """ | |
| Identify most common keywords across all user responses. | |
| Args: | |
| top_n: Number of top keywords to return | |
| Returns: | |
| List of (keyword, count) tuples | |
| """ | |
| responses = self.extract_all_responses() | |
| # Simple keyword extraction (filter common words) | |
| stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', | |
| 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been', | |
| 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', | |
| 'should', 'may', 'might', 'can', 'it', 'this', 'that', 'these', 'those', | |
| 'i', 'you', 'he', 'she', 'we', 'they', 'my', 'your', 'his', 'her', 'our', | |
| 'their', 'me', 'him', 'her', 'us', 'them'} | |
| all_words = [] | |
| for response in responses: | |
| words = response.lower().split() | |
| # Filter out stop words and short words | |
| filtered_words = [w.strip('.,!?;:"()[]{}') for w in words | |
| if len(w) > 3 and w.lower() not in stop_words] | |
| all_words.extend(filtered_words) | |
| word_counts = Counter(all_words) | |
| return word_counts.most_common(top_n) | |
| def generate_cross_session_insights(self) -> str: | |
| """ | |
| Generate AI-powered insights across all sessions. | |
| Returns: | |
| Markdown formatted insights report | |
| """ | |
| if not self.llm: | |
| return "β LLM backend required for cross-session insights" | |
| if not self.sessions: | |
| return "β No sessions to analyze" | |
| # Collect all user responses | |
| all_responses = self.extract_all_responses() | |
| if len(all_responses) < 10: | |
| return "β Need at least 10 responses across sessions for meaningful analysis" | |
| # Sample responses (to avoid token limits) | |
| sample_size = min(50, len(all_responses)) | |
| import random | |
| sampled_responses = random.sample(all_responses, sample_size) if len(all_responses) > sample_size else all_responses | |
| responses_text = "\n\n".join([f"Response {i+1}: {r}" for i, r in enumerate(sampled_responses)]) | |
| system_prompt = """You are analyzing multiple qualitative research interview sessions. | |
| Identify patterns, themes, and insights across all the responses provided. Focus on: | |
| 1. **Common Themes**: What topics come up repeatedly? | |
| 2. **Sentiment Patterns**: Overall sentiment and emotional tone | |
| 3. **Key Insights**: Important discoveries or patterns | |
| 4. **Notable Quotes**: Particularly insightful or representative responses | |
| 5. **Recommendations**: What actions should researchers take based on these findings? | |
| Provide a comprehensive analysis in a professional report format.""" | |
| user_prompt = f"""Analyze these {len(sampled_responses)} interview responses from {len(self.sessions)} different sessions: | |
| {responses_text} | |
| Generate a comprehensive cross-session analysis report.""" | |
| messages = [ | |
| {"role": "system", "content": system_prompt}, | |
| {"role": "user", "content": user_prompt} | |
| ] | |
| try: | |
| insights = self.llm.generate(messages, max_tokens=1000, temperature=0.5) | |
| return insights.strip() | |
| except Exception as e: | |
| return f"β Error generating insights: {str(e)}" | |
| def generate_comprehensive_report(self) -> str: | |
| """ | |
| Generate a comprehensive markdown report of multi-session analysis. | |
| Returns: | |
| Markdown formatted report | |
| """ | |
| if not self.sessions: | |
| return "# Multi-Session Analysis Report\n\nβ No sessions loaded for analysis." | |
| stats = self.get_aggregate_stats() | |
| keywords = self.identify_common_keywords(15) | |
| report = f"""# Multi-Session Conversation Analysis Report | |
| **Generated:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} | |
| --- | |
| ## π Aggregate Statistics | |
| **Session Overview:** | |
| - Total Sessions Analyzed: **{stats['total_sessions']}** | |
| - Completed Sessions: **{stats['completed_sessions']}** | |
| - Active Sessions: **{stats['active_sessions']}** | |
| - Abandoned Sessions: **{stats['abandoned_sessions']}** | |
| **Conversation Metrics:** | |
| - Total Conversation Turns: **{stats['total_turns']}** | |
| - User Responses: **{stats['total_user_turns']}** | |
| - AI Questions: **{stats['total_ai_turns']}** | |
| - Average Turns per Session: **{stats['avg_turns_per_session']:.1f}** | |
| **Quality Indicators:** | |
| - Average Response Length: **{stats['avg_response_length']:.0f} characters** | |
| - Average Session Duration: **{stats['avg_duration_minutes']:.1f} minutes** | |
| - Total Interview Time: **{stats['total_duration_minutes']:.1f} minutes** ({stats['total_duration_minutes']/60:.1f} hours) | |
| --- | |
| ## π Common Keywords & Topics | |
| Top keywords mentioned across all sessions: | |
| """ | |
| for i, (keyword, count) in enumerate(keywords, 1): | |
| report += f"{i}. **{keyword}** - mentioned {count} times\n" | |
| report += "\n---\n\n## π‘ Cross-Session Insights\n\n" | |
| if self.llm: | |
| report += "*Generating AI-powered insights...*\n\n" | |
| insights = self.generate_cross_session_insights() | |
| report += insights | |
| else: | |
| report += "*AI insights unavailable (LLM backend not configured)*\n\n" | |
| report += "**Manual Analysis Recommended:**\n" | |
| report += "- Review individual session transcripts\n" | |
| report += "- Look for patterns in the common keywords above\n" | |
| report += "- Compare responses across different respondent demographics\n" | |
| report += "\n\n---\n\n## π Session Details\n\n" | |
| for i, session in enumerate(self.sessions, 1): | |
| stats = session.get_summary_stats() | |
| report += f"""### Session {i}: {session.flow_name} | |
| - **Session ID:** `{session.id}` | |
| - **Status:** {session.status} | |
| - **Duration:** {stats['duration_minutes']:.1f} minutes | |
| - **Turns:** {stats['total_turns']} ({stats['user_turns']} user, {stats['ai_turns']} AI) | |
| - **Avg Response Length:** {stats['avg_user_response_length']:.0f} characters | |
| """ | |
| report += "\n---\n\n## π― Research Recommendations\n\n" | |
| report += f"""Based on analysis of {stats['total_sessions']} sessions: | |
| 1. **Data Quality:** {"β Good" if stats['completed_sessions'] / stats['total_sessions'] > 0.8 else "β οΈ Review incomplete sessions"} | |
| 2. **Sample Size:** {"β Sufficient" if stats['total_sessions'] >= 10 else "β οΈ Consider conducting more interviews"} | |
| 3. **Engagement:** {"β High" if stats['avg_response_length'] > 100 else "β οΈ Consider probing strategies"} | |
| 4. **Duration:** {"β Appropriate" if 10 <= stats['avg_duration_minutes'] <= 30 else "β οΈ Review interview length"} | |
| **Next Steps:** | |
| - Export this report for team review | |
| - Identify 2-3 key themes for deep-dive analysis | |
| - Plan follow-up questions based on insights | |
| - Consider additional interviews to explore emerging themes | |
| --- | |
| *This report was generated by Project Echo Multi-Session Analytics* | |
| """ | |
| return report | |
| def export_aggregated_data(self) -> Dict: | |
| """ | |
| Export aggregated data in JSON format for further analysis. | |
| Returns: | |
| Dictionary with all aggregated data | |
| """ | |
| return { | |
| "generated_at": datetime.now().isoformat(), | |
| "statistics": self.get_aggregate_stats(), | |
| "keywords": self.identify_common_keywords(30), | |
| "sessions": [ | |
| { | |
| "id": s.id, | |
| "flow_name": s.flow_name, | |
| "status": s.status, | |
| "started_at": s.started_at, | |
| "ended_at": s.ended_at, | |
| "turn_count": s.get_turn_count(), | |
| "summary_stats": s.get_summary_stats() | |
| } | |
| for s in self.sessions | |
| ] | |
| } | |