""" Data Analysis Module - AI-assisted analysis of survey responses """ import json import sys import os from typing import Dict, List, Optional from collections import Counter # Add parent directory to path for imports sys.path.insert(0, os.path.dirname(__file__)) from llm_backend import LLMBackend class DataAnalyzer: """ Analyzes survey responses to uncover key findings, trends, and patterns. Provides AI-assisted summaries for qualitative research data. """ def __init__(self, llm_backend: LLMBackend): self.llm = llm_backend def analyze_responses(self, responses: List[Dict], questions: List[Dict] = None) -> Dict: """ Comprehensive analysis of survey responses. Args: responses: List of response dictionaries questions: Optional list of questions for context Returns: Analysis results including themes, sentiment, and insights """ if not responses: return {"error": "No responses to analyze"} analysis = { "summary": {}, "themes": [], "sentiment": {}, "key_insights": [], "response_count": len(responses) } # Generate overall summary analysis["summary"] = self._generate_summary(responses, questions) # Extract themes analysis["themes"] = self._extract_themes(responses) # Analyze sentiment analysis["sentiment"] = self._analyze_sentiment(responses) # Generate key insights analysis["key_insights"] = self._generate_insights(responses, questions) # Add quantitative stats if applicable analysis["statistics"] = self._compute_statistics(responses, questions) return analysis def _generate_summary(self, responses: List[Dict], questions: List[Dict] = None) -> Dict: """Generate an executive summary of responses""" # Prepare context response_texts = self._extract_text_responses(responses) sample_size = min(50, len(response_texts)) # Use sample for large datasets sample_responses = response_texts[:sample_size] context = f"Total responses: {len(responses)}\n\n" if questions: context += "Questions asked:\n" for i, q in enumerate(questions[:10], 1): # Limit to first 10 questions context += f"{i}. {q.get('question_text', '')}\n" context += "\n" context += "Sample responses:\n" for i, resp in enumerate(sample_responses, 1): context += f"{i}. {resp[:200]}...\n" # Truncate long responses prompt = f"""Task: Analyze survey responses and generate an executive summary {context} **Your Analysis Should Include:** 1. **Overview:** A clear, concise high-level summary of what the data reveals (2-3 sentences) 2. **Key Patterns:** Main patterns, trends, or recurring themes observed across responses 3. **Notable Findings:** Interesting, surprising, or unexpected discoveries in the data 4. **Response Quality:** Assessment of how thoughtful, engaged, and detailed the responses are **Output Format:** Respond ONLY with valid JSON: {{ "overview": "Clear summary of overall findings", "key_patterns": ["pattern 1", "pattern 2", "pattern 3"], "notable_findings": ["surprising finding 1", "unexpected discovery"], "response_quality": "Assessment of engagement level" }} **Important:** Ensure your response is valid JSON that can be parsed. 

    def _generate_summary(self, responses: List[Dict],
                          questions: Optional[List[Dict]] = None) -> Dict:
        """Generate an executive summary of responses"""
        # Prepare context
        response_texts = self._extract_text_responses(responses)
        sample_size = min(50, len(response_texts))  # Use a sample for large datasets
        sample_responses = response_texts[:sample_size]

        context = f"Total responses: {len(responses)}\n\n"

        if questions:
            context += "Questions asked:\n"
            for i, q in enumerate(questions[:10], 1):  # Limit to first 10 questions
                context += f"{i}. {q.get('question_text', '')}\n"
            context += "\n"

        context += "Sample responses:\n"
        for i, resp in enumerate(sample_responses, 1):
            # Truncate long responses
            snippet = resp[:200] + ("..." if len(resp) > 200 else "")
            context += f"{i}. {snippet}\n"

        prompt = f"""Task: Analyze survey responses and generate an executive summary

{context}

**Your Analysis Should Include:**

1. **Overview:** A clear, concise high-level summary of what the data reveals (2-3 sentences)
2. **Key Patterns:** Main patterns, trends, or recurring themes observed across responses
3. **Notable Findings:** Interesting, surprising, or unexpected discoveries in the data
4. **Response Quality:** Assessment of how thoughtful, engaged, and detailed the responses are

**Output Format:**

Respond ONLY with valid JSON:
{{
    "overview": "Clear summary of overall findings",
    "key_patterns": ["pattern 1", "pattern 2", "pattern 3"],
    "notable_findings": ["surprising finding 1", "unexpected discovery"],
    "response_quality": "Assessment of engagement level"
}}

**Important:** Ensure your response is valid JSON that can be parsed. Do not include any text outside the JSON object."""

        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt},
        ]

        try:
            response = self.llm.generate(messages, max_tokens=1000, temperature=0.5)
            return self._parse_json_response(response)
        except Exception as e:
            return {"error": f"Summary generation failed: {str(e)}"}

    def _extract_themes(self, responses: List[Dict], num_themes: int = 5) -> List[Dict]:
        """Extract main themes from responses using AI"""
        response_texts = self._extract_text_responses(responses)
        if not response_texts:
            return []

        # Sample for large datasets
        sample_size = min(100, len(response_texts))
        sample_responses = response_texts[:sample_size]

        prompt = f"""Task: Extract and analyze themes from survey responses

**Data:** Analyzing {len(sample_responses)} survey responses

Responses:
{self._format_responses_for_prompt(sample_responses)}

**Your Task:** Identify the top {num_themes} distinct themes that emerge from these responses.

**For Each Theme, Provide:**

1. **Theme Name:** A short, memorable, and descriptive label
2. **Description:** Clear explanation of what this theme represents and its significance
3. **Prevalence:** Estimated percentage of responses that mention or relate to this theme
4. **Example Quotes:** 2-3 actual, representative quotes from responses that illustrate this theme

**Output Format:**

Respond ONLY with a valid JSON array:
[
    {{
        "theme_name": "Clear, concise theme label",
        "description": "What this theme means and why it matters",
        "prevalence": "XX%",
        "example_quotes": ["exact quote from responses", "another quote"]
    }}
]

**Important:** Ensure all responses are valid JSON. Do not include text outside the array."""

        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt},
        ]

        try:
            response = self.llm.generate(messages, max_tokens=1500, temperature=0.6)
            themes = self._parse_json_response(response)
            if isinstance(themes, list):
                return themes
            return []
        except Exception as e:
            return [{"error": f"Theme extraction failed: {str(e)}"}]

    def _analyze_sentiment(self, responses: List[Dict]) -> Dict:
        """Analyze overall sentiment of responses"""
        response_texts = self._extract_text_responses(responses)
        if not response_texts:
            return {}

        # Sample for analysis
        sample_size = min(100, len(response_texts))
        sample_responses = response_texts[:sample_size]

        prompt = f"""Task: Analyze sentiment across survey responses

**Data:** Analyzing sentiment in {len(sample_responses)} survey responses

Responses:
{self._format_responses_for_prompt(sample_responses)}

**Your Task:** Conduct a comprehensive sentiment analysis of these responses.

**Analysis Should Include:**

1. **Overall Sentiment:** The dominant sentiment tone (positive, negative, neutral, or mixed)
2. **Sentiment Distribution:** Estimated percentage breakdown across sentiment categories
3. **Emotional Tone:** Key emotions or emotional themes detected in responses
4. **Intensity:** The strength of the sentiments (low, moderate, or high)

**Output Format:**

Respond ONLY with valid JSON:
{{
    "overall_sentiment": "positive|negative|neutral|mixed",
    "distribution": {{
        "positive": "XX%",
        "neutral": "XX%",
        "negative": "XX%"
    }},
    "emotions": ["emotion1", "emotion2", "emotion3"],
    "intensity": "low|moderate|high"
}}

**Important:** Return only valid JSON. Do not include explanatory text."""

        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt},
        ]

        try:
            response = self.llm.generate(messages, max_tokens=500, temperature=0.4)
            return self._parse_json_response(response)
        except Exception as e:
            return {"error": f"Sentiment analysis failed: {str(e)}"}
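
    # Note: the methods above prompt with the *first* N responses
    # (response_texts[:sample_size]). A hypothetical variant, not part of the
    # original code, could draw an unbiased random sample instead:
    #
    #     import random
    #     sample_responses = random.sample(response_texts, sample_size)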

    def _generate_insights(self, responses: List[Dict],
                           questions: Optional[List[Dict]] = None) -> List[str]:
        """Generate actionable insights from the data"""
        response_texts = self._extract_text_responses(responses)
        if not response_texts:
            return []

        sample_size = min(100, len(response_texts))
        sample_responses = response_texts[:sample_size]

        context = f"Analyzing {len(responses)} survey responses.\n\n"
        if questions:
            context += "Research questions:\n"
            for i, q in enumerate(questions[:5], 1):
                context += f"{i}. {q.get('question_text', '')}\n"
            context += "\n"

        prompt = f"""{context}

Sample responses:
{self._format_responses_for_prompt(sample_responses)}

**Task:** Extract key insights from this survey data

**Generate 5-7 actionable insights** that address:
- Understanding the target audience and their needs
- Identifying opportunities for growth or improvement
- Recognizing challenges or pain points
- Understanding patterns, trends, and correlations
- Informing strategic or product decisions

**Insight Quality Criteria:**
- **Specific:** Clear, concrete statements based on actual data patterns
- **Actionable:** Can be used to inform decisions or actions
- **Evidence-based:** Grounded in what respondents actually said
- **Concise:** Clear and to the point (1-2 sentences each)

**Output Format:**

Respond ONLY with a valid JSON array of insight strings:
["Clear, actionable insight from the data", "Another specific insight", ...]

**Important:** Return only the JSON array. Do not include explanatory text."""

        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt},
        ]

        try:
            response = self.llm.generate(messages, max_tokens=1000, temperature=0.6)
            insights = self._parse_json_response(response)
            if isinstance(insights, list):
                return insights
            return []
        except Exception as e:
            return [f"Insight generation failed: {str(e)}"]

    def _compute_statistics(self, responses: List[Dict],
                            questions: Optional[List[Dict]] = None) -> Dict:
        """Compute basic statistics from responses"""
        stats = {
            "total_responses": len(responses),
            "response_lengths": {},
            "completion_rate": "N/A",
        }

        # Calculate average response length
        response_texts = self._extract_text_responses(responses)
        if response_texts:
            lengths = [len(r.split()) for r in response_texts]
            stats["response_lengths"] = {
                "avg_words": sum(lengths) / len(lengths),
                "min_words": min(lengths),
                "max_words": max(lengths),
            }

        # Calculate completion rate if questions are provided. This assumes
        # every non-empty value in a response dict is one answered question.
        if questions:
            total_questions = len(questions)
            completed_questions = 0
            for response in responses:
                if isinstance(response, dict):
                    completed_questions += len([v for v in response.values() if v])

            if total_questions > 0:
                completion_rate = (completed_questions / (total_questions * len(responses))) * 100
                stats["completion_rate"] = f"{completion_rate:.1f}%"

        return stats
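
    # Worked example for the completion-rate formula above (illustrative
    # numbers): 3 questions and 10 responses give 30 possible answers; if
    # 24 non-empty answers are found, the rate is 24 / (3 * 10) * 100 = 80.0%.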

    def generate_report(self, analysis_results: Dict, format: str = "markdown") -> str:
        """
        Generate a formatted report from analysis results.

        Args:
            analysis_results: Results from analyze_responses()
            format: Output format (markdown, text, html)

        Returns:
            Formatted report string
        """
        if format == "markdown":
            return self._generate_markdown_report(analysis_results)
        elif format == "html":
            return self._generate_html_report(analysis_results)
        else:
            return self._generate_text_report(analysis_results)

    def _generate_markdown_report(self, results: Dict) -> str:
        """Generate a markdown-formatted report"""
        report = "# Survey Analysis Report\n\n"

        # Summary section
        if "summary" in results and results["summary"]:
            report += "## Executive Summary\n\n"
            summary = results["summary"]
            if "overview" in summary:
                report += f"{summary['overview']}\n\n"
            if "key_patterns" in summary:
                report += "### Key Patterns\n"
                for pattern in summary["key_patterns"]:
                    report += f"- {pattern}\n"
                report += "\n"

        # Statistics
        if "statistics" in results:
            report += "## Response Statistics\n\n"
            stats = results["statistics"]
            report += f"- Total Responses: {stats.get('total_responses', 'N/A')}\n"
            if "response_lengths" in stats:
                rl = stats["response_lengths"]
                report += f"- Average Response Length: {rl.get('avg_words', 0):.1f} words\n"
            report += f"- Completion Rate: {stats.get('completion_rate', 'N/A')}\n\n"

        # Themes
        if "themes" in results and results["themes"]:
            report += "## Main Themes\n\n"
            for i, theme in enumerate(results["themes"], 1):
                if isinstance(theme, dict) and "theme_name" in theme:
                    report += f"### {i}. {theme['theme_name']}\n"
                    report += f"{theme.get('description', '')}\n\n"
                    report += f"**Prevalence:** {theme.get('prevalence', 'N/A')}\n\n"
                    if "example_quotes" in theme:
                        report += "**Example quotes:**\n"
                        for quote in theme["example_quotes"]:
                            report += f"> {quote}\n"
                        report += "\n"

        # Sentiment
        if "sentiment" in results and results["sentiment"]:
            report += "## Sentiment Analysis\n\n"
            sent = results["sentiment"]
            report += f"**Overall Sentiment:** {sent.get('overall_sentiment', 'N/A')}\n\n"
            if "distribution" in sent:
                report += "**Distribution:**\n"
                for key, value in sent["distribution"].items():
                    report += f"- {key.title()}: {value}\n"
                report += "\n"

        # Key Insights
        if "key_insights" in results and results["key_insights"]:
            report += "## Key Insights\n\n"
            for i, insight in enumerate(results["key_insights"], 1):
                report += f"{i}. {insight}\n"
            report += "\n"

        return report

    def _generate_text_report(self, results: Dict) -> str:
        """Generate a plain-text report"""
        # Same content as the markdown report with the markup stripped. Note
        # this also strips these characters from the response text itself.
        return (
            self._generate_markdown_report(results)
            .replace("#", "")
            .replace("**", "")
            .replace(">", "")
        )

    def _generate_html_report(self, results: Dict) -> str:
        """Generate an HTML report"""
        # Convert markdown to basic HTML
        md_report = self._generate_markdown_report(results)
        # Basic conversion (for production, use a proper markdown-to-html library)
        html = md_report.replace("# ", "<h1>").replace("\n\n", "</p>\n<p>")
        return f"<html><body><p>{html}</p></body></html>"

    def _get_analyst_system_prompt(self) -> str:
        """System prompt for analysis tasks"""
        return """You are an expert qualitative research analyst with deep expertise in:
- Thematic analysis and coding
- Sentiment analysis and emotional intelligence
- Pattern recognition in qualitative data
- Insight generation and strategic thinking
- Survey research methodology

Your analyses should be:
- Objective and evidence-based
- Nuanced and comprehensive
- Actionable and clear
- Grounded in the actual data provided

Always respond with valid JSON when requested."""

    def _extract_text_responses(self, responses: List[Dict]) -> List[str]:
        """Extract text from response objects"""
        texts = []
        for response in responses:
            if isinstance(response, dict):
                # Extract all string values
                for value in response.values():
                    if isinstance(value, str) and value.strip():
                        texts.append(value.strip())
            elif isinstance(response, str):
                texts.append(response.strip())
        return texts

    def _format_responses_for_prompt(self, responses: List[str], max_responses: int = 50) -> str:
        """Format responses for inclusion in a prompt"""
        formatted = []
        for i, resp in enumerate(responses[:max_responses], 1):
            # Truncate very long responses
            truncated = resp[:300] + "..." if len(resp) > 300 else resp
            formatted.append(f"{i}. {truncated}")
        return "\n".join(formatted)

    def _parse_json_response(self, response: str):
        """Parse JSON from an LLM response, tolerating code fences and extra text"""
        response = response.strip()

        # Handle fenced code blocks (fall back to end-of-string if unterminated)
        if "```json" in response:
            start = response.find("```json") + 7
            end = response.find("```", start)
            if end == -1:
                end = len(response)
            response = response[start:end].strip()
        elif "```" in response:
            start = response.find("```") + 3
            end = response.find("```", start)
            if end == -1:
                end = len(response)
            response = response[start:end].strip()

        try:
            return json.loads(response)
        except json.JSONDecodeError:
            # Fall back to the outermost JSON object or array in the text
            if "{" in response:
                start = response.find("{")
                end = response.rfind("}") + 1
                return json.loads(response[start:end])
            elif "[" in response:
                start = response.find("[")
                end = response.rfind("]") + 1
                return json.loads(response[start:end])
            raise