Spaces:
Sleeping
Sleeping
| """ | |
| Data Analysis Module - AI-assisted analysis of survey responses | |
| """ | |
| import json | |
| from typing import Dict, List, Optional | |
| from collections import Counter | |
| from llm_backend import LLMBackend | |
class DataAnalyzer:
    """
    AI-assisted analyzer for qualitative survey data.

    Produces executive summaries, themes, sentiment breakdowns, and
    actionable insights from collections of survey responses.
    """

    def __init__(self, llm_backend: LLMBackend):
        # Backend used for every AI-assisted analysis call.
        self.llm = llm_backend
| def analyze_responses(self, responses: List[Dict], questions: List[Dict] = None) -> Dict: | |
| """ | |
| Comprehensive analysis of survey responses. | |
| Args: | |
| responses: List of response dictionaries | |
| questions: Optional list of questions for context | |
| Returns: | |
| Analysis results including themes, sentiment, and insights | |
| """ | |
| if not responses: | |
| return {"error": "No responses to analyze"} | |
| analysis = { | |
| "summary": {}, | |
| "themes": [], | |
| "sentiment": {}, | |
| "key_insights": [], | |
| "response_count": len(responses) | |
| } | |
| # Generate overall summary | |
| analysis["summary"] = self._generate_summary(responses, questions) | |
| # Extract themes | |
| analysis["themes"] = self._extract_themes(responses) | |
| # Analyze sentiment | |
| analysis["sentiment"] = self._analyze_sentiment(responses) | |
| # Generate key insights | |
| analysis["key_insights"] = self._generate_insights(responses, questions) | |
| # Add quantitative stats if applicable | |
| analysis["statistics"] = self._compute_statistics(responses, questions) | |
| return analysis | |
| def _generate_summary(self, responses: List[Dict], questions: List[Dict] = None) -> Dict: | |
| """Generate an executive summary of responses""" | |
| # Prepare context | |
| response_texts = self._extract_text_responses(responses) | |
| sample_size = min(50, len(response_texts)) # Use sample for large datasets | |
| sample_responses = response_texts[:sample_size] | |
| context = f"Total responses: {len(responses)}\n\n" | |
| if questions: | |
| context += "Questions asked:\n" | |
| for i, q in enumerate(questions[:10], 1): # Limit to first 10 questions | |
| context += f"{i}. {q.get('question_text', '')}\n" | |
| context += "\n" | |
| context += "Sample responses:\n" | |
| for i, resp in enumerate(sample_responses, 1): | |
| context += f"{i}. {resp[:200]}...\n" # Truncate long responses | |
| prompt = f"""Analyze the following survey responses and provide an executive summary. | |
| {context} | |
| Provide a summary that includes: | |
| 1. Overview: High-level summary of what the data shows (2-3 sentences) | |
| 2. Key patterns: Main patterns or trends observed | |
| 3. Notable findings: Interesting or unexpected discoveries | |
| 4. Response quality: Assessment of response depth and engagement | |
| Respond with a JSON object with these fields: | |
| {{ | |
| "overview": "...", | |
| "key_patterns": ["pattern 1", "pattern 2", ...], | |
| "notable_findings": ["finding 1", "finding 2", ...], | |
| "response_quality": "..." | |
| }}""" | |
| messages = [ | |
| {"role": "system", "content": self._get_analyst_system_prompt()}, | |
| {"role": "user", "content": prompt} | |
| ] | |
| try: | |
| response = self.llm.generate(messages, max_tokens=1000, temperature=0.5) | |
| return self._parse_json_response(response) | |
| except Exception as e: | |
| return {"error": f"Summary generation failed: {str(e)}"} | |
| def _extract_themes(self, responses: List[Dict], num_themes: int = 5) -> List[Dict]: | |
| """Extract main themes from responses using AI""" | |
| response_texts = self._extract_text_responses(responses) | |
| if not response_texts: | |
| return [] | |
| # Sample for large datasets | |
| sample_size = min(100, len(response_texts)) | |
| sample_responses = response_texts[:sample_size] | |
| prompt = f"""Analyze the following {len(sample_responses)} survey responses and identify the top {num_themes} themes. | |
| Responses: | |
| {self._format_responses_for_prompt(sample_responses)} | |
| For each theme, provide: | |
| 1. Theme name: A short, descriptive name | |
| 2. Description: What this theme represents | |
| 3. Prevalence: Estimated percentage of responses mentioning this theme | |
| 4. Example quotes: 2-3 representative quotes from the responses | |
| Respond with a JSON array of theme objects: | |
| [ | |
| {{ | |
| "theme_name": "...", | |
| "description": "...", | |
| "prevalence": "XX%", | |
| "example_quotes": ["quote 1", "quote 2"] | |
| }} | |
| ]""" | |
| messages = [ | |
| {"role": "system", "content": self._get_analyst_system_prompt()}, | |
| {"role": "user", "content": prompt} | |
| ] | |
| try: | |
| response = self.llm.generate(messages, max_tokens=1500, temperature=0.6) | |
| themes = self._parse_json_response(response) | |
| if isinstance(themes, list): | |
| return themes | |
| return [] | |
| except Exception as e: | |
| return [{"error": f"Theme extraction failed: {str(e)}"}] | |
| def _analyze_sentiment(self, responses: List[Dict]) -> Dict: | |
| """Analyze overall sentiment of responses""" | |
| response_texts = self._extract_text_responses(responses) | |
| if not response_texts: | |
| return {} | |
| # Sample for analysis | |
| sample_size = min(100, len(response_texts)) | |
| sample_responses = response_texts[:sample_size] | |
| prompt = f"""Analyze the sentiment of these {len(sample_responses)} survey responses. | |
| Responses: | |
| {self._format_responses_for_prompt(sample_responses)} | |
| Provide sentiment analysis including: | |
| 1. Overall sentiment: positive, negative, neutral, or mixed | |
| 2. Sentiment distribution: Estimated percentage breakdown | |
| 3. Emotional tone: Key emotions detected | |
| 4. Intensity: How strong the sentiments are | |
| Respond with JSON: | |
| {{ | |
| "overall_sentiment": "...", | |
| "distribution": {{ | |
| "positive": "XX%", | |
| "neutral": "XX%", | |
| "negative": "XX%" | |
| }}, | |
| "emotions": ["emotion1", "emotion2", ...], | |
| "intensity": "low|moderate|high" | |
| }}""" | |
| messages = [ | |
| {"role": "system", "content": self._get_analyst_system_prompt()}, | |
| {"role": "user", "content": prompt} | |
| ] | |
| try: | |
| response = self.llm.generate(messages, max_tokens=500, temperature=0.4) | |
| return self._parse_json_response(response) | |
| except Exception as e: | |
| return {"error": f"Sentiment analysis failed: {str(e)}"} | |
| def _generate_insights(self, responses: List[Dict], questions: List[Dict] = None) -> List[str]: | |
| """Generate actionable insights from the data""" | |
| response_texts = self._extract_text_responses(responses) | |
| if not response_texts: | |
| return [] | |
| sample_size = min(100, len(response_texts)) | |
| sample_responses = response_texts[:sample_size] | |
| context = f"Analyzing {len(responses)} survey responses.\n\n" | |
| if questions: | |
| context += "Research questions:\n" | |
| for i, q in enumerate(questions[:5], 1): | |
| context += f"{i}. {q.get('question_text', '')}\n" | |
| context += "\n" | |
| prompt = f"""{context} | |
| Sample responses: | |
| {self._format_responses_for_prompt(sample_responses)} | |
| Based on this data, provide 5-7 key insights that would be valuable for: | |
| - Understanding the target audience | |
| - Identifying opportunities or challenges | |
| - Informing strategic decisions | |
| - Recognizing patterns or trends | |
| Each insight should be: | |
| - Specific and actionable | |
| - Supported by the data | |
| - Clear and concise | |
| Respond with a JSON array of insight strings: | |
| ["insight 1", "insight 2", ...]""" | |
| messages = [ | |
| {"role": "system", "content": self._get_analyst_system_prompt()}, | |
| {"role": "user", "content": prompt} | |
| ] | |
| try: | |
| response = self.llm.generate(messages, max_tokens=1000, temperature=0.6) | |
| insights = self._parse_json_response(response) | |
| if isinstance(insights, list): | |
| return insights | |
| return [] | |
| except Exception as e: | |
| return [f"Insight generation failed: {str(e)}"] | |
| def _compute_statistics(self, responses: List[Dict], questions: List[Dict] = None) -> Dict: | |
| """Compute basic statistics from responses""" | |
| stats = { | |
| "total_responses": len(responses), | |
| "response_lengths": {}, | |
| "completion_rate": "N/A" | |
| } | |
| # Calculate average response length | |
| response_texts = self._extract_text_responses(responses) | |
| if response_texts: | |
| lengths = [len(r.split()) for r in response_texts] | |
| stats["response_lengths"] = { | |
| "avg_words": sum(lengths) / len(lengths), | |
| "min_words": min(lengths), | |
| "max_words": max(lengths) | |
| } | |
| # Calculate completion rate if questions are provided | |
| if questions: | |
| total_questions = len(questions) | |
| completed_questions = 0 | |
| for response in responses: | |
| if isinstance(response, dict): | |
| completed_questions += len([v for v in response.values() if v]) | |
| if total_questions > 0: | |
| completion_rate = (completed_questions / (total_questions * len(responses))) * 100 | |
| stats["completion_rate"] = f"{completion_rate:.1f}%" | |
| return stats | |
| def generate_report(self, analysis_results: Dict, format: str = "markdown") -> str: | |
| """ | |
| Generate a formatted report from analysis results. | |
| Args: | |
| analysis_results: Results from analyze_responses() | |
| format: Output format (markdown, text, html) | |
| Returns: | |
| Formatted report string | |
| """ | |
| if format == "markdown": | |
| return self._generate_markdown_report(analysis_results) | |
| elif format == "html": | |
| return self._generate_html_report(analysis_results) | |
| else: | |
| return self._generate_text_report(analysis_results) | |
| def _generate_markdown_report(self, results: Dict) -> str: | |
| """Generate markdown formatted report""" | |
| report = "# Survey Analysis Report\n\n" | |
| # Summary section | |
| if "summary" in results and results["summary"]: | |
| report += "## Executive Summary\n\n" | |
| summary = results["summary"] | |
| if "overview" in summary: | |
| report += f"{summary['overview']}\n\n" | |
| if "key_patterns" in summary: | |
| report += "### Key Patterns\n" | |
| for pattern in summary["key_patterns"]: | |
| report += f"- {pattern}\n" | |
| report += "\n" | |
| # Statistics | |
| if "statistics" in results: | |
| report += "## Response Statistics\n\n" | |
| stats = results["statistics"] | |
| report += f"- Total Responses: {stats.get('total_responses', 'N/A')}\n" | |
| if "response_lengths" in stats: | |
| rl = stats["response_lengths"] | |
| report += f"- Average Response Length: {rl.get('avg_words', 0):.1f} words\n" | |
| report += f"- Completion Rate: {stats.get('completion_rate', 'N/A')}\n\n" | |
| # Themes | |
| if "themes" in results and results["themes"]: | |
| report += "## Main Themes\n\n" | |
| for i, theme in enumerate(results["themes"], 1): | |
| if isinstance(theme, dict) and "theme_name" in theme: | |
| report += f"### {i}. {theme['theme_name']}\n" | |
| report += f"{theme.get('description', '')}\n\n" | |
| report += f"**Prevalence:** {theme.get('prevalence', 'N/A')}\n\n" | |
| if "example_quotes" in theme: | |
| report += "**Example quotes:**\n" | |
| for quote in theme["example_quotes"]: | |
| report += f"> {quote}\n" | |
| report += "\n" | |
| # Sentiment | |
| if "sentiment" in results and results["sentiment"]: | |
| report += "## Sentiment Analysis\n\n" | |
| sent = results["sentiment"] | |
| report += f"**Overall Sentiment:** {sent.get('overall_sentiment', 'N/A')}\n\n" | |
| if "distribution" in sent: | |
| report += "**Distribution:**\n" | |
| for key, value in sent["distribution"].items(): | |
| report += f"- {key.title()}: {value}\n" | |
| report += "\n" | |
| # Key Insights | |
| if "key_insights" in results and results["key_insights"]: | |
| report += "## Key Insights\n\n" | |
| for i, insight in enumerate(results["key_insights"], 1): | |
| report += f"{i}. {insight}\n" | |
| report += "\n" | |
| return report | |
| def _generate_text_report(self, results: Dict) -> str: | |
| """Generate plain text report""" | |
| # Similar to markdown but without formatting | |
| return self._generate_markdown_report(results).replace("#", "").replace("**", "").replace(">", "") | |
| def _generate_html_report(self, results: Dict) -> str: | |
| """Generate HTML report""" | |
| # Convert markdown to basic HTML | |
| md_report = self._generate_markdown_report(results) | |
| # Basic conversion (for production, use a proper markdown-to-html library) | |
| html = md_report.replace("# ", "<h1>").replace("\n\n", "</p>\n<p>") | |
| return f"<html><body>{html}</body></html>" | |
| def _get_analyst_system_prompt(self) -> str: | |
| """System prompt for analysis tasks""" | |
| return """You are an expert qualitative research analyst with deep expertise in: | |
| - Thematic analysis and coding | |
| - Sentiment analysis and emotional intelligence | |
| - Pattern recognition in qualitative data | |
| - Insight generation and strategic thinking | |
| - Survey research methodology | |
| Your analyses should be: | |
| - Objective and evidence-based | |
| - Nuanced and comprehensive | |
| - Actionable and clear | |
| - Grounded in the actual data provided | |
| Always respond with valid JSON when requested.""" | |
| def _extract_text_responses(self, responses: List[Dict]) -> List[str]: | |
| """Extract text from response objects""" | |
| texts = [] | |
| for response in responses: | |
| if isinstance(response, dict): | |
| # Extract all string values | |
| for value in response.values(): | |
| if isinstance(value, str) and value.strip(): | |
| texts.append(value.strip()) | |
| elif isinstance(response, str): | |
| texts.append(response.strip()) | |
| return texts | |
| def _format_responses_for_prompt(self, responses: List[str], max_responses: int = 50) -> str: | |
| """Format responses for inclusion in prompt""" | |
| formatted = [] | |
| for i, resp in enumerate(responses[:max_responses], 1): | |
| # Truncate very long responses | |
| truncated = resp[:300] + "..." if len(resp) > 300 else resp | |
| formatted.append(f"{i}. {truncated}") | |
| return "\n".join(formatted) | |
| def _parse_json_response(self, response: str): | |
| """Parse JSON from LLM response""" | |
| response = response.strip() | |
| # Handle code blocks | |
| if "```json" in response: | |
| start = response.find("```json") + 7 | |
| end = response.find("```", start) | |
| response = response[start:end].strip() | |
| elif "```" in response: | |
| start = response.find("```") + 3 | |
| end = response.find("```", start) | |
| response = response[start:end].strip() | |
| try: | |
| return json.loads(response) | |
| except json.JSONDecodeError: | |
| # Try to find JSON object or array | |
| if "{" in response: | |
| start = response.find("{") | |
| end = response.rfind("}") + 1 | |
| return json.loads(response[start:end]) | |
| elif "[" in response: | |
| start = response.find("[") | |
| end = response.rfind("]") + 1 | |
| return json.loads(response[start:end]) | |
| raise | |