| """ | |
| Data Analysis Module - AI-assisted analysis of survey responses | |
| """ | |
| import json | |
| import sys | |
| import os | |
| from typing import Dict, List, Optional | |
| from collections import Counter | |
| # Add parent directory to path for imports | |
| sys.path.insert(0, os.path.dirname(__file__)) | |
| from llm_backend import LLMBackend | |


class DataAnalyzer:
    """
    Analyzes survey responses to uncover key findings, trends, and patterns.
    Provides AI-assisted summaries for qualitative research data.
    """

    def __init__(self, llm_backend: LLMBackend):
        self.llm = llm_backend

    def analyze_responses(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> Dict:
        """
        Comprehensive analysis of survey responses.

        Args:
            responses: List of response dictionaries
            questions: Optional list of questions for context

        Returns:
            Analysis results including themes, sentiment, and insights
        """
        if not responses:
            return {"error": "No responses to analyze"}

        analysis = {
            "summary": {},
            "themes": [],
            "sentiment": {},
            "key_insights": [],
            "response_count": len(responses)
        }

        # Generate overall summary
        analysis["summary"] = self._generate_summary(responses, questions)

        # Extract themes
        analysis["themes"] = self._extract_themes(responses)

        # Analyze sentiment
        analysis["sentiment"] = self._analyze_sentiment(responses)

        # Generate key insights
        analysis["key_insights"] = self._generate_insights(responses, questions)

        # Add quantitative stats if applicable
        analysis["statistics"] = self._compute_statistics(responses, questions)

        return analysis
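
    # Illustrative input shapes only (a sketch, not a required schema): the analyzer
    # treats every non-empty string value in a response dict as answer text, and each
    # question dict only needs a "question_text" key. The field names and variable
    # names below (e.g. "q1", "q2", analyzer) are hypothetical.
    #
    #   questions = [{"question_text": "What do you like about the product?"}]
    #   responses = [
    #       {"q1": "The dashboard is easy to use", "q2": "Exporting data is slow"},
    #       {"q1": "Support replies quickly"},
    #   ]
    #   results = analyzer.analyze_responses(responses, questions)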

    def _generate_summary(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> Dict:
        """Generate an executive summary of responses"""
        # Prepare context
        response_texts = self._extract_text_responses(responses)
        sample_size = min(50, len(response_texts))  # Use sample for large datasets
        sample_responses = response_texts[:sample_size]

        context = f"Total responses: {len(responses)}\n\n"

        if questions:
            context += "Questions asked:\n"
            for i, q in enumerate(questions[:10], 1):  # Limit to first 10 questions
                context += f"{i}. {q.get('question_text', '')}\n"
            context += "\n"

        context += "Sample responses:\n"
        for i, resp in enumerate(sample_responses, 1):
            context += f"{i}. {resp[:200]}...\n"  # Truncate long responses

        prompt = f"""Task: Analyze survey responses and generate an executive summary
{context}
**Your Analysis Should Include:**
1. **Overview:** A clear, concise high-level summary of what the data reveals (2-3 sentences)
2. **Key Patterns:** Main patterns, trends, or recurring themes observed across responses
3. **Notable Findings:** Interesting, surprising, or unexpected discoveries in the data
4. **Response Quality:** Assessment of how thoughtful, engaged, and detailed the responses are
**Output Format:** Respond ONLY with valid JSON:
{{
"overview": "Clear summary of overall findings",
"key_patterns": ["pattern 1", "pattern 2", "pattern 3"],
"notable_findings": ["surprising finding 1", "unexpected discovery"],
"response_quality": "Assessment of engagement level"
}}
**Important:** Ensure your response is valid JSON that can be parsed. Do not include any text outside the JSON object."""

        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt}
        ]

        try:
            response = self.llm.generate(messages, max_tokens=1000, temperature=0.5)
            return self._parse_json_response(response)
        except Exception as e:
            return {"error": f"Summary generation failed: {str(e)}"}

    def _extract_themes(self, responses: List[Dict], num_themes: int = 5) -> List[Dict]:
        """Extract main themes from responses using AI"""
        response_texts = self._extract_text_responses(responses)

        if not response_texts:
            return []

        # Sample for large datasets
        sample_size = min(100, len(response_texts))
        sample_responses = response_texts[:sample_size]

        prompt = f"""Task: Extract and analyze themes from survey responses
**Data:** Analyzing {len(sample_responses)} survey responses
Responses:
{self._format_responses_for_prompt(sample_responses)}
**Your Task:** Identify the top {num_themes} distinct themes that emerge from these responses.
**For Each Theme, Provide:**
1. **Theme Name:** A short, memorable, and descriptive label
2. **Description:** Clear explanation of what this theme represents and its significance
3. **Prevalence:** Estimated percentage of responses that mention or relate to this theme
4. **Example Quotes:** 2-3 actual, representative quotes from responses that illustrate this theme
**Output Format:** Respond ONLY with a valid JSON array:
[
{{
"theme_name": "Clear, concise theme label",
"description": "What this theme means and why it matters",
"prevalence": "XX%",
"example_quotes": ["exact quote from responses", "another quote"]
}}
]
**Important:** Ensure all responses are valid JSON. Do not include text outside the array."""

        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt}
        ]

        try:
            response = self.llm.generate(messages, max_tokens=1500, temperature=0.6)
            themes = self._parse_json_response(response)
            if isinstance(themes, list):
                return themes
            return []
        except Exception as e:
            return [{"error": f"Theme extraction failed: {str(e)}"}]

    def _analyze_sentiment(self, responses: List[Dict]) -> Dict:
        """Analyze overall sentiment of responses"""
        response_texts = self._extract_text_responses(responses)

        if not response_texts:
            return {}

        # Sample for analysis
        sample_size = min(100, len(response_texts))
        sample_responses = response_texts[:sample_size]

        prompt = f"""Task: Analyze sentiment across survey responses
**Data:** Analyzing sentiment in {len(sample_responses)} survey responses
Responses:
{self._format_responses_for_prompt(sample_responses)}
**Your Task:** Conduct a comprehensive sentiment analysis of these responses.
**Analysis Should Include:**
1. **Overall Sentiment:** The dominant sentiment tone (positive, negative, neutral, or mixed)
2. **Sentiment Distribution:** Estimated percentage breakdown across sentiment categories
3. **Emotional Tone:** Key emotions or emotional themes detected in responses
4. **Intensity:** The strength of the sentiments (low, moderate, or high)
**Output Format:** Respond ONLY with valid JSON:
{{
"overall_sentiment": "positive|negative|neutral|mixed",
"distribution": {{
"positive": "XX%",
"neutral": "XX%",
"negative": "XX%"
}},
"emotions": ["emotion1", "emotion2", "emotion3"],
"intensity": "low|moderate|high"
}}
**Important:** Return only valid JSON. Do not include explanatory text."""

        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt}
        ]

        try:
            response = self.llm.generate(messages, max_tokens=500, temperature=0.4)
            return self._parse_json_response(response)
        except Exception as e:
            return {"error": f"Sentiment analysis failed: {str(e)}"}

    def _generate_insights(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> List[str]:
        """Generate actionable insights from the data"""
        response_texts = self._extract_text_responses(responses)

        if not response_texts:
            return []

        sample_size = min(100, len(response_texts))
        sample_responses = response_texts[:sample_size]

        context = f"Analyzing {len(responses)} survey responses.\n\n"

        if questions:
            context += "Research questions:\n"
            for i, q in enumerate(questions[:5], 1):
                context += f"{i}. {q.get('question_text', '')}\n"
            context += "\n"

        prompt = f"""{context}
Sample responses:
{self._format_responses_for_prompt(sample_responses)}
**Task:** Extract key insights from this survey data
**Generate 5-7 actionable insights** that address:
- Understanding the target audience and their needs
- Identifying opportunities for growth or improvement
- Recognizing challenges or pain points
- Understanding patterns, trends, and correlations
- Informing strategic or product decisions
**Insight Quality Criteria:**
- **Specific:** Clear, concrete statements based on actual data patterns
- **Actionable:** Can be used to inform decisions or actions
- **Evidence-based:** Grounded in what respondents actually said
- **Concise:** Clear and to the point (1-2 sentences each)
**Output Format:** Respond ONLY with a valid JSON array of insight strings:
["Clear, actionable insight from the data", "Another specific insight", ...]
**Important:** Return only a JSON array. Do not include explanatory text."""

        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt}
        ]

        try:
            response = self.llm.generate(messages, max_tokens=1000, temperature=0.6)
            insights = self._parse_json_response(response)
            if isinstance(insights, list):
                return insights
            return []
        except Exception as e:
            return [f"Insight generation failed: {str(e)}"]

    def _compute_statistics(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> Dict:
        """Compute basic statistics from responses"""
        stats = {
            "total_responses": len(responses),
            "response_lengths": {},
            "completion_rate": "N/A"
        }

        # Calculate average response length
        response_texts = self._extract_text_responses(responses)
        if response_texts:
            lengths = [len(r.split()) for r in response_texts]
            stats["response_lengths"] = {
                "avg_words": sum(lengths) / len(lengths),
                "min_words": min(lengths),
                "max_words": max(lengths)
            }

        # Calculate completion rate if questions are provided:
        # answered fields / (questions x responses), expressed as a percentage
        if questions:
            total_questions = len(questions)
            completed_questions = 0
            for response in responses:
                if isinstance(response, dict):
                    # Count non-empty answers, capped at the number of questions so
                    # extra metadata fields cannot push the rate above 100%
                    answered = len([v for v in response.values() if v])
                    completed_questions += min(answered, total_questions)
            if total_questions > 0:
                completion_rate = (completed_questions / (total_questions * len(responses))) * 100
                stats["completion_rate"] = f"{completion_rate:.1f}%"

        return stats

    def generate_report(self, analysis_results: Dict, format: str = "markdown") -> str:
        """
        Generate a formatted report from analysis results.

        Args:
            analysis_results: Results from analyze_responses()
            format: Output format (markdown, text, html)

        Returns:
            Formatted report string
        """
        if format == "markdown":
            return self._generate_markdown_report(analysis_results)
        elif format == "html":
            return self._generate_html_report(analysis_results)
        else:
            return self._generate_text_report(analysis_results)
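
    # Usage sketch (the analyzer/results names are hypothetical): any format value
    # other than "markdown" or "html" falls through to the plain-text report.
    #
    #   md = analyzer.generate_report(results)                    # markdown (default)
    #   html = analyzer.generate_report(results, format="html")
    #   txt = analyzer.generate_report(results, format="text")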

    def _generate_markdown_report(self, results: Dict) -> str:
        """Generate markdown formatted report"""
        report = "# Survey Analysis Report\n\n"

        # Summary section
        if "summary" in results and results["summary"]:
            report += "## Executive Summary\n\n"
            summary = results["summary"]
            if "overview" in summary:
                report += f"{summary['overview']}\n\n"
            if "key_patterns" in summary:
                report += "### Key Patterns\n"
                for pattern in summary["key_patterns"]:
                    report += f"- {pattern}\n"
                report += "\n"

        # Statistics
        if "statistics" in results:
            report += "## Response Statistics\n\n"
            stats = results["statistics"]
            report += f"- Total Responses: {stats.get('total_responses', 'N/A')}\n"
            if "response_lengths" in stats:
                rl = stats["response_lengths"]
                report += f"- Average Response Length: {rl.get('avg_words', 0):.1f} words\n"
            report += f"- Completion Rate: {stats.get('completion_rate', 'N/A')}\n\n"

        # Themes
        if "themes" in results and results["themes"]:
            report += "## Main Themes\n\n"
            for i, theme in enumerate(results["themes"], 1):
                if isinstance(theme, dict) and "theme_name" in theme:
                    report += f"### {i}. {theme['theme_name']}\n"
                    report += f"{theme.get('description', '')}\n\n"
                    report += f"**Prevalence:** {theme.get('prevalence', 'N/A')}\n\n"
                    if "example_quotes" in theme:
                        report += "**Example quotes:**\n"
                        for quote in theme["example_quotes"]:
                            report += f"> {quote}\n"
                        report += "\n"

        # Sentiment
        if "sentiment" in results and results["sentiment"]:
            report += "## Sentiment Analysis\n\n"
            sent = results["sentiment"]
            report += f"**Overall Sentiment:** {sent.get('overall_sentiment', 'N/A')}\n\n"
            if "distribution" in sent:
                report += "**Distribution:**\n"
                for key, value in sent["distribution"].items():
                    report += f"- {key.title()}: {value}\n"
                report += "\n"

        # Key Insights
        if "key_insights" in results and results["key_insights"]:
            report += "## Key Insights\n\n"
            for i, insight in enumerate(results["key_insights"], 1):
                report += f"{i}. {insight}\n"
            report += "\n"

        return report

    def _generate_text_report(self, results: Dict) -> str:
        """Generate plain text report"""
        # Similar to markdown but without formatting
        return self._generate_markdown_report(results).replace("#", "").replace("**", "").replace(">", "")

    def _generate_html_report(self, results: Dict) -> str:
        """Generate HTML report"""
        # Basic line-by-line conversion (for production, use a proper markdown-to-html library)
        md_report = self._generate_markdown_report(results)
        html_lines = []
        for line in md_report.split("\n"):
            if line.startswith("#"):
                level = len(line) - len(line.lstrip("#"))
                html_lines.append(f"<h{level}>{line.lstrip('# ')}</h{level}>")
            elif line.strip():
                html_lines.append(f"<p>{line}</p>")
        return f"<html><body>{''.join(html_lines)}</body></html>"

    def _get_analyst_system_prompt(self) -> str:
        """System prompt for analysis tasks"""
        return """You are an expert qualitative research analyst with deep expertise in:
- Thematic analysis and coding
- Sentiment analysis and emotional intelligence
- Pattern recognition in qualitative data
- Insight generation and strategic thinking
- Survey research methodology
Your analyses should be:
- Objective and evidence-based
- Nuanced and comprehensive
- Actionable and clear
- Grounded in the actual data provided
Always respond with valid JSON when requested."""

    def _extract_text_responses(self, responses: List[Dict]) -> List[str]:
        """Extract text from response objects"""
        texts = []
        for response in responses:
            if isinstance(response, dict):
                # Extract all string values
                for value in response.values():
                    if isinstance(value, str) and value.strip():
                        texts.append(value.strip())
            elif isinstance(response, str):
                texts.append(response.strip())
        return texts

    def _format_responses_for_prompt(self, responses: List[str], max_responses: int = 50) -> str:
        """Format responses for inclusion in prompt"""
        formatted = []
        for i, resp in enumerate(responses[:max_responses], 1):
            # Truncate very long responses
            truncated = resp[:300] + "..." if len(resp) > 300 else resp
            formatted.append(f"{i}. {truncated}")
        return "\n".join(formatted)

    def _parse_json_response(self, response: str):
        """Parse JSON from LLM response"""
        response = response.strip()

        # Handle code blocks
        if "```json" in response:
            start = response.find("```json") + 7
            end = response.find("```", start)
            response = response[start:end].strip()
        elif "```" in response:
            start = response.find("```") + 3
            end = response.find("```", start)
            response = response[start:end].strip()

        try:
            return json.loads(response)
        except json.JSONDecodeError:
            # Try to find JSON object or array
            if "{" in response:
                start = response.find("{")
                end = response.rfind("}") + 1
                return json.loads(response[start:end])
            elif "[" in response:
                start = response.find("[")
                end = response.rfind("]") + 1
                return json.loads(response[start:end])
            raise
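

# Minimal smoke-test sketch. The real LLMBackend constructor is not shown in this
# module, so this demo wires the analyzer to a hypothetical stub that returns canned
# JSON; swap in a configured LLMBackend instance for real analyses.
if __name__ == "__main__":
    class _StubBackend:
        """Hypothetical stand-in for LLMBackend; only implements generate()."""

        def generate(self, messages, max_tokens=1000, temperature=0.5):
            # Return a fixed JSON payload so every analysis step has something to parse
            return json.dumps({
                "overview": "stub",
                "key_patterns": [],
                "notable_findings": [],
                "response_quality": "n/a"
            })

    analyzer = DataAnalyzer(_StubBackend())
    demo_responses = [
        {"answer": "The onboarding flow was smooth and quick."},
        {"answer": "Pricing feels high for small teams."},
    ]
    demo_questions = [{"question_text": "What is your overall impression?"}]

    results = analyzer.analyze_responses(demo_responses, demo_questions)
    print(analyzer.generate_report(results))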