# ProjectEcho / data_analyzer.py
"""
Data Analysis Module - AI-assisted analysis of survey responses
"""
import json
import sys
import os
from typing import Dict, List, Optional
from collections import Counter
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(__file__))
from llm_backend import LLMBackend
class DataAnalyzer:
    """
    Analyzes survey responses to uncover key findings, trends, and patterns.
    Provides AI-assisted summaries for qualitative research data.

    All LLM-backed steps are best-effort: failures are reported as ``error``
    entries in the returned structures rather than raised, so a partial
    analysis is still produced.
    """

    def __init__(self, llm_backend: LLMBackend):
        """
        Args:
            llm_backend: Backend providing ``generate(messages, ...)``, used
                for every LLM-assisted analysis call.
        """
        self.llm = llm_backend

    def analyze_responses(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> Dict:
        """
        Comprehensive analysis of survey responses.

        Args:
            responses: List of response dictionaries
            questions: Optional list of questions for context

        Returns:
            Analysis results including themes, sentiment, and insights
        """
        if not responses:
            return {"error": "No responses to analyze"}
        analysis = {
            "summary": {},
            "themes": [],
            "sentiment": {},
            "key_insights": [],
            "response_count": len(responses)
        }
        # Generate overall summary
        analysis["summary"] = self._generate_summary(responses, questions)
        # Extract themes
        analysis["themes"] = self._extract_themes(responses)
        # Analyze sentiment
        analysis["sentiment"] = self._analyze_sentiment(responses)
        # Generate key insights
        analysis["key_insights"] = self._generate_insights(responses, questions)
        # Add quantitative stats if applicable
        analysis["statistics"] = self._compute_statistics(responses, questions)
        return analysis

    def _generate_summary(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> Dict:
        """Generate an executive summary of responses via the LLM.

        Returns the parsed JSON dict on success, or ``{"error": ...}``.
        """
        # Prepare context from a bounded sample so the prompt stays small.
        response_texts = self._extract_text_responses(responses)
        sample_size = min(50, len(response_texts))  # Use sample for large datasets
        sample_responses = response_texts[:sample_size]
        context = f"Total responses: {len(responses)}\n\n"
        if questions:
            context += "Questions asked:\n"
            for i, q in enumerate(questions[:10], 1):  # Limit to first 10 questions
                context += f"{i}. {q.get('question_text', '')}\n"
            context += "\n"
        context += "Sample responses:\n"
        for i, resp in enumerate(sample_responses, 1):
            context += f"{i}. {resp[:200]}...\n"  # Truncate long responses
        prompt = f"""Task: Analyze survey responses and generate an executive summary
{context}
**Your Analysis Should Include:**
1. **Overview:** A clear, concise high-level summary of what the data reveals (2-3 sentences)
2. **Key Patterns:** Main patterns, trends, or recurring themes observed across responses
3. **Notable Findings:** Interesting, surprising, or unexpected discoveries in the data
4. **Response Quality:** Assessment of how thoughtful, engaged, and detailed the responses are
**Output Format:** Respond ONLY with valid JSON:
{{
"overview": "Clear summary of overall findings",
"key_patterns": ["pattern 1", "pattern 2", "pattern 3"],
"notable_findings": ["surprising finding 1", "unexpected discovery"],
"response_quality": "Assessment of engagement level"
}}
**Important:** Ensure your response is valid JSON that can be parsed. Do not include any text outside the JSON object."""
        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt}
        ]
        try:
            response = self.llm.generate(messages, max_tokens=1000, temperature=0.5)
            return self._parse_json_response(response)
        except Exception as e:
            return {"error": f"Summary generation failed: {str(e)}"}

    def _extract_themes(self, responses: List[Dict], num_themes: int = 5) -> List[Dict]:
        """Extract the top ``num_themes`` themes from responses using AI.

        Returns a list of theme dicts, an empty list when there is nothing to
        analyze or the LLM output is not a list, or a one-element error list.
        """
        response_texts = self._extract_text_responses(responses)
        if not response_texts:
            return []
        # Sample for large datasets
        sample_size = min(100, len(response_texts))
        sample_responses = response_texts[:sample_size]
        prompt = f"""Task: Extract and analyze themes from survey responses
**Data:** Analyzing {len(sample_responses)} survey responses
Responses:
{self._format_responses_for_prompt(sample_responses)}
**Your Task:** Identify the top {num_themes} distinct themes that emerge from these responses.
**For Each Theme, Provide:**
1. **Theme Name:** A short, memorable, and descriptive label
2. **Description:** Clear explanation of what this theme represents and its significance
3. **Prevalence:** Estimated percentage of responses that mention or relate to this theme
4. **Example Quotes:** 2-3 actual, representative quotes from responses that illustrate this theme
**Output Format:** Respond ONLY with a valid JSON array:
[
{{
"theme_name": "Clear, concise theme label",
"description": "What this theme means and why it matters",
"prevalence": "XX%",
"example_quotes": ["exact quote from responses", "another quote"]
}}
]
**Important:** Ensure all responses are valid JSON. Do not include text outside the array."""
        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt}
        ]
        try:
            response = self.llm.generate(messages, max_tokens=1500, temperature=0.6)
            themes = self._parse_json_response(response)
            # The model may return a dict or scalar despite instructions;
            # only a list is a usable themes payload.
            if isinstance(themes, list):
                return themes
            return []
        except Exception as e:
            return [{"error": f"Theme extraction failed: {str(e)}"}]

    def _analyze_sentiment(self, responses: List[Dict]) -> Dict:
        """Analyze overall sentiment of responses via the LLM.

        Returns the parsed JSON dict, ``{}`` when there is no text, or
        ``{"error": ...}`` on failure.
        """
        response_texts = self._extract_text_responses(responses)
        if not response_texts:
            return {}
        # Sample for analysis
        sample_size = min(100, len(response_texts))
        sample_responses = response_texts[:sample_size]
        prompt = f"""Task: Analyze sentiment across survey responses
**Data:** Analyzing sentiment in {len(sample_responses)} survey responses
Responses:
{self._format_responses_for_prompt(sample_responses)}
**Your Task:** Conduct a comprehensive sentiment analysis of these responses.
**Analysis Should Include:**
1. **Overall Sentiment:** The dominant sentiment tone (positive, negative, neutral, or mixed)
2. **Sentiment Distribution:** Estimated percentage breakdown across sentiment categories
3. **Emotional Tone:** Key emotions or emotional themes detected in responses
4. **Intensity:** The strength of the sentiments (low, moderate, or high)
**Output Format:** Respond ONLY with valid JSON:
{{
"overall_sentiment": "positive|negative|neutral|mixed",
"distribution": {{
"positive": "XX%",
"neutral": "XX%",
"negative": "XX%"
}},
"emotions": ["emotion1", "emotion2", "emotion3"],
"intensity": "low|moderate|high"
}}
**Important:** Return only valid JSON. Do not include explanatory text."""
        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt}
        ]
        try:
            response = self.llm.generate(messages, max_tokens=500, temperature=0.4)
            return self._parse_json_response(response)
        except Exception as e:
            return {"error": f"Sentiment analysis failed: {str(e)}"}

    def _generate_insights(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> List[str]:
        """Generate actionable insights from the data.

        Returns a list of insight strings (or a one-element error message
        list on failure).
        """
        response_texts = self._extract_text_responses(responses)
        if not response_texts:
            return []
        sample_size = min(100, len(response_texts))
        sample_responses = response_texts[:sample_size]
        context = f"Analyzing {len(responses)} survey responses.\n\n"
        if questions:
            context += "Research questions:\n"
            for i, q in enumerate(questions[:5], 1):
                context += f"{i}. {q.get('question_text', '')}\n"
            context += "\n"
        prompt = f"""{context}
Sample responses:
{self._format_responses_for_prompt(sample_responses)}
**Task:** Extract key insights from this survey data
**Generate 5-7 actionable insights** that address:
- Understanding the target audience and their needs
- Identifying opportunities for growth or improvement
- Recognizing challenges or pain points
- Understanding patterns, trends, and correlations
- Informing strategic or product decisions
**Insight Quality Criteria:**
- **Specific:** Clear, concrete statements based on actual data patterns
- **Actionable:** Can be used to inform decisions or actions
- **Evidence-based:** Grounded in what respondents actually said
- **Concise:** Clear and to the point (1-2 sentences each)
**Output Format:** Respond ONLY with a valid JSON array of insight strings:
["Clear, actionable insight from the data", "Another specific insight", ...]
**Important:** Return only JSON array. Do not include explanatory text."""
        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt}
        ]
        try:
            response = self.llm.generate(messages, max_tokens=1000, temperature=0.6)
            insights = self._parse_json_response(response)
            if isinstance(insights, list):
                return insights
            return []
        except Exception as e:
            return [f"Insight generation failed: {str(e)}"]

    def _compute_statistics(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> Dict:
        """Compute basic statistics (counts, word lengths, completion rate)."""
        stats = {
            "total_responses": len(responses),
            "response_lengths": {},
            "completion_rate": "N/A"
        }
        # Calculate average response length (in whitespace-separated words)
        response_texts = self._extract_text_responses(responses)
        if response_texts:
            lengths = [len(r.split()) for r in response_texts]
            stats["response_lengths"] = {
                "avg_words": sum(lengths) / len(lengths),
                "min_words": min(lengths),
                "max_words": max(lengths)
            }
        # Calculate completion rate if questions are provided
        if questions:
            total_questions = len(questions)
            completed_questions = 0
            for response in responses:
                if isinstance(response, dict):
                    # NOTE(review): every truthy field counts as an answered
                    # question, so non-answer metadata fields would inflate
                    # the rate — confirm response dicts hold answers only.
                    completed_questions += len([v for v in response.values() if v])
            # Guard the division: `responses` can be empty when this helper
            # is called directly (analyze_responses filters that case, but a
            # direct call previously raised ZeroDivisionError).
            if total_questions > 0 and responses:
                completion_rate = (completed_questions / (total_questions * len(responses))) * 100
                stats["completion_rate"] = f"{completion_rate:.1f}%"
        return stats

    def generate_report(self, analysis_results: Dict, format: str = "markdown") -> str:
        """
        Generate a formatted report from analysis results.

        Args:
            analysis_results: Results from analyze_responses()
            format: Output format (markdown, text, html); any other value
                falls back to plain text

        Returns:
            Formatted report string
        """
        if format == "markdown":
            return self._generate_markdown_report(analysis_results)
        elif format == "html":
            return self._generate_html_report(analysis_results)
        else:
            return self._generate_text_report(analysis_results)

    def _generate_markdown_report(self, results: Dict) -> str:
        """Generate markdown formatted report; missing sections are skipped."""
        report = "# Survey Analysis Report\n\n"
        # Summary section
        if "summary" in results and results["summary"]:
            report += "## Executive Summary\n\n"
            summary = results["summary"]
            if "overview" in summary:
                report += f"{summary['overview']}\n\n"
            if "key_patterns" in summary:
                report += "### Key Patterns\n"
                for pattern in summary["key_patterns"]:
                    report += f"- {pattern}\n"
                report += "\n"
        # Statistics
        if "statistics" in results:
            report += "## Response Statistics\n\n"
            stats = results["statistics"]
            report += f"- Total Responses: {stats.get('total_responses', 'N/A')}\n"
            if "response_lengths" in stats:
                rl = stats["response_lengths"]
                report += f"- Average Response Length: {rl.get('avg_words', 0):.1f} words\n"
            report += f"- Completion Rate: {stats.get('completion_rate', 'N/A')}\n\n"
        # Themes
        if "themes" in results and results["themes"]:
            report += "## Main Themes\n\n"
            for i, theme in enumerate(results["themes"], 1):
                # Error placeholders lack "theme_name" and are skipped here.
                if isinstance(theme, dict) and "theme_name" in theme:
                    report += f"### {i}. {theme['theme_name']}\n"
                    report += f"{theme.get('description', '')}\n\n"
                    report += f"**Prevalence:** {theme.get('prevalence', 'N/A')}\n\n"
                    if "example_quotes" in theme:
                        report += "**Example quotes:**\n"
                        for quote in theme["example_quotes"]:
                            report += f"> {quote}\n"
                        report += "\n"
        # Sentiment
        if "sentiment" in results and results["sentiment"]:
            report += "## Sentiment Analysis\n\n"
            sent = results["sentiment"]
            report += f"**Overall Sentiment:** {sent.get('overall_sentiment', 'N/A')}\n\n"
            if "distribution" in sent:
                report += "**Distribution:**\n"
                for key, value in sent["distribution"].items():
                    report += f"- {key.title()}: {value}\n"
                report += "\n"
        # Key Insights
        if "key_insights" in results and results["key_insights"]:
            report += "## Key Insights\n\n"
            for i, insight in enumerate(results["key_insights"], 1):
                report += f"{i}. {insight}\n"
            report += "\n"
        return report

    def _generate_text_report(self, results: Dict) -> str:
        """Generate plain text report by stripping markdown markers.

        Only *leading* heading/quote markers are removed, so '#' or '>'
        characters inside response quotes survive (the previous global
        replace() deleted them from content as well).
        """
        plain_lines = []
        for line in self._generate_markdown_report(results).split("\n"):
            stripped = line
            if stripped.startswith("#"):
                stripped = stripped.lstrip("#").lstrip(" ")
            if stripped.startswith("> "):
                stripped = stripped[2:]
            plain_lines.append(stripped.replace("**", ""))
        return "\n".join(plain_lines)

    def _generate_html_report(self, results: Dict) -> str:
        """Generate HTML report from the markdown report.

        Minimal line-based conversion; for production, use a proper
        markdown-to-html library. (The previous replace()-based approach
        emitted unclosed <h1> tags for every heading level.)
        """
        md_report = self._generate_markdown_report(results)
        html_lines = []
        for line in md_report.split("\n"):
            if line.startswith("### "):
                html_lines.append(f"<h3>{line[4:]}</h3>")
            elif line.startswith("## "):
                html_lines.append(f"<h2>{line[3:]}</h2>")
            elif line.startswith("# "):
                html_lines.append(f"<h1>{line[2:]}</h1>")
            elif line.startswith("> "):
                html_lines.append(f"<blockquote>{line[2:]}</blockquote>")
            elif line.startswith("- "):
                html_lines.append(f"<li>{line[2:]}</li>")
            elif line.strip():
                html_lines.append(f"<p>{line}</p>")
        body = "\n".join(html_lines).replace("**", "")
        return f"<html><body>{body}</body></html>"

    def _get_analyst_system_prompt(self) -> str:
        """System prompt for analysis tasks"""
        return """You are an expert qualitative research analyst with deep expertise in:
- Thematic analysis and coding
- Sentiment analysis and emotional intelligence
- Pattern recognition in qualitative data
- Insight generation and strategic thinking
- Survey research methodology
Your analyses should be:
- Objective and evidence-based
- Nuanced and comprehensive
- Actionable and clear
- Grounded in the actual data provided
Always respond with valid JSON when requested."""

    def _extract_text_responses(self, responses: List[Dict]) -> List[str]:
        """Extract non-empty text values from response objects.

        Dicts contribute every non-blank string value; bare strings are
        included as-is. Everything else is ignored.
        """
        texts = []
        for response in responses:
            if isinstance(response, dict):
                # Extract all string values
                for value in response.values():
                    if isinstance(value, str) and value.strip():
                        texts.append(value.strip())
            elif isinstance(response, str):
                texts.append(response.strip())
        return texts

    def _format_responses_for_prompt(self, responses: List[str], max_responses: int = 50) -> str:
        """Format up to ``max_responses`` responses as a numbered list."""
        formatted = []
        for i, resp in enumerate(responses[:max_responses], 1):
            # Truncate very long responses
            truncated = resp[:300] + "..." if len(resp) > 300 else resp
            formatted.append(f"{i}. {truncated}")
        return "\n".join(formatted)

    def _parse_json_response(self, response: str):
        """Parse JSON from an LLM response.

        Strips a fenced ``` code block if present, then falls back to the
        outermost {...} or [...] span. Raises json.JSONDecodeError if no
        parseable JSON is found.
        """
        response = response.strip()
        # Handle code blocks. A missing closing fence makes find() return -1,
        # which previously truncated the last character of the payload; fall
        # back to the end of the string instead.
        if "```json" in response:
            start = response.find("```json") + 7
            end = response.find("```", start)
            if end == -1:
                end = len(response)
            response = response[start:end].strip()
        elif "```" in response:
            start = response.find("```") + 3
            end = response.find("```", start)
            if end == -1:
                end = len(response)
            response = response[start:end].strip()
        try:
            return json.loads(response)
        except json.JSONDecodeError:
            # Try to find JSON object or array embedded in surrounding text
            if "{" in response:
                start = response.find("{")
                end = response.rfind("}") + 1
                return json.loads(response[start:end])
            elif "[" in response:
                start = response.find("[")
                end = response.rfind("]") + 1
                return json.loads(response[start:end])
            raise