# ProjectEcho / data_analyzer.py
"""
Data Analysis Module - AI-assisted analysis of survey responses
"""
import html
import json
from typing import Dict, List, Optional
from llm_backend import LLMBackend
class DataAnalyzer:
"""
Analyzes survey responses to uncover key findings, trends, and patterns.
Provides AI-assisted summaries for qualitative research data.
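
    Example (a minimal sketch; assumes an LLMBackend that exposes the
    generate(messages, max_tokens=..., temperature=...) method used below):

        analyzer = DataAnalyzer(LLMBackend())
        results = analyzer.analyze_responses(responses, questions)
        report = analyzer.generate_report(results, format="markdown")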
"""
def __init__(self, llm_backend: LLMBackend):
self.llm = llm_backend
    def analyze_responses(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> Dict:
"""
Comprehensive analysis of survey responses.
Args:
responses: List of response dictionaries
questions: Optional list of questions for context
Returns:
Analysis results including themes, sentiment, and insights
"""
if not responses:
return {"error": "No responses to analyze"}
analysis = {
"summary": {},
"themes": [],
"sentiment": {},
"key_insights": [],
"response_count": len(responses)
}
# Generate overall summary
analysis["summary"] = self._generate_summary(responses, questions)
# Extract themes
analysis["themes"] = self._extract_themes(responses)
# Analyze sentiment
analysis["sentiment"] = self._analyze_sentiment(responses)
# Generate key insights
analysis["key_insights"] = self._generate_insights(responses, questions)
# Add quantitative stats if applicable
analysis["statistics"] = self._compute_statistics(responses, questions)
return analysis
    def _generate_summary(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> Dict:
"""Generate an executive summary of responses"""
# Prepare context
response_texts = self._extract_text_responses(responses)
        sample_size = min(50, len(response_texts))  # Cap at the first 50 responses for large datasets
sample_responses = response_texts[:sample_size]
context = f"Total responses: {len(responses)}\n\n"
if questions:
context += "Questions asked:\n"
for i, q in enumerate(questions[:10], 1): # Limit to first 10 questions
context += f"{i}. {q.get('question_text', '')}\n"
context += "\n"
context += "Sample responses:\n"
        for i, resp in enumerate(sample_responses, 1):
            snippet = resp[:200] + "..." if len(resp) > 200 else resp  # Truncate long responses
            context += f"{i}. {snippet}\n"
prompt = f"""Analyze the following survey responses and provide an executive summary.
{context}
Provide a summary that includes:
1. Overview: High-level summary of what the data shows (2-3 sentences)
2. Key patterns: Main patterns or trends observed
3. Notable findings: Interesting or unexpected discoveries
4. Response quality: Assessment of response depth and engagement
Respond with a JSON object with these fields:
{{
"overview": "...",
"key_patterns": ["pattern 1", "pattern 2", ...],
"notable_findings": ["finding 1", "finding 2", ...],
"response_quality": "..."
}}"""
messages = [
{"role": "system", "content": self._get_analyst_system_prompt()},
{"role": "user", "content": prompt}
]
try:
response = self.llm.generate(messages, max_tokens=1000, temperature=0.5)
return self._parse_json_response(response)
except Exception as e:
return {"error": f"Summary generation failed: {str(e)}"}
def _extract_themes(self, responses: List[Dict], num_themes: int = 5) -> List[Dict]:
"""Extract main themes from responses using AI"""
response_texts = self._extract_text_responses(responses)
if not response_texts:
return []
        # Cap at the first 100 responses for large datasets (head slice, not a random sample)
sample_size = min(100, len(response_texts))
sample_responses = response_texts[:sample_size]
prompt = f"""Analyze the following {len(sample_responses)} survey responses and identify the top {num_themes} themes.
Responses:
{self._format_responses_for_prompt(sample_responses)}
For each theme, provide:
1. Theme name: A short, descriptive name
2. Description: What this theme represents
3. Prevalence: Estimated percentage of responses mentioning this theme
4. Example quotes: 2-3 representative quotes from the responses
Respond with a JSON array of theme objects:
[
{{
"theme_name": "...",
"description": "...",
"prevalence": "XX%",
"example_quotes": ["quote 1", "quote 2"]
}}
]"""
messages = [
{"role": "system", "content": self._get_analyst_system_prompt()},
{"role": "user", "content": prompt}
]
try:
response = self.llm.generate(messages, max_tokens=1500, temperature=0.6)
themes = self._parse_json_response(response)
if isinstance(themes, list):
return themes
return []
except Exception as e:
return [{"error": f"Theme extraction failed: {str(e)}"}]
def _analyze_sentiment(self, responses: List[Dict]) -> Dict:
"""Analyze overall sentiment of responses"""
response_texts = self._extract_text_responses(responses)
if not response_texts:
return {}
        # Cap at the first 100 responses for analysis (head slice, not a random sample)
sample_size = min(100, len(response_texts))
sample_responses = response_texts[:sample_size]
prompt = f"""Analyze the sentiment of these {len(sample_responses)} survey responses.
Responses:
{self._format_responses_for_prompt(sample_responses)}
Provide sentiment analysis including:
1. Overall sentiment: positive, negative, neutral, or mixed
2. Sentiment distribution: Estimated percentage breakdown
3. Emotional tone: Key emotions detected
4. Intensity: How strong the sentiments are
Respond with JSON:
{{
"overall_sentiment": "...",
"distribution": {{
"positive": "XX%",
"neutral": "XX%",
"negative": "XX%"
}},
"emotions": ["emotion1", "emotion2", ...],
"intensity": "low|moderate|high"
}}"""
messages = [
{"role": "system", "content": self._get_analyst_system_prompt()},
{"role": "user", "content": prompt}
]
try:
response = self.llm.generate(messages, max_tokens=500, temperature=0.4)
return self._parse_json_response(response)
except Exception as e:
return {"error": f"Sentiment analysis failed: {str(e)}"}
    def _generate_insights(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> List[str]:
"""Generate actionable insights from the data"""
response_texts = self._extract_text_responses(responses)
if not response_texts:
return []
sample_size = min(100, len(response_texts))
sample_responses = response_texts[:sample_size]
context = f"Analyzing {len(responses)} survey responses.\n\n"
if questions:
context += "Research questions:\n"
for i, q in enumerate(questions[:5], 1):
context += f"{i}. {q.get('question_text', '')}\n"
context += "\n"
prompt = f"""{context}
Sample responses:
{self._format_responses_for_prompt(sample_responses)}
Based on this data, provide 5-7 key insights that would be valuable for:
- Understanding the target audience
- Identifying opportunities or challenges
- Informing strategic decisions
- Recognizing patterns or trends
Each insight should be:
- Specific and actionable
- Supported by the data
- Clear and concise
Respond with a JSON array of insight strings:
["insight 1", "insight 2", ...]"""
messages = [
{"role": "system", "content": self._get_analyst_system_prompt()},
{"role": "user", "content": prompt}
]
try:
response = self.llm.generate(messages, max_tokens=1000, temperature=0.6)
insights = self._parse_json_response(response)
if isinstance(insights, list):
return insights
return []
except Exception as e:
return [f"Insight generation failed: {str(e)}"]
    def _compute_statistics(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> Dict:
"""Compute basic statistics from responses"""
stats = {
"total_responses": len(responses),
"response_lengths": {},
"completion_rate": "N/A"
}
# Calculate average response length
response_texts = self._extract_text_responses(responses)
if response_texts:
lengths = [len(r.split()) for r in response_texts]
stats["response_lengths"] = {
"avg_words": sum(lengths) / len(lengths),
"min_words": min(lengths),
"max_words": max(lengths)
}
# Calculate completion rate if questions are provided
if questions:
total_questions = len(questions)
completed_questions = 0
for response in responses:
if isinstance(response, dict):
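                    # Count answered (truthy) fields; assumes each response key maps to one question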
completed_questions += len([v for v in response.values() if v])
if total_questions > 0:
completion_rate = (completed_questions / (total_questions * len(responses))) * 100
stats["completion_rate"] = f"{completion_rate:.1f}%"
return stats
def generate_report(self, analysis_results: Dict, format: str = "markdown") -> str:
"""
Generate a formatted report from analysis results.
Args:
analysis_results: Results from analyze_responses()
format: Output format (markdown, text, html)
Returns:
Formatted report string
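
        Example:
            report = analyzer.generate_report(results, format="html")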
"""
if format == "markdown":
return self._generate_markdown_report(analysis_results)
elif format == "html":
return self._generate_html_report(analysis_results)
else:
return self._generate_text_report(analysis_results)
def _generate_markdown_report(self, results: Dict) -> str:
"""Generate markdown formatted report"""
report = "# Survey Analysis Report\n\n"
# Summary section
if "summary" in results and results["summary"]:
report += "## Executive Summary\n\n"
summary = results["summary"]
if "overview" in summary:
report += f"{summary['overview']}\n\n"
if "key_patterns" in summary:
report += "### Key Patterns\n"
for pattern in summary["key_patterns"]:
report += f"- {pattern}\n"
report += "\n"
# Statistics
if "statistics" in results:
report += "## Response Statistics\n\n"
stats = results["statistics"]
report += f"- Total Responses: {stats.get('total_responses', 'N/A')}\n"
if "response_lengths" in stats:
rl = stats["response_lengths"]
report += f"- Average Response Length: {rl.get('avg_words', 0):.1f} words\n"
report += f"- Completion Rate: {stats.get('completion_rate', 'N/A')}\n\n"
# Themes
if "themes" in results and results["themes"]:
report += "## Main Themes\n\n"
for i, theme in enumerate(results["themes"], 1):
if isinstance(theme, dict) and "theme_name" in theme:
report += f"### {i}. {theme['theme_name']}\n"
report += f"{theme.get('description', '')}\n\n"
report += f"**Prevalence:** {theme.get('prevalence', 'N/A')}\n\n"
if "example_quotes" in theme:
report += "**Example quotes:**\n"
for quote in theme["example_quotes"]:
report += f"> {quote}\n"
report += "\n"
# Sentiment
if "sentiment" in results and results["sentiment"]:
report += "## Sentiment Analysis\n\n"
sent = results["sentiment"]
report += f"**Overall Sentiment:** {sent.get('overall_sentiment', 'N/A')}\n\n"
if "distribution" in sent:
report += "**Distribution:**\n"
for key, value in sent["distribution"].items():
report += f"- {key.title()}: {value}\n"
report += "\n"
# Key Insights
if "key_insights" in results and results["key_insights"]:
report += "## Key Insights\n\n"
for i, insight in enumerate(results["key_insights"], 1):
report += f"{i}. {insight}\n"
report += "\n"
return report
    def _generate_text_report(self, results: Dict) -> str:
        """Generate plain text report"""
        # Strip markdown markers from the markdown report; note this also
        # removes any literal '#', '**', or '>' characters in the content
        md_report = self._generate_markdown_report(results)
        return md_report.replace("#", "").replace("**", "").replace(">", "")
    def _generate_html_report(self, results: Dict) -> str:
        """Generate HTML report"""
        # Minimal conversion: escape the markdown and wrap it in <pre> so the
        # output is valid HTML (for production, use a proper markdown-to-HTML
        # library such as `markdown` or `mistune`)
        md_report = self._generate_markdown_report(results)
        return f"<html><body><pre>{html.escape(md_report)}</pre></body></html>"
def _get_analyst_system_prompt(self) -> str:
"""System prompt for analysis tasks"""
return """You are an expert qualitative research analyst with deep expertise in:
- Thematic analysis and coding
- Sentiment analysis and emotional intelligence
- Pattern recognition in qualitative data
- Insight generation and strategic thinking
- Survey research methodology
Your analyses should be:
- Objective and evidence-based
- Nuanced and comprehensive
- Actionable and clear
- Grounded in the actual data provided
Always respond with valid JSON when requested."""
def _extract_text_responses(self, responses: List[Dict]) -> List[str]:
"""Extract text from response objects"""
texts = []
for response in responses:
if isinstance(response, dict):
# Extract all string values
for value in response.values():
if isinstance(value, str) and value.strip():
texts.append(value.strip())
elif isinstance(response, str):
texts.append(response.strip())
return texts
def _format_responses_for_prompt(self, responses: List[str], max_responses: int = 50) -> str:
"""Format responses for inclusion in prompt"""
formatted = []
for i, resp in enumerate(responses[:max_responses], 1):
# Truncate very long responses
truncated = resp[:300] + "..." if len(resp) > 300 else resp
formatted.append(f"{i}. {truncated}")
return "\n".join(formatted)
    def _parse_json_response(self, response: str):
        """Parse JSON from an LLM response.

        Accepts raw JSON, JSON wrapped in ``` or ```json code fences, or
        JSON embedded in surrounding prose.
        """
        response = response.strip()
        # Strip markdown code fences if present
        if "```json" in response:
            start = response.find("```json") + 7
            end = response.find("```", start)
            if end == -1:  # Unterminated fence: take the rest of the string
                end = len(response)
            response = response[start:end].strip()
        elif "```" in response:
            start = response.find("```") + 3
            end = response.find("```", start)
            if end == -1:
                end = len(response)
            response = response[start:end].strip()
        try:
            return json.loads(response)
        except json.JSONDecodeError:
            # Fall back to the outermost JSON object or array in the text
            if "{" in response:
                start = response.find("{")
                end = response.rfind("}") + 1
                return json.loads(response[start:end])
            elif "[" in response:
                start = response.find("[")
                end = response.rfind("]") + 1
                return json.loads(response[start:end])
            raise
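

# Minimal usage sketch (not part of the original module): runs the full
# analysis pipeline end-to-end with inline demo data. StubBackend is a
# hypothetical stand-in for LLMBackend so the demo works without a model;
# its generate() signature mirrors the calls made by the methods above.
if __name__ == "__main__":
    class StubBackend:
        def generate(self, messages, max_tokens=500, temperature=0.5):
            # Return a canned JSON payload wrapped in a code fence, as many
            # chat models do; a real backend would query a model here
            return '```json\n{"overall_sentiment": "positive"}\n```'

    analyzer = DataAnalyzer(StubBackend())  # duck-typed in place of LLMBackend
    demo_responses = [
        {"q1": "I love the product, it saves me hours every week."},
        {"q1": "Setup was confusing, but support was helpful."},
    ]
    results = analyzer.analyze_responses(demo_responses)
    print(analyzer.generate_report(results, format="markdown"))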