# ProjectEcho / data_analyzer.py
"""
Data Analysis Module - AI-assisted analysis of survey responses
"""
import html
import json
from typing import Dict, List, Optional
from llm_backend import LLMBackend
class DataAnalyzer:
"""
Analyzes survey responses to uncover key findings, trends, and patterns.
Provides AI-assisted summaries for qualitative research data.
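
    Example (a minimal sketch; assumes an LLMBackend that exposes the
    generate(messages, max_tokens=..., temperature=...) method used below):

        analyzer = DataAnalyzer(LLMBackend())
        results = analyzer.analyze_responses(responses, questions)
        report = analyzer.generate_report(results, format="markdown")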
"""
def __init__(self, llm_backend: LLMBackend):
self.llm = llm_backend
    def analyze_responses(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> Dict:
"""
Comprehensive analysis of survey responses.
Args:
responses: List of response dictionaries
questions: Optional list of questions for context
Returns:
Analysis results including themes, sentiment, and insights
"""
if not responses:
return {"error": "No responses to analyze"}
analysis = {
"summary": {},
"themes": [],
"sentiment": {},
"key_insights": [],
"response_count": len(responses)
}
# Generate overall summary
analysis["summary"] = self._generate_summary(responses, questions)
# Extract themes
analysis["themes"] = self._extract_themes(responses)
# Analyze sentiment
analysis["sentiment"] = self._analyze_sentiment(responses)
# Generate key insights
analysis["key_insights"] = self._generate_insights(responses, questions)
# Add quantitative stats if applicable
analysis["statistics"] = self._compute_statistics(responses, questions)
return analysis
    def _generate_summary(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> Dict:
"""Generate an executive summary of responses"""
# Prepare context
response_texts = self._extract_text_responses(responses)
        sample_size = min(50, len(response_texts))  # Cap at the first 50 responses for large datasets
sample_responses = response_texts[:sample_size]
context = f"Total responses: {len(responses)}\n\n"
if questions:
context += "Questions asked:\n"
for i, q in enumerate(questions[:10], 1): # Limit to first 10 questions
context += f"{i}. {q.get('question_text', '')}\n"
context += "\n"
context += "Sample responses:\n"
        for i, resp in enumerate(sample_responses, 1):
            snippet = resp[:200] + "..." if len(resp) > 200 else resp  # Truncate long responses
            context += f"{i}. {snippet}\n"
prompt = f"""Analyze the following survey responses and provide an executive summary.
{context}
Provide a summary that includes:
1. Overview: High-level summary of what the data shows (2-3 sentences)
2. Key patterns: Main patterns or trends observed
3. Notable findings: Interesting or unexpected discoveries
4. Response quality: Assessment of response depth and engagement
Respond with a JSON object with these fields:
{{
"overview": "...",
"key_patterns": ["pattern 1", "pattern 2", ...],
"notable_findings": ["finding 1", "finding 2", ...],
"response_quality": "..."
}}"""
messages = [
{"role": "system", "content": self._get_analyst_system_prompt()},
{"role": "user", "content": prompt}
]
try:
response = self.llm.generate(messages, max_tokens=1000, temperature=0.5)
return self._parse_json_response(response)
except Exception as e:
return {"error": f"Summary generation failed: {str(e)}"}
def _extract_themes(self, responses: List[Dict], num_themes: int = 5) -> List[Dict]:
"""Extract main themes from responses using AI"""
response_texts = self._extract_text_responses(responses)
if not response_texts:
return []
        # Cap at the first 100 responses for large datasets (head slice, not a random sample)
sample_size = min(100, len(response_texts))
sample_responses = response_texts[:sample_size]
prompt = f"""Analyze the following {len(sample_responses)} survey responses and identify the top {num_themes} themes.
Responses:
{self._format_responses_for_prompt(sample_responses)}
For each theme, provide:
1. Theme name: A short, descriptive name
2. Description: What this theme represents
3. Prevalence: Estimated percentage of responses mentioning this theme
4. Example quotes: 2-3 representative quotes from the responses
Respond with a JSON array of theme objects:
[
{{
"theme_name": "...",
"description": "...",
"prevalence": "XX%",
"example_quotes": ["quote 1", "quote 2"]
}}
]"""
messages = [
{"role": "system", "content": self._get_analyst_system_prompt()},
{"role": "user", "content": prompt}
]
try:
response = self.llm.generate(messages, max_tokens=1500, temperature=0.6)
themes = self._parse_json_response(response)
if isinstance(themes, list):
return themes
return []
except Exception as e:
return [{"error": f"Theme extraction failed: {str(e)}"}]
def _analyze_sentiment(self, responses: List[Dict]) -> Dict:
"""Analyze overall sentiment of responses"""
response_texts = self._extract_text_responses(responses)
if not response_texts:
return {}
        # Cap at the first 100 responses for analysis (head slice, not a random sample)
sample_size = min(100, len(response_texts))
sample_responses = response_texts[:sample_size]
prompt = f"""Analyze the sentiment of these {len(sample_responses)} survey responses.
Responses:
{self._format_responses_for_prompt(sample_responses)}
Provide sentiment analysis including:
1. Overall sentiment: positive, negative, neutral, or mixed
2. Sentiment distribution: Estimated percentage breakdown
3. Emotional tone: Key emotions detected
4. Intensity: How strong the sentiments are
Respond with JSON:
{{
"overall_sentiment": "...",
"distribution": {{
"positive": "XX%",
"neutral": "XX%",
"negative": "XX%"
}},
"emotions": ["emotion1", "emotion2", ...],
"intensity": "low|moderate|high"
}}"""
messages = [
{"role": "system", "content": self._get_analyst_system_prompt()},
{"role": "user", "content": prompt}
]
try:
response = self.llm.generate(messages, max_tokens=500, temperature=0.4)
return self._parse_json_response(response)
except Exception as e:
return {"error": f"Sentiment analysis failed: {str(e)}"}
    def _generate_insights(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> List[str]:
"""Generate actionable insights from the data"""
response_texts = self._extract_text_responses(responses)
if not response_texts:
return []
sample_size = min(100, len(response_texts))
sample_responses = response_texts[:sample_size]
context = f"Analyzing {len(responses)} survey responses.\n\n"
if questions:
context += "Research questions:\n"
for i, q in enumerate(questions[:5], 1):
context += f"{i}. {q.get('question_text', '')}\n"
context += "\n"
prompt = f"""{context}
Sample responses:
{self._format_responses_for_prompt(sample_responses)}
Based on this data, provide 5-7 key insights that would be valuable for:
- Understanding the target audience
- Identifying opportunities or challenges
- Informing strategic decisions
- Recognizing patterns or trends
Each insight should be:
- Specific and actionable
- Supported by the data
- Clear and concise
Respond with a JSON array of insight strings:
["insight 1", "insight 2", ...]"""
messages = [
{"role": "system", "content": self._get_analyst_system_prompt()},
{"role": "user", "content": prompt}
]
try:
response = self.llm.generate(messages, max_tokens=1000, temperature=0.6)
insights = self._parse_json_response(response)
if isinstance(insights, list):
return insights
return []
except Exception as e:
return [f"Insight generation failed: {str(e)}"]
    def _compute_statistics(self, responses: List[Dict], questions: Optional[List[Dict]] = None) -> Dict:
"""Compute basic statistics from responses"""
stats = {
"total_responses": len(responses),
"response_lengths": {},
"completion_rate": "N/A"
}
# Calculate average response length
response_texts = self._extract_text_responses(responses)
if response_texts:
lengths = [len(r.split()) for r in response_texts]
stats["response_lengths"] = {
"avg_words": sum(lengths) / len(lengths),
"min_words": min(lengths),
"max_words": max(lengths)
}
# Calculate completion rate if questions are provided
if questions:
total_questions = len(questions)
completed_questions = 0
for response in responses:
if isinstance(response, dict):
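                    # Count answered (truthy) fields; assumes each response key maps to one question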
completed_questions += len([v for v in response.values() if v])
if total_questions > 0:
completion_rate = (completed_questions / (total_questions * len(responses))) * 100
stats["completion_rate"] = f"{completion_rate:.1f}%"
return stats
def generate_report(self, analysis_results: Dict, format: str = "markdown") -> str:
"""
Generate a formatted report from analysis results.
Args:
analysis_results: Results from analyze_responses()
format: Output format (markdown, text, html)
Returns:
Formatted report string
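
        Example:
            report = analyzer.generate_report(results, format="html")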
"""
if format == "markdown":
return self._generate_markdown_report(analysis_results)
elif format == "html":
return self._generate_html_report(analysis_results)
else:
return self._generate_text_report(analysis_results)
def _generate_markdown_report(self, results: Dict) -> str:
"""Generate markdown formatted report"""
report = "# Survey Analysis Report\n\n"
# Summary section
if "summary" in results and results["summary"]:
report += "## Executive Summary\n\n"
summary = results["summary"]
if "overview" in summary:
report += f"{summary['overview']}\n\n"
if "key_patterns" in summary:
report += "### Key Patterns\n"
for pattern in summary["key_patterns"]:
report += f"- {pattern}\n"
report += "\n"
# Statistics
if "statistics" in results:
report += "## Response Statistics\n\n"
stats = results["statistics"]
report += f"- Total Responses: {stats.get('total_responses', 'N/A')}\n"
if "response_lengths" in stats:
rl = stats["response_lengths"]
report += f"- Average Response Length: {rl.get('avg_words', 0):.1f} words\n"
report += f"- Completion Rate: {stats.get('completion_rate', 'N/A')}\n\n"
# Themes
if "themes" in results and results["themes"]:
report += "## Main Themes\n\n"
for i, theme in enumerate(results["themes"], 1):
if isinstance(theme, dict) and "theme_name" in theme:
report += f"### {i}. {theme['theme_name']}\n"
report += f"{theme.get('description', '')}\n\n"
report += f"**Prevalence:** {theme.get('prevalence', 'N/A')}\n\n"
if "example_quotes" in theme:
report += "**Example quotes:**\n"
for quote in theme["example_quotes"]:
report += f"> {quote}\n"
report += "\n"
# Sentiment
if "sentiment" in results and results["sentiment"]:
report += "## Sentiment Analysis\n\n"
sent = results["sentiment"]
report += f"**Overall Sentiment:** {sent.get('overall_sentiment', 'N/A')}\n\n"
if "distribution" in sent:
report += "**Distribution:**\n"
for key, value in sent["distribution"].items():
report += f"- {key.title()}: {value}\n"
report += "\n"
# Key Insights
if "key_insights" in results and results["key_insights"]:
report += "## Key Insights\n\n"
for i, insight in enumerate(results["key_insights"], 1):
report += f"{i}. {insight}\n"
report += "\n"
return report
    def _generate_text_report(self, results: Dict) -> str:
        """Generate plain text report"""
        # Strip markdown markers from the markdown report; note this also
        # removes any literal '#', '**', or '>' characters in the content
        md_report = self._generate_markdown_report(results)
        return md_report.replace("#", "").replace("**", "").replace(">", "")
    def _generate_html_report(self, results: Dict) -> str:
        """Generate HTML report"""
        # Minimal conversion: escape the markdown and wrap it in <pre> so the
        # output is valid HTML (for production, use a proper markdown-to-HTML
        # library such as `markdown` or `mistune`)
        md_report = self._generate_markdown_report(results)
        return f"<html><body><pre>{html.escape(md_report)}</pre></body></html>"
def _get_analyst_system_prompt(self) -> str:
"""System prompt for analysis tasks"""
return """You are an expert qualitative research analyst with deep expertise in:
- Thematic analysis and coding
- Sentiment analysis and emotional intelligence
- Pattern recognition in qualitative data
- Insight generation and strategic thinking
- Survey research methodology
Your analyses should be:
- Objective and evidence-based
- Nuanced and comprehensive
- Actionable and clear
- Grounded in the actual data provided
Always respond with valid JSON when requested."""
def _extract_text_responses(self, responses: List[Dict]) -> List[str]:
"""Extract text from response objects"""
texts = []
for response in responses:
if isinstance(response, dict):
# Extract all string values
for value in response.values():
if isinstance(value, str) and value.strip():
texts.append(value.strip())
elif isinstance(response, str):
texts.append(response.strip())
return texts
def _format_responses_for_prompt(self, responses: List[str], max_responses: int = 50) -> str:
"""Format responses for inclusion in prompt"""
formatted = []
for i, resp in enumerate(responses[:max_responses], 1):
# Truncate very long responses
truncated = resp[:300] + "..." if len(resp) > 300 else resp
formatted.append(f"{i}. {truncated}")
return "\n".join(formatted)
    def _parse_json_response(self, response: str):
        """Parse JSON from an LLM response.

        Accepts raw JSON, JSON wrapped in ``` or ```json code fences, or
        JSON embedded in surrounding prose.
        """
        response = response.strip()
        # Strip markdown code fences if present
        if "```json" in response:
            start = response.find("```json") + 7
            end = response.find("```", start)
            if end == -1:  # Unterminated fence: take the rest of the string
                end = len(response)
            response = response[start:end].strip()
        elif "```" in response:
            start = response.find("```") + 3
            end = response.find("```", start)
            if end == -1:
                end = len(response)
            response = response[start:end].strip()
        try:
            return json.loads(response)
        except json.JSONDecodeError:
            # Fall back to the outermost JSON object or array in the text
            if "{" in response:
                start = response.find("{")
                end = response.rfind("}") + 1
                return json.loads(response[start:end])
            elif "[" in response:
                start = response.find("[")
                end = response.rfind("]") + 1
                return json.loads(response[start:end])
            raise
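

# Minimal usage sketch (not part of the original module): runs the full
# analysis pipeline end-to-end with inline demo data. StubBackend is a
# hypothetical stand-in for LLMBackend so the demo works without a model;
# its generate() signature mirrors the calls made by the methods above.
if __name__ == "__main__":
    class StubBackend:
        def generate(self, messages, max_tokens=500, temperature=0.5):
            # Return a canned JSON payload wrapped in a code fence, as many
            # chat models do; a real backend would query a model here
            return '```json\n{"overall_sentiment": "positive"}\n```'

    analyzer = DataAnalyzer(StubBackend())  # duck-typed in place of LLMBackend
    demo_responses = [
        {"q1": "I love the product, it saves me hours every week."},
        {"q1": "Setup was confusing, but support was helpful."},
    ]
    results = analyzer.analyze_responses(demo_responses)
    print(analyzer.generate_report(results, format="markdown"))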