""" Response analyzer for population query results Analyzes responses to extract positions, sentiment, themes, and statistics. """ import re from typing import List, Dict, Optional, Tuple from collections import Counter from dataclasses import dataclass from enum import Enum from .sampler import PopulationResponse class Position(str, Enum): """Position on a proposal/question""" STRONGLY_SUPPORT = "strongly_support" SUPPORT = "support" NEUTRAL = "neutral" OPPOSE = "oppose" STRONGLY_OPPOSE = "strongly_oppose" UNCLEAR = "unclear" class Sentiment(str, Enum): """Overall sentiment of response""" VERY_POSITIVE = "very_positive" POSITIVE = "positive" NEUTRAL = "neutral" NEGATIVE = "negative" VERY_NEGATIVE = "very_negative" MIXED = "mixed" @dataclass class ResponseAnalysis: """Analysis of a single response""" variant_id: int position: Position sentiment: Sentiment confidence_score: float # 0-1 key_themes: List[str] word_count: int @dataclass class PopulationResults: """Complete analysis results for a population query""" # Basic info question: str population_size: int base_persona_name: str # Individual analyses individual_analyses: List[ResponseAnalysis] # Position distribution position_counts: Dict[str, int] position_percentages: Dict[str, float] # Sentiment distribution sentiment_counts: Dict[str, int] sentiment_percentages: Dict[str, float] # Common themes top_themes: List[Tuple[str, int]] # (theme, count) theme_clusters: Dict[str, List[int]] # theme -> variant_ids # Statistics average_response_length: float median_response_length: float response_length_range: Tuple[int, int] # Sample responses sample_support: Optional[str] = None sample_oppose: Optional[str] = None sample_neutral: Optional[str] = None class ResponseAnalyzer: """Analyze population query responses""" # Keywords for position detection SUPPORT_KEYWORDS = [ "support", "agree", "favor", "approve", "endorse", "like", "positive", "good idea", "strongly support", "in favor", "beneficial", "excited" ] OPPOSE_KEYWORDS = [ "oppose", "disagree", "against", "reject", "disapprove", "don't support", "bad idea", "strongly oppose", "concerned", "worried", "problematic" ] NEUTRAL_KEYWORDS = [ "neutral", "mixed feelings", "depends", "conditional", "both sides", "need more information", "unclear", "unsure" ] # Keywords for sentiment POSITIVE_SENTIMENT = [ "great", "excellent", "wonderful", "fantastic", "love", "brilliant", "perfect", "amazing", "thrilled", "optimistic", "hopeful" ] NEGATIVE_SENTIMENT = [ "terrible", "awful", "horrible", "disaster", "hate", "concerned", "worried", "fearful", "pessimistic", "unfortunate" ] # Common urban planning themes THEME_KEYWORDS = { "affordability": ["affordable", "cost", "expensive", "price", "rent"], "sustainability": ["sustainable", "green", "environment", "climate", "carbon"], "equity": ["equity", "justice", "fair", "displacement", "gentrification"], "density": ["density", "crowded", "compact", "units"], "transit": ["transit", "bus", "train", "transportation", "commute"], "parking": ["parking", "cars", "vehicles", "garage"], "safety": ["safety", "crime", "secure", "dangerous"], "community": ["community", "neighbors", "neighborhood", "local"], "business": ["business", "economic", "commerce", "jobs", "employment"], "housing": ["housing", "homes", "residential", "apartments"], } def analyze_population( self, responses: List[PopulationResponse] ) -> PopulationResults: """ Analyze all responses from a population query Args: responses: List of PopulationResponse objects Returns: PopulationResults with complete analysis """ # Analyze each response individually individual_analyses = [ self.analyze_single_response(r) for r in responses ] # Calculate distributions position_counts = self._count_positions(individual_analyses) position_percentages = self._calc_percentages( position_counts, len(responses) ) sentiment_counts = self._count_sentiments(individual_analyses) sentiment_percentages = self._calc_percentages( sentiment_counts, len(responses) ) # Extract themes top_themes = self._extract_top_themes(responses, top_n=10) theme_clusters = self._cluster_by_themes(responses, individual_analyses) # Calculate statistics lengths = [len(r.response.split()) for r in responses] avg_length = sum(lengths) / len(lengths) if lengths else 0 sorted_lengths = sorted(lengths) median_length = sorted_lengths[len(sorted_lengths) // 2] if sorted_lengths else 0 # Get sample responses samples = self._get_sample_responses(responses, individual_analyses) return PopulationResults( question=responses[0].question if responses else "", population_size=len(responses), base_persona_name=responses[0].persona.name.split("_")[0] if responses else "", individual_analyses=individual_analyses, position_counts=position_counts, position_percentages=position_percentages, sentiment_counts=sentiment_counts, sentiment_percentages=sentiment_percentages, top_themes=top_themes, theme_clusters=theme_clusters, average_response_length=avg_length, median_response_length=median_length, response_length_range=(min(lengths) if lengths else 0, max(lengths) if lengths else 0), sample_support=samples.get("support"), sample_oppose=samples.get("oppose"), sample_neutral=samples.get("neutral"), ) def analyze_single_response( self, response: PopulationResponse ) -> ResponseAnalysis: """Analyze a single response""" text = response.response.lower() # Detect position position, confidence = self._detect_position(text) # Detect sentiment sentiment = self._detect_sentiment(text) # Extract key themes themes = self._extract_themes(text) # Word count word_count = len(response.response.split()) return ResponseAnalysis( variant_id=response.variant_id, position=position, sentiment=sentiment, confidence_score=confidence, key_themes=themes, word_count=word_count ) def _detect_position(self, text: str) -> Tuple[Position, float]: """Detect position from text with confidence score - improved accuracy""" # Strong indicators - look at first and last sentences (where positions are usually stated) sentences = text.split('.') first_sentence = sentences[0].lower() if sentences else "" last_sentence = sentences[-1].lower() if len(sentences) > 1 else "" # Check for clear positive statements in key positions strong_support_phrases = [ "i support", "i agree", "i approve", "i favor", "i endorse", "strongly support", "strongly agree", "in favor of", "this is a good", "this is beneficial", "i'm excited" ] strong_oppose_phrases = [ "i oppose", "i disagree", "i reject", "i'm against", "strongly oppose", "strongly disagree", "i cannot support", "i don't support", "i can't support", "this is a bad", "i'm concerned", "i'm worried", "i must oppose" ] # Check first and last sentences for strong indicators (weighted heavily) first_last_text = first_sentence + " " + last_sentence support_score = 0 oppose_score = 0 for phrase in strong_support_phrases: if phrase in first_last_text: support_score += 3 # Strong weight for clear statements for phrase in strong_oppose_phrases: if phrase in first_last_text: oppose_score += 3 # Strong weight for clear statements # Count keyword matches in full text (lower weight) support_count = sum( 1 for keyword in self.SUPPORT_KEYWORDS if keyword in text and not any(neg in text for neg in ["don't " + keyword, "can't " + keyword, "won't " + keyword]) ) oppose_count = sum( 1 for keyword in self.OPPOSE_KEYWORDS if keyword in text ) neutral_count = sum( 1 for keyword in self.NEUTRAL_KEYWORDS if keyword in text ) # Combine scores support_score += support_count oppose_score += oppose_count total_score = support_score + oppose_score + neutral_count if total_score == 0: return Position.UNCLEAR, 0.0 # Determine dominant position if support_score > oppose_score and support_score > neutral_count: confidence = support_score / max(total_score, 1) if support_score >= 5: return Position.STRONGLY_SUPPORT, min(confidence, 1.0) return Position.SUPPORT, min(confidence, 1.0) elif oppose_score > support_score and oppose_score > neutral_count: confidence = oppose_score / max(total_score, 1) if oppose_score >= 5: return Position.STRONGLY_OPPOSE, min(confidence, 1.0) return Position.OPPOSE, min(confidence, 1.0) elif neutral_count > 0: confidence = neutral_count / max(total_score, 1) return Position.NEUTRAL, min(confidence, 1.0) return Position.UNCLEAR, 0.3 def _detect_sentiment(self, text: str) -> Sentiment: """Detect overall sentiment""" positive_count = sum( 1 for keyword in self.POSITIVE_SENTIMENT if keyword in text ) negative_count = sum( 1 for keyword in self.NEGATIVE_SENTIMENT if keyword in text ) if positive_count > 0 and negative_count > 0: return Sentiment.MIXED if positive_count >= 3: return Sentiment.VERY_POSITIVE elif positive_count >= 1: return Sentiment.POSITIVE if negative_count >= 3: return Sentiment.VERY_NEGATIVE elif negative_count >= 1: return Sentiment.NEGATIVE return Sentiment.NEUTRAL def _extract_themes(self, text: str) -> List[str]: """Extract key themes from text""" themes = [] for theme, keywords in self.THEME_KEYWORDS.items(): if any(keyword in text for keyword in keywords): themes.append(theme) return themes def _count_positions( self, analyses: List[ResponseAnalysis] ) -> Dict[str, int]: """Count position occurrences""" return dict(Counter(a.position.value for a in analyses)) def _count_sentiments( self, analyses: List[ResponseAnalysis] ) -> Dict[str, int]: """Count sentiment occurrences""" return dict(Counter(a.sentiment.value for a in analyses)) def _calc_percentages( self, counts: Dict[str, int], total: int ) -> Dict[str, float]: """Calculate percentages from counts""" return { key: (count / total * 100) if total > 0 else 0 for key, count in counts.items() } def _extract_top_themes( self, responses: List[PopulationResponse], top_n: int = 10 ) -> List[Tuple[str, int]]: """Extract most common themes across all responses""" all_themes = [] for response in responses: themes = self._extract_themes(response.response.lower()) all_themes.extend(themes) theme_counts = Counter(all_themes) return theme_counts.most_common(top_n) def _cluster_by_themes( self, responses: List[PopulationResponse], analyses: List[ResponseAnalysis] ) -> Dict[str, List[int]]: """Group variant IDs by their key themes""" clusters = {} for analysis in analyses: for theme in analysis.key_themes: if theme not in clusters: clusters[theme] = [] clusters[theme].append(analysis.variant_id) return clusters def _get_sample_responses( self, responses: List[PopulationResponse], analyses: List[ResponseAnalysis] ) -> Dict[str, str]: """Get sample responses for each position""" samples = {} # Find first response of each type for response, analysis in zip(responses, analyses): if "support" not in samples and analysis.position in [Position.SUPPORT, Position.STRONGLY_SUPPORT]: samples["support"] = response.response elif "oppose" not in samples and analysis.position in [Position.OPPOSE, Position.STRONGLY_OPPOSE]: samples["oppose"] = response.response elif "neutral" not in samples and analysis.position == Position.NEUTRAL: samples["neutral"] = response.response return samples