Spaces:
Sleeping
Sleeping
| """ | |
| Response analyzer for population query results | |
| Analyzes responses to extract positions, sentiment, themes, and statistics. | |
| """ | |
| import re | |
| from typing import List, Dict, Optional, Tuple | |
| from collections import Counter | |
| from dataclasses import dataclass | |
| from enum import Enum | |
| from .sampler import PopulationResponse | |
class Position(str, Enum):
    """Stance a respondent takes toward the proposal or question.

    Inherits from str so members compare equal to their plain string
    values and serialize transparently.
    """

    STRONGLY_SUPPORT = "strongly_support"
    SUPPORT = "support"
    NEUTRAL = "neutral"
    OPPOSE = "oppose"
    STRONGLY_OPPOSE = "strongly_oppose"
    UNCLEAR = "unclear"  # no stance could be inferred from the text
class Sentiment(str, Enum):
    """Overall emotional tone of a response.

    Inherits from str so members compare equal to their plain string
    values and serialize transparently.
    """

    VERY_POSITIVE = "very_positive"
    POSITIVE = "positive"
    NEUTRAL = "neutral"
    NEGATIVE = "negative"
    VERY_NEGATIVE = "very_negative"
    MIXED = "mixed"  # both positive and negative cues present
@dataclass
class ResponseAnalysis:
    """Analysis of a single response.

    FIX: the class declared annotated fields but was missing the
    @dataclass decorator, so keyword construction (as done in
    ResponseAnalyzer.analyze_single_response) would raise TypeError.
    Sibling types are quoted as forward references.
    """

    variant_id: int            # id of the persona variant that answered
    position: "Position"       # detected stance on the question
    sentiment: "Sentiment"     # detected overall tone
    confidence_score: float    # position-detection confidence, 0-1
    key_themes: List[str]      # urban-planning themes matched in the text
    word_count: int            # whitespace-delimited word count
@dataclass
class PopulationResults:
    """Complete analysis results for a population query.

    FIX: the class declared annotated fields (including defaults for the
    sample_* fields) but was missing the @dataclass decorator, so the
    keyword construction in ResponseAnalyzer.analyze_population would
    raise TypeError and the defaults had no effect.
    """

    # Basic info
    question: str
    population_size: int
    base_persona_name: str

    # Individual analyses (forward ref: ResponseAnalysis is a sibling class)
    individual_analyses: "List[ResponseAnalysis]"

    # Position distribution
    position_counts: Dict[str, int]
    position_percentages: Dict[str, float]

    # Sentiment distribution
    sentiment_counts: Dict[str, int]
    sentiment_percentages: Dict[str, float]

    # Common themes
    top_themes: List[Tuple[str, int]]     # (theme, count), most common first
    theme_clusters: Dict[str, List[int]]  # theme -> variant_ids mentioning it

    # Response-length statistics (whitespace-delimited word counts)
    average_response_length: float
    median_response_length: float
    response_length_range: Tuple[int, int]  # (min, max)

    # Representative sample responses per position (None when absent)
    sample_support: Optional[str] = None
    sample_oppose: Optional[str] = None
    sample_neutral: Optional[str] = None
class ResponseAnalyzer:
    """Analyze population query responses.

    All classification is heuristic: position and sentiment are inferred
    from substring matches against curated keyword lists, and themes from
    an urban-planning keyword map. Aggregate methods then compute
    distributions, theme clusters, and response-length statistics across
    the whole population.

    FIX: the population "median" previously returned the upper-middle
    element for even-sized populations; it now averages the two middle
    values (see _median).
    """

    # Keywords for position detection (matched as substrings of the
    # lowercased response text).
    SUPPORT_KEYWORDS = [
        "support", "agree", "favor", "approve", "endorse", "like", "positive",
        "good idea", "strongly support", "in favor", "beneficial", "excited"
    ]

    OPPOSE_KEYWORDS = [
        "oppose", "disagree", "against", "reject", "disapprove", "don't support",
        "bad idea", "strongly oppose", "concerned", "worried", "problematic"
    ]

    NEUTRAL_KEYWORDS = [
        "neutral", "mixed feelings", "depends", "conditional", "both sides",
        "need more information", "unclear", "unsure"
    ]

    # Keywords for sentiment detection.
    POSITIVE_SENTIMENT = [
        "great", "excellent", "wonderful", "fantastic", "love", "brilliant",
        "perfect", "amazing", "thrilled", "optimistic", "hopeful"
    ]

    NEGATIVE_SENTIMENT = [
        "terrible", "awful", "horrible", "disaster", "hate", "concerned",
        "worried", "fearful", "pessimistic", "unfortunate"
    ]

    # Common urban-planning themes -> trigger keywords.
    THEME_KEYWORDS = {
        "affordability": ["affordable", "cost", "expensive", "price", "rent"],
        "sustainability": ["sustainable", "green", "environment", "climate", "carbon"],
        "equity": ["equity", "justice", "fair", "displacement", "gentrification"],
        "density": ["density", "crowded", "compact", "units"],
        "transit": ["transit", "bus", "train", "transportation", "commute"],
        "parking": ["parking", "cars", "vehicles", "garage"],
        "safety": ["safety", "crime", "secure", "dangerous"],
        "community": ["community", "neighbors", "neighborhood", "local"],
        "business": ["business", "economic", "commerce", "jobs", "employment"],
        "housing": ["housing", "homes", "residential", "apartments"],
    }

    def analyze_population(
        self,
        responses: "List[PopulationResponse]"
    ) -> "PopulationResults":
        """
        Analyze all responses from a population query.

        Args:
            responses: List of PopulationResponse objects (may be empty;
                an empty list yields zeroed statistics and empty strings).

        Returns:
            PopulationResults with per-response analyses, position and
            sentiment distributions, theme clusters, length statistics,
            and one representative sample response per position.
        """
        # Analyze each response individually
        individual_analyses = [
            self.analyze_single_response(r) for r in responses
        ]

        # Position / sentiment distributions
        position_counts = self._count_positions(individual_analyses)
        position_percentages = self._calc_percentages(
            position_counts, len(responses)
        )

        sentiment_counts = self._count_sentiments(individual_analyses)
        sentiment_percentages = self._calc_percentages(
            sentiment_counts, len(responses)
        )

        # Themes across the whole population
        top_themes = self._extract_top_themes(responses, top_n=10)
        theme_clusters = self._cluster_by_themes(responses, individual_analyses)

        # Length statistics (whitespace-delimited word counts)
        lengths = [len(r.response.split()) for r in responses]
        avg_length = sum(lengths) / len(lengths) if lengths else 0
        median_length = self._median(lengths)

        # Representative sample responses per position
        samples = self._get_sample_responses(responses, individual_analyses)

        # NOTE(review): variant persona names presumably follow a
        # "<base>_<n>" pattern; splitting on "_" recovers the base name.
        return PopulationResults(
            question=responses[0].question if responses else "",
            population_size=len(responses),
            base_persona_name=responses[0].persona.name.split("_")[0] if responses else "",
            individual_analyses=individual_analyses,
            position_counts=position_counts,
            position_percentages=position_percentages,
            sentiment_counts=sentiment_counts,
            sentiment_percentages=sentiment_percentages,
            top_themes=top_themes,
            theme_clusters=theme_clusters,
            average_response_length=avg_length,
            median_response_length=median_length,
            response_length_range=(min(lengths) if lengths else 0, max(lengths) if lengths else 0),
            sample_support=samples.get("support"),
            sample_oppose=samples.get("oppose"),
            sample_neutral=samples.get("neutral"),
        )

    @staticmethod
    def _median(values: List[int]) -> float:
        """Return the true median of *values*, or 0 for an empty list.

        For even-sized input this averages the two middle values; the
        previous inline computation returned only the upper-middle element.
        """
        if not values:
            return 0
        ordered = sorted(values)
        mid = len(ordered) // 2
        if len(ordered) % 2 == 1:
            return ordered[mid]
        return (ordered[mid - 1] + ordered[mid]) / 2

    def analyze_single_response(
        self,
        response: "PopulationResponse"
    ) -> "ResponseAnalysis":
        """Analyze a single response: position, sentiment, themes, length.

        Args:
            response: PopulationResponse carrying the free-text answer.

        Returns:
            ResponseAnalysis for this response's variant_id.
        """
        text = response.response.lower()

        # Detect position (with a 0-1 confidence score)
        position, confidence = self._detect_position(text)

        # Detect sentiment
        sentiment = self._detect_sentiment(text)

        # Extract key themes
        themes = self._extract_themes(text)

        # Word count (whitespace-delimited, on the original casing)
        word_count = len(response.response.split())

        return ResponseAnalysis(
            variant_id=response.variant_id,
            position=position,
            sentiment=sentiment,
            confidence_score=confidence,
            key_themes=themes,
            word_count=word_count
        )

    def _detect_position(self, text: str) -> "Tuple[Position, float]":
        """Detect position from (lowercased) text with a confidence score.

        Heuristic: explicit first-person phrases found in the first or last
        sentence (where positions are usually stated) score 3 points each;
        plain keyword matches anywhere score 1 point each. Confidence is
        the winning side's share of the total score. A winning score >= 5
        upgrades to the STRONGLY_* variant.

        Returns:
            (Position, confidence in 0-1); (UNCLEAR, 0.0) when nothing
            matches, (UNCLEAR, 0.3) on an unresolved tie.
        """
        # Look at first and last sentences, where positions are usually
        # stated. (.lower() is redundant when called via
        # analyze_single_response but kept as a cheap safeguard.)
        sentences = text.split('.')
        first_sentence = sentences[0].lower() if sentences else ""
        last_sentence = sentences[-1].lower() if len(sentences) > 1 else ""

        # Explicit first-person statements, weighted heavily below.
        strong_support_phrases = [
            "i support", "i agree", "i approve", "i favor", "i endorse",
            "strongly support", "strongly agree", "in favor of",
            "this is a good", "this is beneficial", "i'm excited"
        ]
        strong_oppose_phrases = [
            "i oppose", "i disagree", "i reject", "i'm against",
            "strongly oppose", "strongly disagree", "i cannot support",
            "i don't support", "i can't support", "this is a bad",
            "i'm concerned", "i'm worried", "i must oppose"
        ]

        # Check first and last sentences for strong indicators
        first_last_text = first_sentence + " " + last_sentence
        support_score = 0
        oppose_score = 0

        for phrase in strong_support_phrases:
            if phrase in first_last_text:
                support_score += 3  # strong weight for clear statements
        for phrase in strong_oppose_phrases:
            if phrase in first_last_text:
                oppose_score += 3  # strong weight for clear statements

        # Keyword matches in the full text (weight 1). Support keywords are
        # suppressed when a negated form ("don't support", ...) appears
        # anywhere in the text.
        support_count = sum(
            1 for keyword in self.SUPPORT_KEYWORDS
            if keyword in text and not any(neg in text for neg in ["don't " + keyword, "can't " + keyword, "won't " + keyword])
        )
        oppose_count = sum(
            1 for keyword in self.OPPOSE_KEYWORDS
            if keyword in text
        )
        neutral_count = sum(
            1 for keyword in self.NEUTRAL_KEYWORDS
            if keyword in text
        )

        # Combine phrase scores and keyword counts
        support_score += support_count
        oppose_score += oppose_count
        total_score = support_score + oppose_score + neutral_count

        if total_score == 0:
            return Position.UNCLEAR, 0.0

        # Determine dominant position; >= 5 upgrades to STRONGLY_*
        if support_score > oppose_score and support_score > neutral_count:
            confidence = support_score / max(total_score, 1)
            if support_score >= 5:
                return Position.STRONGLY_SUPPORT, min(confidence, 1.0)
            return Position.SUPPORT, min(confidence, 1.0)
        elif oppose_score > support_score and oppose_score > neutral_count:
            confidence = oppose_score / max(total_score, 1)
            if oppose_score >= 5:
                return Position.STRONGLY_OPPOSE, min(confidence, 1.0)
            return Position.OPPOSE, min(confidence, 1.0)
        elif neutral_count > 0:
            confidence = neutral_count / max(total_score, 1)
            return Position.NEUTRAL, min(confidence, 1.0)

        # Tie between support and oppose with no neutral cues
        return Position.UNCLEAR, 0.3

    def _detect_sentiment(self, text: str) -> "Sentiment":
        """Detect overall sentiment of (lowercased) text.

        Any mix of positive and negative cues is MIXED; otherwise 3+
        matches on one side upgrades to the VERY_* variant.
        """
        positive_count = sum(
            1 for keyword in self.POSITIVE_SENTIMENT
            if keyword in text
        )
        negative_count = sum(
            1 for keyword in self.NEGATIVE_SENTIMENT
            if keyword in text
        )

        if positive_count > 0 and negative_count > 0:
            return Sentiment.MIXED

        if positive_count >= 3:
            return Sentiment.VERY_POSITIVE
        elif positive_count >= 1:
            return Sentiment.POSITIVE

        if negative_count >= 3:
            return Sentiment.VERY_NEGATIVE
        elif negative_count >= 1:
            return Sentiment.NEGATIVE

        return Sentiment.NEUTRAL

    def _extract_themes(self, text: str) -> List[str]:
        """Return themes whose keywords appear in (lowercased) text.

        Order follows THEME_KEYWORDS insertion order; each theme appears
        at most once.
        """
        themes = []
        for theme, keywords in self.THEME_KEYWORDS.items():
            if any(keyword in text for keyword in keywords):
                themes.append(theme)
        return themes

    def _count_positions(
        self,
        analyses: "List[ResponseAnalysis]"
    ) -> Dict[str, int]:
        """Count occurrences of each position value."""
        return dict(Counter(a.position.value for a in analyses))

    def _count_sentiments(
        self,
        analyses: "List[ResponseAnalysis]"
    ) -> Dict[str, int]:
        """Count occurrences of each sentiment value."""
        return dict(Counter(a.sentiment.value for a in analyses))

    def _calc_percentages(
        self,
        counts: Dict[str, int],
        total: int
    ) -> Dict[str, float]:
        """Convert counts to percentages of *total* (0 when total is 0)."""
        return {
            key: (count / total * 100) if total > 0 else 0
            for key, count in counts.items()
        }

    def _extract_top_themes(
        self,
        responses: "List[PopulationResponse]",
        top_n: int = 10
    ) -> List[Tuple[str, int]]:
        """Return the *top_n* most common themes across all responses as
        (theme, count) pairs, most common first."""
        all_themes = []
        for response in responses:
            all_themes.extend(self._extract_themes(response.response.lower()))
        return Counter(all_themes).most_common(top_n)

    def _cluster_by_themes(
        self,
        responses: "List[PopulationResponse]",
        analyses: "List[ResponseAnalysis]"
    ) -> Dict[str, List[int]]:
        """Group variant IDs by their key themes.

        *responses* is unused but kept for interface compatibility.
        """
        clusters: Dict[str, List[int]] = {}
        for analysis in analyses:
            for theme in analysis.key_themes:
                clusters.setdefault(theme, []).append(analysis.variant_id)
        return clusters

    def _get_sample_responses(
        self,
        responses: "List[PopulationResponse]",
        analyses: "List[ResponseAnalysis]"
    ) -> Dict[str, str]:
        """Return the first response text found for each of the keys
        "support", "oppose", and "neutral" (keys absent when no response
        holds that position)."""
        samples = {}
        for response, analysis in zip(responses, analyses):
            if "support" not in samples and analysis.position in [Position.SUPPORT, Position.STRONGLY_SUPPORT]:
                samples["support"] = response.response
            elif "oppose" not in samples and analysis.position in [Position.OPPOSE, Position.STRONGLY_OPPOSE]:
                samples["oppose"] = response.response
            elif "neutral" not in samples and analysis.position == Position.NEUTRAL:
                samples["neutral"] = response.response
        return samples