# Commit 7e03ed8 (Claude, unverified):
# Fix Population Analysis bugs: question persistence and position detection
"""
Response analyzer for population query results
Analyzes responses to extract positions, sentiment, themes, and statistics.
"""
import re
from typing import List, Dict, Optional, Tuple
from collections import Counter
from dataclasses import dataclass
from enum import Enum
from .sampler import PopulationResponse
class Position(str, Enum):
    """Position on a proposal/question.

    Mixes in ``str`` so members compare to and serialize as their plain
    string values (used as dict keys in the distribution counts).
    """
    STRONGLY_SUPPORT = "strongly_support"
    SUPPORT = "support"
    NEUTRAL = "neutral"
    OPPOSE = "oppose"
    STRONGLY_OPPOSE = "strongly_oppose"
    UNCLEAR = "unclear"  # no position could be detected from the text
class Sentiment(str, Enum):
    """Overall sentiment of a response.

    Mixes in ``str`` so members compare to and serialize as their plain
    string values (used as dict keys in the distribution counts).
    """
    VERY_POSITIVE = "very_positive"
    POSITIVE = "positive"
    NEUTRAL = "neutral"
    NEGATIVE = "negative"
    VERY_NEGATIVE = "very_negative"
    MIXED = "mixed"  # both positive and negative signals present
@dataclass
class ResponseAnalysis:
    """Analysis of a single response from one population variant."""
    variant_id: int          # id of the persona variant that produced the response
    position: Position       # detected stance on the question
    sentiment: Sentiment     # detected overall tone
    confidence_score: float  # 0-1, strength of the position detection
    key_themes: List[str]    # theme labels whose keywords matched the text
    word_count: int          # whitespace-delimited word count of the response
@dataclass
class PopulationResults:
    """Complete analysis results for a population query."""
    # Basic info
    question: str            # the question that was posed to the population
    population_size: int     # number of responses analyzed
    base_persona_name: str   # base persona the variants were derived from
    # Individual analyses, one per response, in input order
    individual_analyses: List[ResponseAnalysis]
    # Position distribution (keys are Position.value strings)
    position_counts: Dict[str, int]
    position_percentages: Dict[str, float]   # percent of population, 0-100
    # Sentiment distribution (keys are Sentiment.value strings)
    sentiment_counts: Dict[str, int]
    sentiment_percentages: Dict[str, float]  # percent of population, 0-100
    # Common themes
    top_themes: List[Tuple[str, int]]     # (theme, count), most common first
    theme_clusters: Dict[str, List[int]]  # theme -> variant_ids mentioning it
    # Response-length statistics (word counts)
    average_response_length: float
    median_response_length: float
    response_length_range: Tuple[int, int]  # (min, max)
    # Sample responses; None when no response took that position
    sample_support: Optional[str] = None
    sample_oppose: Optional[str] = None
    sample_neutral: Optional[str] = None
class ResponseAnalyzer:
    """Analyze population query responses.

    Pure keyword/substring heuristics over lowercased response text; no
    external NLP dependencies.
    """

    # Keywords for position detection, substring-matched against the
    # lowercased response text.
    SUPPORT_KEYWORDS = [
        "support", "agree", "favor", "approve", "endorse", "like", "positive",
        "good idea", "strongly support", "in favor", "beneficial", "excited"
    ]
    OPPOSE_KEYWORDS = [
        "oppose", "disagree", "against", "reject", "disapprove", "don't support",
        "bad idea", "strongly oppose", "concerned", "worried", "problematic"
    ]
    NEUTRAL_KEYWORDS = [
        "neutral", "mixed feelings", "depends", "conditional", "both sides",
        "need more information", "unclear", "unsure"
    ]

    # Keywords for sentiment detection.  NOTE: "concerned"/"worried" also
    # appear in OPPOSE_KEYWORDS, so worried responses score on both axes.
    POSITIVE_SENTIMENT = [
        "great", "excellent", "wonderful", "fantastic", "love", "brilliant",
        "perfect", "amazing", "thrilled", "optimistic", "hopeful"
    ]
    NEGATIVE_SENTIMENT = [
        "terrible", "awful", "horrible", "disaster", "hate", "concerned",
        "worried", "fearful", "pessimistic", "unfortunate"
    ]

    # Common urban planning themes: theme label -> trigger keywords
    THEME_KEYWORDS = {
        "affordability": ["affordable", "cost", "expensive", "price", "rent"],
        "sustainability": ["sustainable", "green", "environment", "climate", "carbon"],
        "equity": ["equity", "justice", "fair", "displacement", "gentrification"],
        "density": ["density", "crowded", "compact", "units"],
        "transit": ["transit", "bus", "train", "transportation", "commute"],
        "parking": ["parking", "cars", "vehicles", "garage"],
        "safety": ["safety", "crime", "secure", "dangerous"],
        "community": ["community", "neighbors", "neighborhood", "local"],
        "business": ["business", "economic", "commerce", "jobs", "employment"],
        "housing": ["housing", "homes", "residential", "apartments"],
    }
def analyze_population(
self,
responses: List[PopulationResponse]
) -> PopulationResults:
"""
Analyze all responses from a population query
Args:
responses: List of PopulationResponse objects
Returns:
PopulationResults with complete analysis
"""
# Analyze each response individually
individual_analyses = [
self.analyze_single_response(r) for r in responses
]
# Calculate distributions
position_counts = self._count_positions(individual_analyses)
position_percentages = self._calc_percentages(
position_counts, len(responses)
)
sentiment_counts = self._count_sentiments(individual_analyses)
sentiment_percentages = self._calc_percentages(
sentiment_counts, len(responses)
)
# Extract themes
top_themes = self._extract_top_themes(responses, top_n=10)
theme_clusters = self._cluster_by_themes(responses, individual_analyses)
# Calculate statistics
lengths = [len(r.response.split()) for r in responses]
avg_length = sum(lengths) / len(lengths) if lengths else 0
sorted_lengths = sorted(lengths)
median_length = sorted_lengths[len(sorted_lengths) // 2] if sorted_lengths else 0
# Get sample responses
samples = self._get_sample_responses(responses, individual_analyses)
return PopulationResults(
question=responses[0].question if responses else "",
population_size=len(responses),
base_persona_name=responses[0].persona.name.split("_")[0] if responses else "",
individual_analyses=individual_analyses,
position_counts=position_counts,
position_percentages=position_percentages,
sentiment_counts=sentiment_counts,
sentiment_percentages=sentiment_percentages,
top_themes=top_themes,
theme_clusters=theme_clusters,
average_response_length=avg_length,
median_response_length=median_length,
response_length_range=(min(lengths) if lengths else 0, max(lengths) if lengths else 0),
sample_support=samples.get("support"),
sample_oppose=samples.get("oppose"),
sample_neutral=samples.get("neutral"),
)
def analyze_single_response(
self,
response: PopulationResponse
) -> ResponseAnalysis:
"""Analyze a single response"""
text = response.response.lower()
# Detect position
position, confidence = self._detect_position(text)
# Detect sentiment
sentiment = self._detect_sentiment(text)
# Extract key themes
themes = self._extract_themes(text)
# Word count
word_count = len(response.response.split())
return ResponseAnalysis(
variant_id=response.variant_id,
position=position,
sentiment=sentiment,
confidence_score=confidence,
key_themes=themes,
word_count=word_count
)
def _detect_position(self, text: str) -> Tuple[Position, float]:
"""Detect position from text with confidence score - improved accuracy"""
# Strong indicators - look at first and last sentences (where positions are usually stated)
sentences = text.split('.')
first_sentence = sentences[0].lower() if sentences else ""
last_sentence = sentences[-1].lower() if len(sentences) > 1 else ""
# Check for clear positive statements in key positions
strong_support_phrases = [
"i support", "i agree", "i approve", "i favor", "i endorse",
"strongly support", "strongly agree", "in favor of",
"this is a good", "this is beneficial", "i'm excited"
]
strong_oppose_phrases = [
"i oppose", "i disagree", "i reject", "i'm against",
"strongly oppose", "strongly disagree", "i cannot support",
"i don't support", "i can't support", "this is a bad",
"i'm concerned", "i'm worried", "i must oppose"
]
# Check first and last sentences for strong indicators (weighted heavily)
first_last_text = first_sentence + " " + last_sentence
support_score = 0
oppose_score = 0
for phrase in strong_support_phrases:
if phrase in first_last_text:
support_score += 3 # Strong weight for clear statements
for phrase in strong_oppose_phrases:
if phrase in first_last_text:
oppose_score += 3 # Strong weight for clear statements
# Count keyword matches in full text (lower weight)
support_count = sum(
1 for keyword in self.SUPPORT_KEYWORDS
if keyword in text and not any(neg in text for neg in ["don't " + keyword, "can't " + keyword, "won't " + keyword])
)
oppose_count = sum(
1 for keyword in self.OPPOSE_KEYWORDS
if keyword in text
)
neutral_count = sum(
1 for keyword in self.NEUTRAL_KEYWORDS
if keyword in text
)
# Combine scores
support_score += support_count
oppose_score += oppose_count
total_score = support_score + oppose_score + neutral_count
if total_score == 0:
return Position.UNCLEAR, 0.0
# Determine dominant position
if support_score > oppose_score and support_score > neutral_count:
confidence = support_score / max(total_score, 1)
if support_score >= 5:
return Position.STRONGLY_SUPPORT, min(confidence, 1.0)
return Position.SUPPORT, min(confidence, 1.0)
elif oppose_score > support_score and oppose_score > neutral_count:
confidence = oppose_score / max(total_score, 1)
if oppose_score >= 5:
return Position.STRONGLY_OPPOSE, min(confidence, 1.0)
return Position.OPPOSE, min(confidence, 1.0)
elif neutral_count > 0:
confidence = neutral_count / max(total_score, 1)
return Position.NEUTRAL, min(confidence, 1.0)
return Position.UNCLEAR, 0.3
def _detect_sentiment(self, text: str) -> Sentiment:
"""Detect overall sentiment"""
positive_count = sum(
1 for keyword in self.POSITIVE_SENTIMENT
if keyword in text
)
negative_count = sum(
1 for keyword in self.NEGATIVE_SENTIMENT
if keyword in text
)
if positive_count > 0 and negative_count > 0:
return Sentiment.MIXED
if positive_count >= 3:
return Sentiment.VERY_POSITIVE
elif positive_count >= 1:
return Sentiment.POSITIVE
if negative_count >= 3:
return Sentiment.VERY_NEGATIVE
elif negative_count >= 1:
return Sentiment.NEGATIVE
return Sentiment.NEUTRAL
def _extract_themes(self, text: str) -> List[str]:
"""Extract key themes from text"""
themes = []
for theme, keywords in self.THEME_KEYWORDS.items():
if any(keyword in text for keyword in keywords):
themes.append(theme)
return themes
def _count_positions(
self,
analyses: List[ResponseAnalysis]
) -> Dict[str, int]:
"""Count position occurrences"""
return dict(Counter(a.position.value for a in analyses))
def _count_sentiments(
self,
analyses: List[ResponseAnalysis]
) -> Dict[str, int]:
"""Count sentiment occurrences"""
return dict(Counter(a.sentiment.value for a in analyses))
def _calc_percentages(
self,
counts: Dict[str, int],
total: int
) -> Dict[str, float]:
"""Calculate percentages from counts"""
return {
key: (count / total * 100) if total > 0 else 0
for key, count in counts.items()
}
def _extract_top_themes(
self,
responses: List[PopulationResponse],
top_n: int = 10
) -> List[Tuple[str, int]]:
"""Extract most common themes across all responses"""
all_themes = []
for response in responses:
themes = self._extract_themes(response.response.lower())
all_themes.extend(themes)
theme_counts = Counter(all_themes)
return theme_counts.most_common(top_n)
def _cluster_by_themes(
self,
responses: List[PopulationResponse],
analyses: List[ResponseAnalysis]
) -> Dict[str, List[int]]:
"""Group variant IDs by their key themes"""
clusters = {}
for analysis in analyses:
for theme in analysis.key_themes:
if theme not in clusters:
clusters[theme] = []
clusters[theme].append(analysis.variant_id)
return clusters
def _get_sample_responses(
self,
responses: List[PopulationResponse],
analyses: List[ResponseAnalysis]
) -> Dict[str, str]:
"""Get sample responses for each position"""
samples = {}
# Find first response of each type
for response, analysis in zip(responses, analyses):
if "support" not in samples and analysis.position in [Position.SUPPORT, Position.STRONGLY_SUPPORT]:
samples["support"] = response.response
elif "oppose" not in samples and analysis.position in [Position.OPPOSE, Position.STRONGLY_OPPOSE]:
samples["oppose"] = response.response
elif "neutral" not in samples and analysis.position == Position.NEUTRAL:
samples["neutral"] = response.response
return samples