# Commit 7e03ed8 (Claude, unverified):
# Fix Population Analysis bugs: question persistence and position detection
"""
Response analyzer for population query results
Analyzes responses to extract positions, sentiment, themes, and statistics.
"""
import re
from typing import List, Dict, Optional, Tuple
from collections import Counter
from dataclasses import dataclass
from enum import Enum
from .sampler import PopulationResponse
class Position(str, Enum):
    """Position on a proposal/question.

    Mixes in ``str`` so members compare to and serialize as their plain
    string values (used as dict keys in the distribution counts).
    """
    STRONGLY_SUPPORT = "strongly_support"
    SUPPORT = "support"
    NEUTRAL = "neutral"
    OPPOSE = "oppose"
    STRONGLY_OPPOSE = "strongly_oppose"
    UNCLEAR = "unclear"  # no position could be detected from the text
class Sentiment(str, Enum):
    """Overall sentiment of a response.

    Mixes in ``str`` so members compare to and serialize as their plain
    string values (used as dict keys in the distribution counts).
    """
    VERY_POSITIVE = "very_positive"
    POSITIVE = "positive"
    NEUTRAL = "neutral"
    NEGATIVE = "negative"
    VERY_NEGATIVE = "very_negative"
    MIXED = "mixed"  # both positive and negative signals present
@dataclass
class ResponseAnalysis:
    """Analysis of a single response from one population variant."""
    variant_id: int          # id of the persona variant that produced the response
    position: Position       # detected stance on the question
    sentiment: Sentiment     # detected overall tone
    confidence_score: float  # 0-1, strength of the position detection
    key_themes: List[str]    # theme labels whose keywords matched the text
    word_count: int          # whitespace-delimited word count of the response
@dataclass
class PopulationResults:
    """Complete analysis results for a population query."""
    # Basic info
    question: str            # the question that was posed to the population
    population_size: int     # number of responses analyzed
    base_persona_name: str   # base persona the variants were derived from
    # Individual analyses, one per response, in input order
    individual_analyses: List[ResponseAnalysis]
    # Position distribution (keys are Position.value strings)
    position_counts: Dict[str, int]
    position_percentages: Dict[str, float]   # percent of population, 0-100
    # Sentiment distribution (keys are Sentiment.value strings)
    sentiment_counts: Dict[str, int]
    sentiment_percentages: Dict[str, float]  # percent of population, 0-100
    # Common themes
    top_themes: List[Tuple[str, int]]     # (theme, count), most common first
    theme_clusters: Dict[str, List[int]]  # theme -> variant_ids mentioning it
    # Response-length statistics (word counts)
    average_response_length: float
    median_response_length: float
    response_length_range: Tuple[int, int]  # (min, max)
    # Sample responses; None when no response took that position
    sample_support: Optional[str] = None
    sample_oppose: Optional[str] = None
    sample_neutral: Optional[str] = None
class ResponseAnalyzer:
    """Analyze population query responses.

    Pure keyword/substring heuristics over lowercased response text; no
    external NLP dependencies.
    """

    # Keywords for position detection, substring-matched against the
    # lowercased response text.
    SUPPORT_KEYWORDS = [
        "support", "agree", "favor", "approve", "endorse", "like", "positive",
        "good idea", "strongly support", "in favor", "beneficial", "excited"
    ]
    OPPOSE_KEYWORDS = [
        "oppose", "disagree", "against", "reject", "disapprove", "don't support",
        "bad idea", "strongly oppose", "concerned", "worried", "problematic"
    ]
    NEUTRAL_KEYWORDS = [
        "neutral", "mixed feelings", "depends", "conditional", "both sides",
        "need more information", "unclear", "unsure"
    ]

    # Keywords for sentiment detection.  NOTE: "concerned"/"worried" also
    # appear in OPPOSE_KEYWORDS, so worried responses score on both axes.
    POSITIVE_SENTIMENT = [
        "great", "excellent", "wonderful", "fantastic", "love", "brilliant",
        "perfect", "amazing", "thrilled", "optimistic", "hopeful"
    ]
    NEGATIVE_SENTIMENT = [
        "terrible", "awful", "horrible", "disaster", "hate", "concerned",
        "worried", "fearful", "pessimistic", "unfortunate"
    ]

    # Common urban planning themes: theme label -> trigger keywords
    THEME_KEYWORDS = {
        "affordability": ["affordable", "cost", "expensive", "price", "rent"],
        "sustainability": ["sustainable", "green", "environment", "climate", "carbon"],
        "equity": ["equity", "justice", "fair", "displacement", "gentrification"],
        "density": ["density", "crowded", "compact", "units"],
        "transit": ["transit", "bus", "train", "transportation", "commute"],
        "parking": ["parking", "cars", "vehicles", "garage"],
        "safety": ["safety", "crime", "secure", "dangerous"],
        "community": ["community", "neighbors", "neighborhood", "local"],
        "business": ["business", "economic", "commerce", "jobs", "employment"],
        "housing": ["housing", "homes", "residential", "apartments"],
    }
def analyze_population(
self,
responses: List[PopulationResponse]
) -> PopulationResults:
"""
Analyze all responses from a population query
Args:
responses: List of PopulationResponse objects
Returns:
PopulationResults with complete analysis
"""
# Analyze each response individually
individual_analyses = [
self.analyze_single_response(r) for r in responses
]
# Calculate distributions
position_counts = self._count_positions(individual_analyses)
position_percentages = self._calc_percentages(
position_counts, len(responses)
)
sentiment_counts = self._count_sentiments(individual_analyses)
sentiment_percentages = self._calc_percentages(
sentiment_counts, len(responses)
)
# Extract themes
top_themes = self._extract_top_themes(responses, top_n=10)
theme_clusters = self._cluster_by_themes(responses, individual_analyses)
# Calculate statistics
lengths = [len(r.response.split()) for r in responses]
avg_length = sum(lengths) / len(lengths) if lengths else 0
sorted_lengths = sorted(lengths)
median_length = sorted_lengths[len(sorted_lengths) // 2] if sorted_lengths else 0
# Get sample responses
samples = self._get_sample_responses(responses, individual_analyses)
return PopulationResults(
question=responses[0].question if responses else "",
population_size=len(responses),
base_persona_name=responses[0].persona.name.split("_")[0] if responses else "",
individual_analyses=individual_analyses,
position_counts=position_counts,
position_percentages=position_percentages,
sentiment_counts=sentiment_counts,
sentiment_percentages=sentiment_percentages,
top_themes=top_themes,
theme_clusters=theme_clusters,
average_response_length=avg_length,
median_response_length=median_length,
response_length_range=(min(lengths) if lengths else 0, max(lengths) if lengths else 0),
sample_support=samples.get("support"),
sample_oppose=samples.get("oppose"),
sample_neutral=samples.get("neutral"),
)
def analyze_single_response(
self,
response: PopulationResponse
) -> ResponseAnalysis:
"""Analyze a single response"""
text = response.response.lower()
# Detect position
position, confidence = self._detect_position(text)
# Detect sentiment
sentiment = self._detect_sentiment(text)
# Extract key themes
themes = self._extract_themes(text)
# Word count
word_count = len(response.response.split())
return ResponseAnalysis(
variant_id=response.variant_id,
position=position,
sentiment=sentiment,
confidence_score=confidence,
key_themes=themes,
word_count=word_count
)
def _detect_position(self, text: str) -> Tuple[Position, float]:
"""Detect position from text with confidence score - improved accuracy"""
# Strong indicators - look at first and last sentences (where positions are usually stated)
sentences = text.split('.')
first_sentence = sentences[0].lower() if sentences else ""
last_sentence = sentences[-1].lower() if len(sentences) > 1 else ""
# Check for clear positive statements in key positions
strong_support_phrases = [
"i support", "i agree", "i approve", "i favor", "i endorse",
"strongly support", "strongly agree", "in favor of",
"this is a good", "this is beneficial", "i'm excited"
]
strong_oppose_phrases = [
"i oppose", "i disagree", "i reject", "i'm against",
"strongly oppose", "strongly disagree", "i cannot support",
"i don't support", "i can't support", "this is a bad",
"i'm concerned", "i'm worried", "i must oppose"
]
# Check first and last sentences for strong indicators (weighted heavily)
first_last_text = first_sentence + " " + last_sentence
support_score = 0
oppose_score = 0
for phrase in strong_support_phrases:
if phrase in first_last_text:
support_score += 3 # Strong weight for clear statements
for phrase in strong_oppose_phrases:
if phrase in first_last_text:
oppose_score += 3 # Strong weight for clear statements
# Count keyword matches in full text (lower weight)
support_count = sum(
1 for keyword in self.SUPPORT_KEYWORDS
if keyword in text and not any(neg in text for neg in ["don't " + keyword, "can't " + keyword, "won't " + keyword])
)
oppose_count = sum(
1 for keyword in self.OPPOSE_KEYWORDS
if keyword in text
)
neutral_count = sum(
1 for keyword in self.NEUTRAL_KEYWORDS
if keyword in text
)
# Combine scores
support_score += support_count
oppose_score += oppose_count
total_score = support_score + oppose_score + neutral_count
if total_score == 0:
return Position.UNCLEAR, 0.0
# Determine dominant position
if support_score > oppose_score and support_score > neutral_count:
confidence = support_score / max(total_score, 1)
if support_score >= 5:
return Position.STRONGLY_SUPPORT, min(confidence, 1.0)
return Position.SUPPORT, min(confidence, 1.0)
elif oppose_score > support_score and oppose_score > neutral_count:
confidence = oppose_score / max(total_score, 1)
if oppose_score >= 5:
return Position.STRONGLY_OPPOSE, min(confidence, 1.0)
return Position.OPPOSE, min(confidence, 1.0)
elif neutral_count > 0:
confidence = neutral_count / max(total_score, 1)
return Position.NEUTRAL, min(confidence, 1.0)
return Position.UNCLEAR, 0.3
def _detect_sentiment(self, text: str) -> Sentiment:
"""Detect overall sentiment"""
positive_count = sum(
1 for keyword in self.POSITIVE_SENTIMENT
if keyword in text
)
negative_count = sum(
1 for keyword in self.NEGATIVE_SENTIMENT
if keyword in text
)
if positive_count > 0 and negative_count > 0:
return Sentiment.MIXED
if positive_count >= 3:
return Sentiment.VERY_POSITIVE
elif positive_count >= 1:
return Sentiment.POSITIVE
if negative_count >= 3:
return Sentiment.VERY_NEGATIVE
elif negative_count >= 1:
return Sentiment.NEGATIVE
return Sentiment.NEUTRAL
def _extract_themes(self, text: str) -> List[str]:
"""Extract key themes from text"""
themes = []
for theme, keywords in self.THEME_KEYWORDS.items():
if any(keyword in text for keyword in keywords):
themes.append(theme)
return themes
def _count_positions(
self,
analyses: List[ResponseAnalysis]
) -> Dict[str, int]:
"""Count position occurrences"""
return dict(Counter(a.position.value for a in analyses))
def _count_sentiments(
self,
analyses: List[ResponseAnalysis]
) -> Dict[str, int]:
"""Count sentiment occurrences"""
return dict(Counter(a.sentiment.value for a in analyses))
def _calc_percentages(
self,
counts: Dict[str, int],
total: int
) -> Dict[str, float]:
"""Calculate percentages from counts"""
return {
key: (count / total * 100) if total > 0 else 0
for key, count in counts.items()
}
def _extract_top_themes(
self,
responses: List[PopulationResponse],
top_n: int = 10
) -> List[Tuple[str, int]]:
"""Extract most common themes across all responses"""
all_themes = []
for response in responses:
themes = self._extract_themes(response.response.lower())
all_themes.extend(themes)
theme_counts = Counter(all_themes)
return theme_counts.most_common(top_n)
def _cluster_by_themes(
self,
responses: List[PopulationResponse],
analyses: List[ResponseAnalysis]
) -> Dict[str, List[int]]:
"""Group variant IDs by their key themes"""
clusters = {}
for analysis in analyses:
for theme in analysis.key_themes:
if theme not in clusters:
clusters[theme] = []
clusters[theme].append(analysis.variant_id)
return clusters
def _get_sample_responses(
self,
responses: List[PopulationResponse],
analyses: List[ResponseAnalysis]
) -> Dict[str, str]:
"""Get sample responses for each position"""
samples = {}
# Find first response of each type
for response, analysis in zip(responses, analyses):
if "support" not in samples and analysis.position in [Position.SUPPORT, Position.STRONGLY_SUPPORT]:
samples["support"] = response.response
elif "oppose" not in samples and analysis.position in [Position.OPPOSE, Position.STRONGLY_OPPOSE]:
samples["oppose"] = response.response
elif "neutral" not in samples and analysis.position == Position.NEUTRAL:
samples["neutral"] = response.response
return samples