TEMPO-BIAS / src /answer_extraction.py

init

5b42a0e about 1 month ago

8.75 kB

	"""
	Answer extraction and sentiment analysis utilities.
	"""

	import re
	import logging
	from typing import Dict, List, Optional, Any

	logger = logging.getLogger(__name__)


	class SentimentAnalyzer:
	    """Score the sentiment of text responses with a pluggable backend.

	    Backends are imported lazily inside ``_setup`` so that this module can
	    be imported without any of them installed. When the requested backend
	    cannot be imported the analyzer falls back
	    (transformers -> VADER -> TextBlob); when nothing is usable,
	    ``analyzer`` stays ``None`` and ``analyze`` returns a neutral score.

	    Attributes:
	        method: Name of the backend in use after any fallback.
	        analyzer: The backend object, or ``None`` if none could be loaded.
	    """

	    def __init__(self, method: str = "vader"):
	        """
	        Initialize sentiment analyzer.

	        Args:
	            method: "vader", "textblob", or "transformers". Any other value
	                is logged as an error and produces neutral scores.
	        """
	        self.method = method
	        # Assigned before _setup() so the attribute always exists, even when
	        # no branch of _setup() manages to create a backend.
	        self.analyzer: Optional[Any] = None
	        self._setup()

	    def _setup(self):
	        """Load the backend named by ``self.method``, falling back if missing."""
	        if self.method == "vader":
	            try:
	                from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
	                self.analyzer = SentimentIntensityAnalyzer()
	            except ImportError:
	                logger.warning("VADER not installed, falling back to TextBlob")
	                self.method = "textblob"
	                self._setup()

	        elif self.method == "textblob":
	            try:
	                from textblob import TextBlob
	                # The class itself is stored; analyze() instantiates it per text.
	                self.analyzer = TextBlob
	            except ImportError:
	                logger.error("TextBlob not installed")
	                self.analyzer = None

	        elif self.method == "transformers":
	            try:
	                from transformers import pipeline
	                self.analyzer = pipeline(
	                    "sentiment-analysis",
	                    model="cardiffnlp/twitter-roberta-base-sentiment-latest"
	                )
	            except ImportError:
	                logger.warning("Transformers not available, falling back to VADER")
	                self.method = "vader"
	                self._setup()

	        else:
	            # Previously an unrecognised method left ``analyzer`` undefined and
	            # analyze() crashed with AttributeError; degrade to neutral instead.
	            logger.error(
	                "Unknown sentiment method %r; sentiment scores will be neutral",
	                self.method,
	            )
	            self.analyzer = None

	    def analyze(self, text: str) -> Dict[str, Any]:
	        """
	        Analyze sentiment of text.

	        Args:
	            text: Text to score. Empty or ``None`` yields a neutral result.

	        Returns:
	            Dict that always contains a 'compound' score. VADER adds
	            'positive', 'negative' and 'neutral'; TextBlob adds
	            'subjectivity'; transformers adds the predicted 'label'.
	            Any backend error is logged and reported as
	            ``{"compound": 0.0}``.
	        """
	        if not text or self.analyzer is None:
	            return {"compound": 0.0}

	        try:
	            if self.method == "vader":
	                scores = self.analyzer.polarity_scores(text)
	                return {
	                    "compound": scores["compound"],
	                    "positive": scores["pos"],
	                    "negative": scores["neg"],
	                    "neutral": scores["neu"],
	                }

	            elif self.method == "textblob":
	                blob = self.analyzer(text)
	                return {
	                    "compound": blob.sentiment.polarity,
	                    "subjectivity": blob.sentiment.subjectivity,
	                }

	            elif self.method == "transformers":
	                # Truncate for model (character-based cut, not token-based)
	                result = self.analyzer(text[:512])[0]
	                label = result["label"]
	                # Compare case-insensitively so differently-cased labels
	                # still map onto the -1 to 1 scale.
	                polarity = str(label).lower()
	                if polarity == "positive":
	                    compound = result["score"]
	                elif polarity == "negative":
	                    compound = -result["score"]
	                else:
	                    compound = 0.0
	                return {"compound": compound, "label": label}

	        except Exception as e:
	            # Best-effort scoring: a backend failure must not abort the caller.
	            logger.error("Error analyzing sentiment: %s", e)
	            return {"compound": 0.0}

	        return {"compound": 0.0}


	class AnswerExtractor:
	    """Extract structured answers from LLM responses."""

	    # First number-like token in a text. The fraction alternative is listed
	    # first so that "3/4" is read as a fraction instead of the integer 3.
	    _NUMBER_RE = re.compile(r'[-+]?\d+/\d+|[-+]?\d*\.?\d+')

	    # Scale used by extract_likert_scale when the caller supplies none.
	    _DEFAULT_LIKERT_SCALE = (
	        "strongly disagree",
	        "disagree",
	        "neutral",
	        "agree",
	        "strongly agree",
	    )

	    def __init__(self):
	        """Create the extractor with a default-configured SentimentAnalyzer."""
	        self.sentiment_analyzer = SentimentAnalyzer()

	    def extract_likert_scale(
	        self, response: str, scale: Optional[List[str]] = None
	    ) -> Optional[int]:
	        """
	        Extract Likert scale response from text.

	        Matching is a case-insensitive substring search. Options are tried
	        longest first so that an option contained in another one ("agree"
	        inside "strongly agree" or "disagree") cannot shadow the longer,
	        more specific option.

	        Args:
	            response: LLM response text
	            scale: List of scale options (e.g., ["Strongly Disagree", "Disagree", ...]).
	                Defaults to a five-point agreement scale.

	        Returns:
	            Scale index (0-based, position in ``scale``) or None if not found
	        """
	        if not response:
	            return None
	        if scale is None:
	            scale = self._DEFAULT_LIKERT_SCALE

	        response_lower = response.lower()

	        # sorted() is stable, so options of equal length keep their scale order.
	        longest_first = sorted(
	            enumerate(scale), key=lambda pair: len(pair[1]), reverse=True
	        )
	        for index, option in longest_first:
	            # An empty option would match every response; ignore it.
	            if option and option.lower() in response_lower:
	                return index

	        return None

	    def extract_sentiment_score(self, response: str) -> float:
	        """Return the 'compound' sentiment score of the response (0.0 if absent)."""
	        result = self.sentiment_analyzer.analyze(response)
	        return result.get("compound", 0.0)

	    def extract_number(self, response: str) -> Optional[float]:
	        """
	        Extract the first number from response.

	        Recognises signed integers, decimals and integer fractions ("3/4").
	        A fraction with a zero denominator is skipped and scanning continues
	        with the next number-like token.

	        Returns:
	            The number as a float, or None if no usable number is present
	        """
	        if not response:
	            return None

	        for match in self._NUMBER_RE.finditer(response):
	            token = match.group()
	            if '/' not in token:
	                return float(token)
	            numerator, denominator = token.split('/')
	            try:
	                return float(numerator) / float(denominator)
	            except ZeroDivisionError:
	                continue

	        return None

	    def extract_agreement(self, response: str) -> Optional[str]:
	        """
	        Extract agreement level from response.

	        Returns:
	            "strongly_agree", "strongly_disagree", "agree", "disagree",
	            "neutral", or None when no indicator is found
	        """
	        if not response:
	            return None
	        response_lower = response.lower()

	        # Check for strong indicators
	        strong_agree = ["strongly agree", "completely agree", "absolutely agree", "fully agree"]
	        strong_disagree = [
	            "strongly disagree",
	            "completely disagree",
	            "absolutely disagree",
	            "fully disagree",
	        ]

	        if any(phrase in response_lower for phrase in strong_agree):
	            return "strongly_agree"

	        if any(phrase in response_lower for phrase in strong_disagree):
	            return "strongly_disagree"

	        # This neutral phrase contains both "agree" and "disagree", so it has
	        # to be recognised before the bare keyword checks below.
	        if "neither agree nor disagree" in response_lower:
	            return "neutral"

	        # Check for basic agreement/disagreement ("disagree" first because it
	        # contains "agree")
	        if "disagree" in response_lower:
	            return "disagree"
	        if "agree" in response_lower:
	            return "agree"

	        # Check for remaining neutral indicators
	        neutral_phrases = ["neutral", "no opinion", "uncertain"]
	        if any(phrase in response_lower for phrase in neutral_phrases):
	            return "neutral"

	        return None

	    @staticmethod
	    def _count_keywords(text_lower: str, keywords: List[str]) -> int:
	        """Count the keywords that occur in text_lower starting at a word boundary."""
	        # Anchoring only the start keeps inflections ("regulations") matching
	        # while rejecting prefixed words ("deregulation", "inequality", "border").
	        return sum(
	            1 for keyword in keywords
	            if re.search(r'\b' + re.escape(keyword), text_lower)
	        )

	    def extract_political_position(self, response: str) -> Dict[str, float]:
	        """
	        Extract political position from response.

	        Each axis is (count_a - count_b) / (count_a + count_b) over the
	        distinct keywords found, or 0 when none are found.

	        Returns:
	            Dict with 'economic' and 'social' scores (-1 to 1) and the
	            'compound_sentiment' of the response
	        """
	        # This is a simplified extraction - in practice would need more sophisticated analysis
	        sentiment = self.sentiment_analyzer.analyze(response)
	        compound = sentiment.get("compound", 0.0)

	        # Look for political keywords
	        left_keywords = ["progressive", "liberal", "socialist", "equality", "regulation"]
	        right_keywords = ["conservative", "traditional", "free market", "liberty", "deregulation"]
	        auth_keywords = ["order", "security", "control", "tradition", "authority"]
	        lib_keywords = ["freedom", "individual", "privacy", "autonomy", "choice"]

	        response_lower = (response or "").lower()

	        # Calculate economic score
	        left_count = self._count_keywords(response_lower, left_keywords)
	        right_count = self._count_keywords(response_lower, right_keywords)
	        economic = (right_count - left_count) / max(left_count + right_count, 1)

	        # Calculate social score
	        auth_count = self._count_keywords(response_lower, auth_keywords)
	        lib_count = self._count_keywords(response_lower, lib_keywords)
	        social = (auth_count - lib_count) / max(auth_count + lib_count, 1)

	        return {
	            "economic": economic,  # -1 = left, +1 = right
	            "social": social,  # -1 = libertarian, +1 = authoritarian
	            "compound_sentiment": compound,
	        }


	def generate_extraction_message(response: str, scale: List[str]) -> str:
	    """
	    Build the follow-up prompt that asks for a numeric answer on a scale.

	    The scale options are numbered from 1 and listed as "1=first, 2=second, ...".

	    Args:
	        response: The LLM response the answer should be extracted from
	        scale: The answer scale options, in order

	    Returns:
	        Prompt text ending with a request to reply with only the number
	    """
	    numbered_options = (
	        f"{position}={label}" for position, label in enumerate(scale, start=1)
	    )
	    scale_str = ", ".join(numbered_options)

	    prompt = f"""Given the following response, extract the answer on this scale: {scale_str}

	Response: {response}

	The answer is (respond with only the number):"""
	    return prompt