# TEMPO-BIAS / src /answer_extraction.py
# moujar's picture
# init
# 5b42a0e
"""
Answer extraction and sentiment analysis utilities.
"""
import re
import logging
from typing import Dict, List, Optional, Any
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class SentimentAnalyzer:
    """Analyze sentiment of text responses.

    Wraps one of three optional backends behind a single ``analyze`` call:
    VADER, TextBlob, or a Hugging Face ``transformers`` pipeline. Backends are
    imported lazily; when one is missing the analyzer falls back along the
    chain transformers -> vader -> textblob. If no backend can be loaded (or
    the requested method is unknown), ``analyzer`` is ``None`` and ``analyze``
    returns a neutral score instead of raising.
    """

    # Class-level default so the attribute exists on every instance, including
    # ones whose ``method`` matches no backend.
    analyzer: Optional[Any] = None

    def __init__(self, method: str = "vader"):
        """
        Initialize sentiment analyzer.

        Args:
            method: "vader", "textblob", or "transformers"
        """
        self.method = method
        self.analyzer = None
        self._setup()

    def _setup(self):
        """Load the backend named by ``self.method``.

        May rewrite ``self.method`` when falling back to another backend.
        Leaves ``self.analyzer`` as ``None`` when nothing usable is found.
        """
        if self.method == "vader":
            try:
                from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
                self.analyzer = SentimentIntensityAnalyzer()
            except ImportError:
                logger.warning("VADER not installed, falling back to TextBlob")
                self.method = "textblob"
                self._setup()
        elif self.method == "textblob":
            try:
                from textblob import TextBlob
                # The class itself is stored; ``analyze`` instantiates it per text.
                self.analyzer = TextBlob
            except ImportError:
                logger.error("TextBlob not installed")
                self.analyzer = None
        elif self.method == "transformers":
            try:
                from transformers import pipeline
                self.analyzer = pipeline(
                    "sentiment-analysis",
                    model="cardiffnlp/twitter-roberta-base-sentiment-latest"
                )
            except ImportError:
                logger.warning("Transformers not available, falling back to VADER")
                self.method = "vader"
                self._setup()
        else:
            # Previously this case left ``self.analyzer`` undefined, so the
            # first ``analyze`` call failed with AttributeError.
            logger.error(
                "Unknown sentiment method %r; sentiment scores will be neutral",
                self.method,
            )
            self.analyzer = None

    def analyze(self, text: str) -> Dict[str, Any]:
        """
        Analyze sentiment of text.

        Args:
            text: Text to score. Empty or ``None`` yields a neutral result.

        Returns:
            Dict that always contains a 'compound' score. Depending on the
            backend it also contains 'positive'/'negative'/'neutral' (VADER),
            'subjectivity' (TextBlob) or 'label' (transformers). Any backend
            error is logged and reported as ``{"compound": 0.0}``.
        """
        if not text or self.analyzer is None:
            return {"compound": 0.0}
        try:
            if self.method == "vader":
                scores = self.analyzer.polarity_scores(text)
                return {
                    "compound": scores["compound"],
                    "positive": scores["pos"],
                    "negative": scores["neg"],
                    "neutral": scores["neu"],
                }
            elif self.method == "textblob":
                blob = self.analyzer(text)
                return {
                    "compound": blob.sentiment.polarity,
                    "subjectivity": blob.sentiment.subjectivity,
                }
            elif self.method == "transformers":
                # Truncate for model (character-based cut, not token-based)
                result = self.analyzer(text[:512])[0]
                # Compare case-insensitively so "POSITIVE"-style labels are
                # not silently mapped to 0.0.
                label = str(result["label"]).lower()
                # Convert to -1 to 1 scale
                if label == "positive":
                    compound = result["score"]
                elif label == "negative":
                    compound = -result["score"]
                else:
                    compound = 0.0
                return {"compound": compound, "label": result["label"]}
        except Exception as e:
            # Best-effort by design: a scoring failure must not abort the caller.
            logger.error(f"Error analyzing sentiment: {e}")
            return {"compound": 0.0}
        # Reached only if ``method`` names no known backend but an analyzer is set.
        return {"compound": 0.0}
class AnswerExtractor:
    """Extract structured answers from LLM responses.

    All extractors are keyword/regex heuristics over the raw response text.
    They do not understand negation ("I do not agree" still reads as
    "agree"), and an empty or ``None`` response yields the "nothing found"
    result rather than an exception.
    """

    # One numeric token: a fraction "a/b" (tried first so its numerator is not
    # mistaken for a standalone number) or a plain signed int/decimal.
    _NUMBER_RE = re.compile(r'([-+]?\d+)/(\d+)|[-+]?\d*\.?\d+')

    def __init__(self):
        self.sentiment_analyzer = SentimentAnalyzer()

    def extract_likert_scale(self, response: str, scale: Optional[List[str]] = None) -> Optional[int]:
        """
        Extract Likert scale response from text.

        Args:
            response: LLM response text
            scale: List of scale options (e.g., ["Strongly Disagree", "Disagree", ...]).
                Defaults to a five-point disagree/agree scale.

        Returns:
            Scale index (0-based) or None if not found
        """
        if not response:
            return None
        if scale is None:
            scale = [
                "strongly disagree",
                "disagree",
                "neutral",
                "agree",
                "strongly agree"
            ]
        response_lower = response.lower()
        # Try longer options first: "agree" is a substring of "strongly agree"
        # and "disagree", so checking in scale order reported "strongly agree"
        # as plain "agree". The sort is stable, so equal-length options keep
        # their scale order.
        by_length = sorted(enumerate(scale), key=lambda pair: len(pair[1]), reverse=True)
        for index, option in by_length:
            # Skip empty options: "" is a substring of every response.
            if option and option.lower() in response_lower:
                return index
        return None

    def extract_sentiment_score(self, response: str) -> float:
        """Return the 'compound' sentiment score of the response (0.0 if absent)."""
        result = self.sentiment_analyzer.analyze(response)
        return result.get("compound", 0.0)

    def extract_number(self, response: str) -> Optional[float]:
        """
        Extract the first number from response.

        A fraction such as "3/4" is evaluated (0.75); otherwise the first
        signed integer or decimal is returned. A fraction with a zero
        denominator is skipped and the search continues after it.

        Returns:
            The number as a float, or None if the response contains none.
        """
        if not response:
            return None
        for match in self._NUMBER_RE.finditer(response):
            numerator, denominator = match.group(1), match.group(2)
            try:
                if denominator is not None:
                    return float(numerator) / float(denominator)
                return float(match.group())
            except (ValueError, ZeroDivisionError):
                continue
        return None

    def extract_agreement(self, response: str) -> Optional[str]:
        """
        Extract agreement level from response.

        Returns:
            "strongly_agree", "strongly_disagree", "agree", "disagree",
            "neutral", or None when no indicator is present
        """
        if not response:
            return None
        response_lower = response.lower()
        # Check for strong indicators
        strong_agree = ["strongly agree", "completely agree", "absolutely agree", "fully agree"]
        strong_disagree = [
            "strongly disagree", "completely disagree", "absolutely disagree", "fully disagree"
        ]
        if any(phrase in response_lower for phrase in strong_agree):
            return "strongly_agree"
        if any(phrase in response_lower for phrase in strong_disagree):
            return "strongly_disagree"
        # This neutral phrase contains both "agree" and "disagree", so it must
        # be recognised before the bare-word checks below.
        if "neither agree nor disagree" in response_lower:
            return "neutral"
        # Check for basic agreement/disagreement ("disagree" first: it contains "agree")
        if "disagree" in response_lower:
            return "disagree"
        if "agree" in response_lower:
            return "agree"
        # Check for remaining neutral indicators
        neutral_phrases = ["neutral", "no opinion", "uncertain"]
        if any(phrase in response_lower for phrase in neutral_phrases):
            return "neutral"
        return None

    @staticmethod
    def _count_keywords(text: str, keywords: List[str]) -> int:
        """Count how many of ``keywords`` occur in ``text`` at the start of a word."""
        # The leading-boundary lookbehind keeps "regulation" from matching
        # inside "deregulation" (and "order" inside "border") while still
        # accepting suffixed forms such as "regulations".
        return sum(
            1 for keyword in keywords
            if re.search(r"(?<!\w)" + re.escape(keyword), text)
        )

    def extract_political_position(self, response: str) -> Dict[str, float]:
        """
        Extract political position from response.

        Each axis is (count of one pole's keywords - count of the other's)
        divided by the total matched on that axis; an axis with no matches
        scores 0.0. Each keyword counts at most once.

        Returns:
            Dict with 'economic' and 'social' scores (-1 to 1) and the
            response's 'compound_sentiment'
        """
        # This is a simplified extraction - in practice would need more sophisticated analysis
        sentiment = self.sentiment_analyzer.analyze(response)
        compound = sentiment.get("compound", 0.0)
        # Look for political keywords
        left_keywords = ["progressive", "liberal", "socialist", "equality", "regulation"]
        right_keywords = ["conservative", "traditional", "free market", "liberty", "deregulation"]
        auth_keywords = ["order", "security", "control", "tradition", "authority"]
        lib_keywords = ["freedom", "individual", "privacy", "autonomy", "choice"]
        response_lower = (response or "").lower()
        # Calculate economic score
        left_count = self._count_keywords(response_lower, left_keywords)
        right_count = self._count_keywords(response_lower, right_keywords)
        economic = (right_count - left_count) / max(left_count + right_count, 1)
        # Calculate social score
        auth_count = self._count_keywords(response_lower, auth_keywords)
        lib_count = self._count_keywords(response_lower, lib_keywords)
        social = (auth_count - lib_count) / max(auth_count + lib_count, 1)
        return {
            "economic": economic,  # -1 = left, +1 = right
            "social": social,  # -1 = libertarian, +1 = authoritarian
            "compound_sentiment": compound,
        }
def generate_extraction_message(response: str, scale: List[str]) -> str:
    """
    Build the follow-up prompt that asks for a structured answer.

    The scale options are numbered from 1 in the order given and listed as
    "1=first, 2=second, ..." ahead of the response being classified.

    Args:
        response: The LLM response to extract from
        scale: The answer scale options
    Returns:
        Prompt for extraction
    """
    numbered_options = []
    for position, label in enumerate(scale, start=1):
        numbered_options.append(f"{position}={label}")
    scale_str = ", ".join(numbered_options)
    return f"""Given the following response, extract the answer on this scale: {scale_str}
Response: {response}
The answer is (respond with only the number):"""