# File size: 8,751 Bytes
#
# 5b42a0e

"""
Answer extraction and sentiment analysis utilities.
"""

import re
import logging
from typing import Dict, List, Optional, Any

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class SentimentAnalyzer:
    """
    Analyze sentiment of text responses with a pluggable backend.

    Backends are imported lazily in ``_setup``. When the requested backend
    cannot be imported the analyzer falls back along the chain
    transformers -> VADER -> TextBlob; if none is importable, ``analyze``
    returns a neutral result instead of raising.
    """

    def __init__(self, method: str = "vader"):
        """
        Initialize sentiment analyzer.

        Args:
            method: "vader", "textblob", or "transformers". Any other value
                falls back to "vader" with a warning.
        """
        self.method = method
        # Defined before _setup() so the attribute always exists, even when
        # no backend can be loaded.
        self.analyzer = None
        self._setup()

    def _setup(self):
        """Load the backend named by ``self.method``, falling back if needed."""
        if self.method == "vader":
            try:
                from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
                self.analyzer = SentimentIntensityAnalyzer()
            except ImportError:
                logger.warning("VADER not installed, falling back to TextBlob")
                self.method = "textblob"
                self._setup()

        elif self.method == "textblob":
            try:
                from textblob import TextBlob
                # The class itself is stored; analyze() instantiates it per text.
                self.analyzer = TextBlob
            except ImportError:
                logger.error("TextBlob not installed")
                self.analyzer = None

        elif self.method == "transformers":
            try:
                from transformers import pipeline
                self.analyzer = pipeline(
                    "sentiment-analysis",
                    model="cardiffnlp/twitter-roberta-base-sentiment-latest"
                )
            except ImportError:
                logger.warning("Transformers not available, falling back to VADER")
                self.method = "vader"
                self._setup()

        else:
            # Previously an unknown method left `self.analyzer` undefined and
            # analyze() failed with AttributeError; use the default backend.
            logger.warning(
                "Unknown sentiment method %r, falling back to VADER", self.method
            )
            self.method = "vader"
            self._setup()

    def analyze(self, text: str) -> Dict[str, Any]:
        """
        Analyze sentiment of text.

        Args:
            text: Text to score. Empty or None yields a neutral result.

        Returns:
            Dict that always contains a 'compound' score. VADER adds
            'positive', 'negative' and 'neutral'; TextBlob adds
            'subjectivity'; transformers adds the raw 'label' string.
            Backend errors are logged and reported as {"compound": 0.0}.
        """
        if not text or self.analyzer is None:
            return {"compound": 0.0}

        try:
            if self.method == "vader":
                scores = self.analyzer.polarity_scores(text)
                return {
                    "compound": scores["compound"],
                    "positive": scores["pos"],
                    "negative": scores["neg"],
                    "neutral": scores["neu"],
                }

            if self.method == "textblob":
                sentiment = self.analyzer(text).sentiment
                return {
                    "compound": sentiment.polarity,
                    "subjectivity": sentiment.subjectivity,
                }

            if self.method == "transformers":
                # Truncate for model (character-based cut, as before)
                result = self.analyzer(text[:512])[0]
                label = result["label"]
                # Compare case-insensitively so "POSITIVE"/"Positive" are
                # not silently treated as neutral.
                polarity = str(label).lower()
                # Convert to -1 to 1 scale
                if polarity == "positive":
                    compound = result["score"]
                elif polarity == "negative":
                    compound = -result["score"]
                else:
                    compound = 0.0
                return {"compound": compound, "label": label}

        except Exception as e:
            # Best-effort API: a failing backend must not break the caller.
            logger.error("Error analyzing sentiment: %s", e)
            return {"compound": 0.0}

        return {"compound": 0.0}


class AnswerExtractor:
    """Extract structured answers from LLM responses."""

    # The fraction alternative comes first so "3/4" is read as 0.75 instead
    # of stopping at the plain number 3; the second alternative covers
    # integers and decimals.
    _NUMBER_RE = re.compile(r'[-+]?\d+/\d+|[-+]?\d*\.?\d+')

    # Default five-point scale used by extract_likert_scale.
    _DEFAULT_LIKERT_SCALE = (
        "strongly disagree",
        "disagree",
        "neutral",
        "agree",
        "strongly agree",
    )

    def __init__(self):
        self.sentiment_analyzer = SentimentAnalyzer()

    def extract_likert_scale(
        self, response: str, scale: Optional[List[str]] = None
    ) -> Optional[int]:
        """
        Extract Likert scale response from text.

        The option mentioned earliest in the response wins. When two options
        start at the same position (one contains the other, e.g. "disagree"
        vs. "agree" or "strongly agree" vs. "agree") the longer one wins, so
        a short option can no longer shadow a more specific one.

        Args:
            response: LLM response text
            scale: List of scale options (e.g., ["Strongly Disagree", "Disagree", ...]).
                Defaults to a five-point agree/disagree scale.

        Returns:
            Scale index (0-based) or None if not found
        """
        if not response:
            return None
        if scale is None:
            scale = self._DEFAULT_LIKERT_SCALE

        response_lower = response.lower()

        best = None  # (start position, -option length, scale index)
        for index, option in enumerate(scale):
            needle = option.lower()
            if not needle:
                # An empty option would match every response at position 0.
                continue
            start = response_lower.find(needle)
            if start == -1:
                continue
            candidate = (start, -len(needle), index)
            if best is None or candidate < best:
                best = candidate

        return None if best is None else best[2]

    def extract_sentiment_score(self, response: str) -> float:
        """Return the 'compound' sentiment score of the response (0.0 if absent)."""
        result = self.sentiment_analyzer.analyze(response)
        return result.get("compound", 0.0)

    def extract_number(self, response: str) -> Optional[float]:
        """
        Extract the first number from the response.

        Supports signed integers, decimals and simple fractions ("3/4").
        A fraction with a zero denominator is skipped and the search
        continues with the next number in the text.

        Returns:
            The parsed value as a float, or None if no usable number exists
        """
        if not response:
            return None

        for match in self._NUMBER_RE.finditer(response):
            token = match.group()
            try:
                if '/' in token:
                    numerator, denominator = token.split('/')
                    return float(numerator) / float(denominator)
                return float(token)
            except (ValueError, ZeroDivisionError):
                continue

        return None

    def extract_agreement(self, response: str) -> Optional[str]:
        """
        Extract agreement level from response.

        Returns:
            "strongly_agree", "strongly_disagree", "agree", "disagree",
            "neutral", or None
        """
        if not response:
            return None

        response_lower = response.lower()

        # Check for strong indicators
        strong_agree = ("strongly agree", "completely agree", "absolutely agree", "fully agree")
        strong_disagree = (
            "strongly disagree",
            "completely disagree",
            "absolutely disagree",
            "fully disagree",  # mirrors "fully agree" above
        )

        if any(phrase in response_lower for phrase in strong_agree):
            return "strongly_agree"

        if any(phrase in response_lower for phrase in strong_disagree):
            return "strongly_disagree"

        # This neutral phrase contains both "agree" and "disagree", so it has
        # to be recognised before the bare keyword checks below.
        if "neither agree nor disagree" in response_lower:
            return "neutral"

        # Check for basic agreement/disagreement ("disagree" first because it
        # contains "agree")
        if "disagree" in response_lower:
            return "disagree"
        if "agree" in response_lower:
            return "agree"

        # Check for remaining neutral indicators
        neutral_phrases = ("neutral", "no opinion", "uncertain")
        if any(phrase in response_lower for phrase in neutral_phrases):
            return "neutral"

        return None

    @staticmethod
    def _count_keywords(text: str, keywords) -> int:
        """Count how many keywords occur in text starting at a word boundary."""
        # Anchoring at the word start keeps "regulation" from matching inside
        # "deregulation" and "order" inside "border", while still accepting
        # suffixed forms such as plurals.
        return sum(
            1 for kw in keywords if re.search(r"\b" + re.escape(kw), text)
        )

    def extract_political_position(self, response: str) -> Dict[str, float]:
        """
        Extract political position from response.

        This is a simplified keyword heuristic, not a real classifier.

        Returns:
            Dict with 'economic' and 'social' scores (-1 to 1) and the
            'compound_sentiment' of the response
        """
        sentiment = self.sentiment_analyzer.analyze(response)
        compound = sentiment.get("compound", 0.0)

        # Look for political keywords
        left_keywords = ("progressive", "liberal", "socialist", "equality", "regulation")
        right_keywords = ("conservative", "traditional", "free market", "liberty", "deregulation")
        auth_keywords = ("order", "security", "control", "tradition", "authority")
        lib_keywords = ("freedom", "individual", "privacy", "autonomy", "choice")

        response_lower = (response or "").lower()

        # Calculate economic score; max(..., 1) avoids dividing by zero when
        # no keyword is present.
        left_count = self._count_keywords(response_lower, left_keywords)
        right_count = self._count_keywords(response_lower, right_keywords)
        economic = (right_count - left_count) / max(left_count + right_count, 1)

        # Calculate social score
        auth_count = self._count_keywords(response_lower, auth_keywords)
        lib_count = self._count_keywords(response_lower, lib_keywords)
        social = (auth_count - lib_count) / max(auth_count + lib_count, 1)

        return {
            "economic": economic,  # -1 = left, +1 = right
            "social": social,      # -1 = libertarian, +1 = authoritarian
            "compound_sentiment": compound,
        }


def generate_extraction_message(response: str, scale: List[str]) -> str:
    """
    Build the follow-up prompt that asks for a response to be mapped onto a scale.

    Args:
        response: The LLM response to classify
        scale: Ordered answer options; they are numbered starting at 1

    Returns:
        Prompt text that lists the numbered options, quotes the response and
        asks for the number only
    """
    numbered_options = []
    for position, option in enumerate(scale, start=1):
        numbered_options.append(f"{position}={option}")
    scale_str = ", ".join(numbered_options)

    return f"""Given the following response, extract the answer on this scale: {scale_str}

Response: {response}

The answer is (respond with only the number):"""