# Source page header: Spaces / amirrezaa / Sentiment (status: Sleeping) - File size: 22,226 Bytes

"""
Multilingual Sentiment Analysis Tool
Supports Turkish, Persian, and English using lexicon-based, rule-based, and hybrid approaches
"""

import re
import json
import os
from typing import Dict, List, Tuple, Optional
from collections import Counter
import math


class SentimentLexicon:
    """Sentiment vocabulary for a single language.

    On construction the word lists are read from
    ``lexicons/<language>_lexicon.json`` (resolved against the current
    working directory). When that file does not exist, the built-in English
    word lists are used instead, whatever language was requested.
    """

    def __init__(self, language: str):
        self.language = language
        # Start from empty containers; _load_lexicon() fills them in.
        self.positive_words = set()
        self.negative_words = set()
        self.intensifiers = {}
        self.negation_words = set()
        self.diminishers = {}
        self.contrast_words = set()
        self.idioms_positive = []
        self.idioms_negative = []
        self._load_lexicon()

    def _load_lexicon(self):
        """Populate the lexicon from its JSON file, or fall back to English."""
        path = "lexicons/{}_lexicon.json".format(self.language)
        if not os.path.exists(path):
            # No lexicon file for this language: use the built-in word lists.
            self._load_default_english()
            return

        with open(path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        def as_set(key):
            # Word lists are stored as JSON arrays; missing keys mean "empty".
            return set(payload.get(key, []))

        self.positive_words = as_set('positive')
        self.negative_words = as_set('negative')
        self.intensifiers = payload.get('intensifiers', {})
        self.negation_words = as_set('negation')
        self.diminishers = payload.get('diminishers', {})
        self.contrast_words = as_set('contrast_words')
        self.idioms_positive = payload.get('idioms_positive', [])
        self.idioms_negative = payload.get('idioms_negative', [])

    def _load_default_english(self):
        """Install the built-in English word lists and reset the optional fields."""
        self.positive_words = set(
            "good great excellent amazing wonderful fantastic "
            "love like best perfect beautiful nice happy "
            "pleased satisfied awesome brilliant outstanding".split()
        )
        self.negative_words = set(
            "bad terrible awful horrible worst hate dislike "
            "poor disappointed sad angry frustrated annoying "
            "boring ugly disgusting pathetic".split()
        )
        # Multiplier applied to the score of a following sentiment word.
        self.intensifiers = dict(
            very=1.5, extremely=2.0, really=1.3, quite=1.2,
            too=1.4, so=1.3, absolutely=1.8, completely=1.5,
        )
        self.negation_words = {
            'not', 'no', 'never', 'none', 'nobody', 'nothing', 'nowhere',
            'neither', 'cannot', "can't", "won't", "don't", "doesn't",
        }
        # The built-in lexicon has no diminishers, contrast words or idioms.
        self.diminishers = {}
        self.contrast_words = set()
        self.idioms_positive = []
        self.idioms_negative = []


class TextPreprocessor:
    """Lower-case, clean and tokenize raw text.

    ``language`` is stored on the instance, but ``preprocess`` runs the same
    pipeline for every language. The ``normalize_*`` helpers are available to
    callers and are not invoked by ``preprocess``.
    """

    # 'https...' is already covered by the 'http\S+' alternative.
    _URL_RE = re.compile(r'http\S+|www\S+')
    _EMAIL_RE = re.compile(r'\S+@\S+')
    # Everything except word characters, whitespace and basic punctuation.
    _DISALLOWED_RE = re.compile(r'[^\w\s\.,!?;:()\-\']')
    # A word may contain internal apostrophes so contractions such as
    # "don't" stay one token; punctuation marks are tokens of their own.
    _TOKEN_RE = re.compile(r"\w+(?:'\w+)*|[.,!?;:()]")
    # Typographic apostrophes are folded to ASCII so "don’t" equals "don't".
    _APOSTROPHES = str.maketrans({'\u2019': "'", '\u2018': "'", '\u02bc': "'"})
    # Turkish letters mapped to their closest ASCII letters.
    _TURKISH_TO_ASCII = str.maketrans('ıİğĞüÜşŞöÖçÇ', 'iIgGuUsSoOcC')

    def __init__(self, language: str):
        self.language = language

    def preprocess(self, text: str) -> List[str]:
        """Return the list of lower-cased tokens found in ``text``.

        URLs and e-mail addresses are removed, characters outside the allowed
        set are dropped, and the remainder is split into word tokens and the
        punctuation tokens ``. , ! ? ; : ( )``. Contractions are kept whole
        (``"don't"`` is one token) so they can match negation entries that
        are spelled with an apostrophe.

        Args:
            text: Raw input text.

        Returns:
            Tokens in their original order.
        """
        text = text.lower().translate(self._APOSTROPHES)
        text = self._URL_RE.sub('', text)
        text = self._EMAIL_RE.sub('', text)
        text = self._DISALLOWED_RE.sub('', text)
        return self._TOKEN_RE.findall(text)

    def normalize_turkish(self, text: str) -> str:
        """Replace Turkish-specific letters with ASCII look-alikes.

        Both cases of ı/İ, ğ, ü, ş, ö and ç are mapped; every other character
        is returned unchanged.
        """
        return text.translate(self._TURKISH_TO_ASCII)

    def normalize_persian(self, text: str) -> str:
        """Return ``text`` unchanged.

        Placeholder: no Persian/Arabic character normalization is implemented.
        """
        return text


class LexiconBasedAnalyzer:
    """Lexicon-based sentiment scoring with negation, intensifiers and idioms.

    Each token found in the lexicon's positive or negative word set
    contributes a score of 1.0, scaled by intensifiers and diminishers that
    appear shortly before it and flipped to the opposite polarity when it is
    inside the scope of a negation word. Idioms found in the raw text add a
    fixed bonus.
    """

    # Punctuation tokens that end the scope of an earlier negation word.
    _SCOPE_BREAKERS = frozenset(['.', '!', '?', ';', ','])
    # How many preceding tokens are searched for a negation word.
    _NEGATION_WINDOW = 4
    # How many preceding tokens are searched for intensifiers/diminishers.
    _MODIFIER_LOOKBACK = 2
    # Score added for every idiom found in the text.
    _IDIOM_WEIGHT = 2.0

    def __init__(self, language: str):
        self.language = language
        self.lexicon = SentimentLexicon(language)
        self.preprocessor = TextPreprocessor(language)

    def _check_idioms(self, text: str) -> Tuple[float, float]:
        """Return ``(positive, negative)`` idiom scores for ``text``.

        Matching is a case-insensitive substring test; each idiom counts once
        no matter how often it occurs.
        """
        haystack = text.lower()
        pos_score = sum(
            self._IDIOM_WEIGHT
            for idiom in self.lexicon.idioms_positive
            if idiom.lower() in haystack
        )
        neg_score = sum(
            self._IDIOM_WEIGHT
            for idiom in self.lexicon.idioms_negative
            if idiom.lower() in haystack
        )
        return float(pos_score), float(neg_score)

    def _is_negated(self, tokens: List[str], index: int) -> bool:
        """Tell whether ``tokens[index]`` is in the scope of a negation word.

        Walks backwards over at most ``_NEGATION_WINDOW`` tokens and stops at
        the first scope-breaking punctuation mark, so a negation only applies
        when no such mark separates it from the token.
        """
        first = max(0, index - self._NEGATION_WINDOW)
        for k in range(index - 1, first - 1, -1):
            if tokens[k] in self.lexicon.negation_words:
                return True
            if tokens[k] in self._SCOPE_BREAKERS:
                return False
        return False

    def _modifier_strength(self, tokens: List[str], index: int) -> float:
        """Return the combined intensifier/diminisher factor for a token.

        The strongest intensifier (never below 1.0) and the strongest
        diminisher (never above 1.0) among the preceding
        ``_MODIFIER_LOOKBACK`` tokens are multiplied together.
        """
        intensifier = 1.0
        diminisher = 1.0
        for word in tokens[max(0, index - self._MODIFIER_LOOKBACK):index]:
            if word in self.lexicon.intensifiers:
                intensifier = max(intensifier, self.lexicon.intensifiers[word])
            if word in self.lexicon.diminishers:
                diminisher = min(diminisher, self.lexicon.diminishers[word])
        return intensifier * diminisher

    def analyze(self, text: str) -> Dict:
        """Score ``text`` and classify its polarity.

        Args:
            text: Raw input text.

        Returns:
            A dict with:
              * ``polarity`` - ``'positive'``, ``'negative'`` or ``'neutral'``.
                Neutral is returned when nothing was scored and also when the
                positive and negative scores are exactly equal.
              * ``confidence`` - share of the total score held by the winning
                polarity, or 0.0 for neutral.
              * ``positive_score`` / ``negative_score`` - raw scores, rounded
                to 3 decimals.
              * ``sentiment_words`` - up to 10 tuples
                ``(effective_polarity, token, was_negated)``.
              * ``method`` - always ``'lexicon-based'``.
        """
        tokens = self.preprocessor.preprocess(text)
        positive_score, negative_score = self._check_idioms(text)
        sentiment_words = []

        for index, token in enumerate(tokens):
            if token in self.lexicon.positive_words:
                word_is_positive = True
            elif token in self.lexicon.negative_words:
                word_is_positive = False
            else:
                continue

            negated = self._is_negated(tokens, index)
            score = 1.0 * self._modifier_strength(tokens, index)

            # Negation flips the word's own polarity.
            if word_is_positive != negated:
                positive_score += score
                sentiment_words.append(('positive', token, negated))
            else:
                negative_score += score
                sentiment_words.append(('negative', token, negated))

        total_raw = positive_score + negative_score
        if total_raw == 0 or positive_score == negative_score:
            # Nothing scored, or an exact tie: neither polarity wins.
            polarity = 'neutral'
            confidence = 0.0
        elif positive_score > negative_score:
            polarity = 'positive'
            confidence = positive_score / total_raw
        else:
            polarity = 'negative'
            confidence = negative_score / total_raw

        return {
            'polarity': polarity,
            'confidence': round(confidence, 3),
            'positive_score': round(positive_score, 3),
            'negative_score': round(negative_score, 3),
            'sentiment_words': sentiment_words[:10],  # Limit to first 10
            'method': 'lexicon-based'
        }


class RuleBasedAnalyzer:
    """Rule-based sentiment analysis layered on the lexicon-based scores.

    ``analyze`` takes the positive/negative scores produced by
    ``LexiconBasedAnalyzer`` and adjusts them with surface rules
    (punctuation, capitalisation, emoticons, contrast words, repetition,
    hashtags). A second set of rules lowers the confidence of the final
    verdict.
    """

    # Score added per emoticon/emoji occurrence.
    _EMOTICON_WEIGHT = 1.5

    _POSITIVE_EMOTICONS = (
        ':)', ':-)', '=)', ';)', ';-)', '=D', ':D', ':-D',
        '😊', '😀', '😁', '😂', '🤣', '😃', '😄', '😆', '😍', '🥰',
        '😎', '🤗', '👍', '👏', '🎉', '❤️', '💕', '💖', '💗', '💓',
    )

    _NEGATIVE_EMOTICONS = (
        ':(', ':-(', '=(', ':/', ':-/', ':|', ':-|', '>:(',
        '😢', '😞', '😠', '😡', '😤', '😭', '😰', '😨', '😱', '😖',
        '😣', '😫', '😩', '👎', '💔', '😒', '😔', '😕', '🙁',
    )

    # URLs are removed before emoticon counting because '://' contains ':/'.
    _URL_RE = re.compile(r'https?://\S+|www\.\S+')

    # Words that signal a change of direction, with the damping factor
    # applied to both scores. Checked in this order; first match wins.
    _SHIFTERS = {
        'but': 0.6, 'however': 0.6, 'although': 0.7, 'though': 0.7,
        'yet': 0.6, 'still': 0.7, 'nevertheless': 0.6, 'nonetheless': 0.6
    }

    _SUPERLATIVES = frozenset(['most', 'best', 'worst', 'least', 'greatest'])
    _COMPARATIVES = frozenset(['more', 'less', 'better', 'worse', 'greater', 'smaller'])

    def __init__(self, language: str):
        self.language = language
        self.lexicon = SentimentLexicon(language)
        self.preprocessor = TextPreprocessor(language)
        # Created on first use and then reused, so the lexicon is not
        # reloaded for every analyzed text.
        self._base_analyzer = None

    def _get_base_analyzer(self):
        """Return the shared lexicon-based analyzer, creating it if needed."""
        base = getattr(self, '_base_analyzer', None)
        if base is None:
            base = LexiconBasedAnalyzer(self.language)
            self._base_analyzer = base
        return base

    def _detect_emoticons(self, text: str) -> Tuple[float, float]:
        """Return ``(positive, negative)`` scores for emoticons and emojis.

        URLs are stripped first so that the ``:/`` inside ``http://`` is not
        counted. Longer emoticons are matched and removed before shorter
        ones, so ``>:(`` counts once instead of also matching ``:(``.
        """
        remaining = self._URL_RE.sub(' ', text)

        candidates = [(symbol, True) for symbol in self._POSITIVE_EMOTICONS]
        candidates += [(symbol, False) for symbol in self._NEGATIVE_EMOTICONS]
        candidates.sort(key=lambda item: len(item[0]), reverse=True)

        pos_score = 0.0
        neg_score = 0.0
        for symbol, is_positive in candidates:
            hits = remaining.count(symbol)
            if not hits:
                continue
            # Consume the matches so a shorter emoticon cannot re-match them.
            remaining = remaining.replace(symbol, ' ')
            if is_positive:
                pos_score += hits * self._EMOTICON_WEIGHT
            else:
                neg_score += hits * self._EMOTICON_WEIGHT

        return pos_score, neg_score

    def _handle_contrast_words(self, text: str, tokens: List[str],
                                pos_score: float, neg_score: float) -> Tuple[float, float]:
        """Damp both scores by 30% when any token is a lexicon contrast word.

        ``text`` is accepted for interface compatibility and is not used.
        The scores are returned unchanged when no contrast word is present.
        """
        has_contrast = any(
            token.lower() in self.lexicon.contrast_words for token in tokens
        )
        if has_contrast:
            reduction_factor = 0.7
            return pos_score * reduction_factor, neg_score * reduction_factor
        return pos_score, neg_score

    def _detect_comparatives_superlatives(self, tokens: List[str]) -> float:
        """Return 1.4 if a superlative is present, 1.2 for a comparative, else 1.0."""
        lowered = [token.lower() for token in tokens]
        if any(token in self._SUPERLATIVES for token in lowered):
            return 1.4
        if any(token in self._COMPARATIVES for token in lowered):
            return 1.2
        return 1.0

    def _detect_repetition(self, text: str) -> float:
        """Return an emphasis multiplier for repeated characters or words.

        Every run of three or more identical word characters adds 0.1 and a
        word repeated three times in a row adds 0.2 (once). The result is
        capped at 1.5.
        """
        lowered = text.lower()
        multiplier = 1.0

        # Repeated characters (e.g., "soooo good")
        multiplier += len(re.findall(r'(\w)\1{2,}', lowered)) * 0.1

        # Repeated words (e.g., "good good good")
        words = lowered.split()
        if any(a == b == c for a, b, c in zip(words, words[1:], words[2:])):
            multiplier += 0.2

        return min(multiplier, 1.5)  # Cap at 1.5x

    def _detect_sentiment_shifters(self, text: str) -> float:
        """Return the damping factor of the first shifter word found, else 1.0.

        Whole words are matched, so ``'but'`` does not fire on "butter".
        """
        words = set(re.findall(r'\w+', text.lower()))
        for shifter, factor in self._SHIFTERS.items():
            if shifter in words:
                return factor
        return 1.0

    def analyze(self, text: str) -> Dict:
        """Analyze ``text`` with the lexicon scores plus surface rules.

        Args:
            text: Raw input text.

        Returns:
            The lexicon-based result dict with ``positive_score``,
            ``negative_score``, ``polarity`` and ``confidence`` recomputed
            and ``method`` set to ``'rule-based'``. Scores are rounded to 3
            decimals. ``polarity`` is ``'neutral'`` (confidence 0.0) when
            both scores are zero or exactly equal. Otherwise ``confidence``
            is the winning score's share of the total, multiplied by the
            confidence penalties from rules 2, 9, 12 and 13.
        """
        result = self._get_base_analyzer().analyze(text)
        tokens = self.preprocessor.preprocess(text)

        pos = result['positive_score']
        neg = result['negative_score']
        # Product of all confidence penalties; applied after the final
        # polarity decision so the penalties are not lost.
        confidence_factor = 1.0

        # Rule 1: Exclamation marks increase sentiment strength
        exclamation_count = text.count('!')
        if exclamation_count > 0:
            multiplier = 1 + min(exclamation_count * 0.15, 0.5)  # Cap at 50% increase
            pos *= multiplier
            neg *= multiplier

        # Rule 2: Several question marks may indicate uncertainty or sarcasm
        question_count = text.count('?')
        if question_count > 1:
            confidence_factor *= max(0.7, 1 - (question_count * 0.1))

        # Rule 3: All-caps words (longer than 2 characters) add emphasis
        caps_words = [w for w in text.split() if w.isupper() and len(w) > 2]
        if caps_words:
            caps_multiplier = 1 + (len(caps_words) * 0.1)
            pos *= caps_multiplier
            neg *= caps_multiplier

        # Rule 4: Emoticons and emojis
        emoji_pos, emoji_neg = self._detect_emoticons(text)
        pos += emoji_pos
        neg += emoji_neg

        # Rule 5: Contrast words
        pos, neg = self._handle_contrast_words(text, tokens, pos, neg)

        # Rule 6: Comparatives and superlatives
        comp_super_mult = self._detect_comparatives_superlatives(tokens)
        pos *= comp_super_mult
        neg *= comp_super_mult

        # Rule 7: Repetition
        rep_mult = self._detect_repetition(text)
        pos *= rep_mult
        neg *= rep_mult

        # Rule 8: Sentiment shifters
        shifter_factor = self._detect_sentiment_shifters(text)
        if shifter_factor < 1.0:
            pos *= shifter_factor
            neg *= shifter_factor

        # Rule 9: Ellipsis may indicate uncertainty or trailing off
        if '...' in text or '…' in text:
            confidence_factor *= 0.9

        # Rule 10: Multiple punctuation (e.g., "!!!") increases emphasis
        multi_punct = re.findall(r'[!?]{2,}', text)
        if multi_punct:
            punct_mult = 1 + (len(multi_punct) * 0.1)
            pos *= punct_mult
            neg *= punct_mult

        # Rule 11: Hashtags that contain a lexicon word (substring match,
        # since hashtags are usually several words run together)
        for tag in re.findall(r'#\w+', text):
            tag_lower = tag.lower()
            if any(word in tag_lower for word in self.lexicon.positive_words):
                pos += 0.5
            if any(word in tag_lower for word in self.lexicon.negative_words):
                neg += 0.5

        # Rule 12: URL presence may indicate spam or less personal content
        if re.search(r'http[s]?://', text):
            confidence_factor *= 0.95

        # Rule 13: Length-based confidence adjustment
        word_count = len(text.split())
        if word_count < 3:
            confidence_factor *= 0.8  # Very short texts are less reliable
        elif word_count > 100:
            confidence_factor *= 0.95  # Very long texts may have mixed sentiment

        # Final verdict from the adjusted scores
        total = pos + neg
        if total > 0 and pos != neg:
            if pos > neg:
                polarity = 'positive'
                confidence = pos / total
            else:
                polarity = 'negative'
                confidence = neg / total
            confidence *= confidence_factor
        else:
            # Nothing scored, or an exact tie: neither polarity wins.
            polarity = 'neutral'
            confidence = 0.0

        result['positive_score'] = round(pos, 3)
        result['negative_score'] = round(neg, 3)
        result['polarity'] = polarity
        result['confidence'] = round(confidence, 3)
        result['method'] = 'rule-based'
        return result


class HybridAnalyzer:
    """Weighted combination of the lexicon-based and rule-based analyzers."""

    # Relative weight of each analyzer's scores in the combined result.
    _LEXICON_WEIGHT = 0.4
    _RULE_WEIGHT = 0.6

    def __init__(self, language: str):
        self.language = language
        self.lexicon_analyzer = LexiconBasedAnalyzer(language)
        self.rule_analyzer = RuleBasedAnalyzer(language)

    def _blend(self, lexicon_value: float, rule_value: float) -> float:
        """Return the weighted sum of one score from each analyzer."""
        return lexicon_value * self._LEXICON_WEIGHT + rule_value * self._RULE_WEIGHT

    def analyze(self, text: str) -> Dict:
        """Analyze ``text`` with both analyzers and merge their scores.

        Args:
            text: Raw input text.

        Returns:
            A dict with the combined ``polarity``, ``confidence``,
            ``positive_score`` and ``negative_score`` (rounded to 3
            decimals), the two underlying results under ``lexicon_result``
            and ``rule_result``, and ``method`` set to ``'hybrid'``.
            ``polarity`` is ``'neutral'`` with confidence 0.0 when the
            combined scores are both zero or exactly equal.
        """
        lexicon_result = self.lexicon_analyzer.analyze(text)
        rule_result = self.rule_analyzer.analyze(text)

        combined_positive = self._blend(
            lexicon_result['positive_score'], rule_result['positive_score']
        )
        combined_negative = self._blend(
            lexicon_result['negative_score'], rule_result['negative_score']
        )

        total = combined_positive + combined_negative
        if total == 0 or combined_positive == combined_negative:
            # Nothing scored, or an exact tie: neither polarity wins.
            polarity = 'neutral'
            confidence = 0.0
        elif combined_positive > combined_negative:
            polarity = 'positive'
            confidence = combined_positive / total
        else:
            polarity = 'negative'
            confidence = combined_negative / total

        return {
            'polarity': polarity,
            'confidence': round(confidence, 3),
            'positive_score': round(combined_positive, 3),
            'negative_score': round(combined_negative, 3),
            'lexicon_result': lexicon_result,
            'rule_result': rule_result,
            'method': 'hybrid'
        }


class MultilingualSentimentAnalyzer:
    """Main sentiment analyzer supporting multiple languages and methods"""

    def __init__(self, language: str = 'english', method: str = 'hybrid'):
        """
        Initialize sentiment analyzer

        Args:
            language: 'english', 'turkish', or 'persian' (case-insensitive)
            method: 'lexicon', 'rule', or 'hybrid' (case-insensitive); any
                other value selects the hybrid analyzer
        """
        self.language = language.lower()
        self.method = method.lower()

        # Dispatch on the normalized name so that e.g. 'Lexicon' selects the
        # lexicon analyzer instead of silently falling through to hybrid.
        analyzer_types = {
            'lexicon': LexiconBasedAnalyzer,
            'rule': RuleBasedAnalyzer,
        }
        analyzer_cls = analyzer_types.get(self.method, HybridAnalyzer)
        self.analyzer = analyzer_cls(self.language)

    @staticmethod
    def _error_result(message: str) -> Dict:
        """Build the neutral, zero-confidence result used to report a problem."""
        return {
            'polarity': 'neutral',
            'confidence': 0.0,
            'error': message
        }

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment of input text

        Never raises for bad input: empty, whitespace-only, ``None`` and
        non-string values, as well as any exception raised by the underlying
        analyzer, are reported as a neutral result with an ``error`` key.

        Returns:
            The analyzer's result dict extended with ``language``,
            ``text_length`` (characters) and ``word_count`` (whitespace
            separated words), or an error result as described above.
        """
        if not text:
            return self._error_result('Empty text provided')
        if not isinstance(text, str):
            # Previously this crashed on text.strip(), outside the try block.
            return self._error_result('Text must be a string')
        if not text.strip():
            return self._error_result('Empty text provided')

        try:
            result = self.analyzer.analyze(text)
            result['language'] = self.language
            result['text_length'] = len(text)
            result['word_count'] = len(text.split())
            return result
        except Exception as e:
            # Deliberate best-effort boundary: report the failure in-band.
            return self._error_result(str(e))

    def analyze_batch(self, texts: List[str]) -> List[Dict]:
        """Analyze multiple texts, returning one result per input in order"""
        return [self.analyze(text) for text in texts]

    def get_statistics(self, texts: List[str]) -> Dict:
        """Get aggregate statistics for a batch of texts

        Returns:
            A dict with ``total_texts``, ``polarity_distribution`` (count per
            polarity), ``polarity_percentages`` (rounded to 2 decimals) and
            ``average_confidence`` (rounded to 3 decimals, 0 for an empty
            batch).
        """
        results = self.analyze_batch(texts)

        polarity_counts = Counter(r['polarity'] for r in results)
        total = len(results)

        if total > 0:
            avg_confidence = sum(r.get('confidence', 0) for r in results) / total
        else:
            avg_confidence = 0

        return {
            'total_texts': total,
            'polarity_distribution': dict(polarity_counts),
            # An empty batch has an empty counter, so no division by zero.
            'polarity_percentages': {
                k: round(v / total * 100, 2)
                for k, v in polarity_counts.items()
            },
            'average_confidence': round(avg_confidence, 3)
        }