""" Multilingual Sentiment Analysis Tool Supports Turkish, Persian, and English using lexicon-based and machine learning approaches """ import re import json import os from typing import Dict, List, Tuple, Optional from collections import Counter import math class SentimentLexicon: """Base class for sentiment lexicons""" def __init__(self, language: str): self.language = language self.positive_words = set() self.negative_words = set() self.intensifiers = {} self.negation_words = set() self.diminishers = {} self.contrast_words = set() self.idioms_positive = [] self.idioms_negative = [] self._load_lexicon() def _load_lexicon(self): """Load language-specific sentiment lexicon""" lexicon_file = f"lexicons/{self.language}_lexicon.json" if os.path.exists(lexicon_file): with open(lexicon_file, 'r', encoding='utf-8') as f: data = json.load(f) self.positive_words = set(data.get('positive', [])) self.negative_words = set(data.get('negative', [])) self.intensifiers = data.get('intensifiers', {}) self.negation_words = set(data.get('negation', [])) self.diminishers = data.get('diminishers', {}) self.contrast_words = set(data.get('contrast_words', [])) self.idioms_positive = data.get('idioms_positive', []) self.idioms_negative = data.get('idioms_negative', []) else: # Default English lexicon self._load_default_english() def _load_default_english(self): """Load default English sentiment words""" self.positive_words = { 'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 'love', 'like', 'best', 'perfect', 'beautiful', 'nice', 'happy', 'pleased', 'satisfied', 'awesome', 'brilliant', 'outstanding' } self.negative_words = { 'bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'dislike', 'poor', 'disappointed', 'sad', 'angry', 'frustrated', 'annoying', 'boring', 'ugly', 'awful', 'disgusting', 'pathetic' } self.intensifiers = { 'very': 1.5, 'extremely': 2.0, 'really': 1.3, 'quite': 1.2, 'too': 1.4, 'so': 1.3, 'absolutely': 1.8, 'completely': 1.5 } self.negation_words = { 'not', 'no', 'never', 'none', 'nobody', 'nothing', 'nowhere', 'neither', 'cannot', "can't", "won't", "don't", "doesn't" } self.diminishers = {} self.contrast_words = set() self.idioms_positive = [] self.idioms_negative = [] class TextPreprocessor: """Text preprocessing for different languages""" def __init__(self, language: str): self.language = language def preprocess(self, text: str) -> List[str]: """Preprocess text and return tokens""" # Convert to lowercase text = text.lower() # Remove URLs text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Remove email addresses text = re.sub(r'\S+@\S+', '', text) # Remove special characters but keep punctuation for sentiment analysis text = re.sub(r'[^\w\s\.,!?;:()\-\']', '', text) # Tokenize tokens = re.findall(r'\b\w+\b|[.,!?;:()]', text) return tokens def normalize_turkish(self, text: str) -> str: """Normalize Turkish text (handle special characters)""" # Turkish character normalization replacements = { 'ı': 'i', 'İ': 'I', 'ğ': 'g', 'Ğ': 'G', 'ü': 'u', 'Ü': 'U', 'ş': 's', 'Ş': 'S', 'ö': 'o', 'Ö': 'O', 'ç': 'c', 'Ç': 'C' } for old, new in replacements.items(): text = text.replace(old, new) return text def normalize_persian(self, text: str) -> str: """Normalize Persian text (handle different character forms)""" # Persian/Arabic character normalization # This is a simplified version - real implementation would be more complex return text class LexiconBasedAnalyzer: """Lexicon-based sentiment analysis with enhanced features""" def __init__(self, language: str): self.language = language self.lexicon = SentimentLexicon(language) self.preprocessor = TextPreprocessor(language) def _check_idioms(self, text: str) -> Tuple[float, float]: """Check for sentiment idioms in text""" pos_score = 0.0 neg_score = 0.0 text_lower = text.lower() for idiom in self.lexicon.idioms_positive: if idiom.lower() in text_lower: pos_score += 2.0 # Idioms carry stronger sentiment for idiom in self.lexicon.idioms_negative: if idiom.lower() in text_lower: neg_score += 2.0 return pos_score, neg_score def analyze(self, text: str) -> Dict: """Analyze sentiment using lexicon-based approach""" tokens = self.preprocessor.preprocess(text) text_lower = text.lower() positive_score = 0 negative_score = 0 sentiment_words = [] negation_count = 0 # Check idioms first idiom_pos, idiom_neg = self._check_idioms(text) positive_score += idiom_pos negative_score += idiom_neg # Check for negation and intensifiers with improved scope detection window_size = 4 # Increased window for better context i = 0 while i < len(tokens): token = tokens[i] is_negated = False intensifier_strength = 1.0 diminisher_strength = 1.0 # Check for negation in window (improved scope) for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)): if tokens[j] in self.lexicon.negation_words: # Check if negation is still in scope (not interrupted by punctuation) if j < i: # Check for punctuation between negation and token has_punctuation = any( tokens[k] in ['.', '!', '?', ';', ','] for k in range(j + 1, i) ) if not has_punctuation: is_negated = True negation_count += 1 break # Check for intensifiers (look back up to 2 tokens) for k in range(max(0, i-2), i): if k >= 0 and tokens[k] in self.lexicon.intensifiers: intensifier_strength = max(intensifier_strength, self.lexicon.intensifiers[tokens[k]]) # Check for diminishers (look back up to 2 tokens) for k in range(max(0, i-2), i): if k >= 0 and tokens[k] in self.lexicon.diminishers: diminisher_strength = min(diminisher_strength, self.lexicon.diminishers[tokens[k]]) # Check sentiment if token in self.lexicon.positive_words: score = 1.0 * intensifier_strength * diminisher_strength if is_negated: negative_score += score sentiment_words.append(('negative', token, is_negated)) else: positive_score += score sentiment_words.append(('positive', token, is_negated)) elif token in self.lexicon.negative_words: score = 1.0 * intensifier_strength * diminisher_strength if is_negated: positive_score += score sentiment_words.append(('positive', token, is_negated)) else: negative_score += score sentiment_words.append(('negative', token, is_negated)) i += 1 # Calculate final sentiment with improved scoring # Normalize scores to prevent extreme values from dominating total_raw = positive_score + negative_score if total_raw > 0: # Use logarithmic scaling for better balance (but keep original for display) pos_normalized = positive_score / total_raw neg_normalized = negative_score / total_raw else: pos_normalized = 0.0 neg_normalized = 0.0 if total_raw == 0: polarity = 'neutral' confidence = 0.0 elif positive_score > negative_score: polarity = 'positive' confidence = pos_normalized else: polarity = 'negative' confidence = neg_normalized return { 'polarity': polarity, 'confidence': round(confidence, 3), 'positive_score': round(positive_score, 3), 'negative_score': round(negative_score, 3), 'sentiment_words': sentiment_words[:10], # Limit to first 10 'method': 'lexicon-based' } class RuleBasedAnalyzer: """Rule-based sentiment analysis with advanced linguistic rules""" def __init__(self, language: str): self.language = language self.lexicon = SentimentLexicon(language) self.preprocessor = TextPreprocessor(language) def _detect_emoticons(self, text: str) -> Tuple[float, float]: """Detect and score emoticons and emojis""" pos_score = 0.0 neg_score = 0.0 # Positive emoticons positive_emoticons = [ ':)', ':-)', '=)', ';)', ';-)', '=D', ':D', ':-D', '😊', '😀', '😁', '😂', '🤣', '😃', '😄', '😆', '😍', '🥰', '😎', '🤗', '👍', '👏', '🎉', '❤️', '💕', '💖', '💗', '💓' ] # Negative emoticons negative_emoticons = [ ':(', ':-(', '=(', ':/', ':-/', ':|', ':-|', '>:(', '>:(', '😢', '😞', '😠', '😡', '😤', '😭', '😰', '😨', '😱', '😖', '😣', '😫', '😩', '👎', '💔', '😒', '😔', '😕', '🙁' ] for emoji in positive_emoticons: count = text.count(emoji) pos_score += count * 1.5 for emoji in negative_emoticons: count = text.count(emoji) neg_score += count * 1.5 return pos_score, neg_score def _handle_contrast_words(self, text: str, tokens: List[str], pos_score: float, neg_score: float) -> Tuple[float, float]: """Handle contrast words that may shift sentiment""" # Find contrast words and adjust sentiment contrast_positions = [] for i, token in enumerate(tokens): if token.lower() in self.lexicon.contrast_words: contrast_positions.append(i) # If contrast word found, reduce weight of sentiment before it if contrast_positions: # Simple heuristic: reduce earlier sentiment by 30% reduction_factor = 0.7 return pos_score * reduction_factor, neg_score * reduction_factor return pos_score, neg_score def _detect_comparatives_superlatives(self, tokens: List[str]) -> float: """Detect comparative and superlative forms that intensify sentiment""" multiplier = 1.0 # Check for superlatives superlative_indicators = ['most', 'best', 'worst', 'least', 'greatest'] for token in tokens: if token.lower() in superlative_indicators: multiplier = max(multiplier, 1.4) # Check for comparatives comparative_patterns = ['more', 'less', 'better', 'worse', 'greater', 'smaller'] for token in tokens: if token.lower() in comparative_patterns: multiplier = max(multiplier, 1.2) return multiplier def _detect_repetition(self, text: str) -> float: """Detect repeated characters/words that indicate emphasis""" multiplier = 1.0 # Repeated characters (e.g., "soooo good") repeated_chars = re.findall(r'(\w)\1{2,}', text.lower()) if repeated_chars: multiplier += len(repeated_chars) * 0.1 # Repeated words (e.g., "good good good") words = text.lower().split() if len(words) > 2: for i in range(len(words) - 2): if words[i] == words[i+1] == words[i+2]: multiplier += 0.2 break return min(multiplier, 1.5) # Cap at 1.5x def _detect_sentiment_shifters(self, text: str) -> float: """Detect words that shift sentiment polarity""" shifters = { 'but': 0.6, 'however': 0.6, 'although': 0.7, 'though': 0.7, 'yet': 0.6, 'still': 0.7, 'nevertheless': 0.6, 'nonetheless': 0.6 } text_lower = text.lower() for shifter, factor in shifters.items(): if shifter in text_lower: return factor return 1.0 def analyze(self, text: str) -> Dict: """Analyze sentiment using rule-based approach with advanced rules""" # Use lexicon-based as base base_analyzer = LexiconBasedAnalyzer(self.language) result = base_analyzer.analyze(text) # Apply advanced rules tokens = self.preprocessor.preprocess(text) text_lower = text.lower() # Rule 1: Exclamation marks increase sentiment strength exclamation_count = text.count('!') if exclamation_count > 0: multiplier = 1 + min(exclamation_count * 0.15, 0.5) # Cap at 50% increase result['positive_score'] *= multiplier result['negative_score'] *= multiplier # Rule 2: Question marks may indicate uncertainty or sarcasm question_count = text.count('?') if question_count > 1: uncertainty_factor = max(0.7, 1 - (question_count * 0.1)) result['confidence'] *= uncertainty_factor # Rule 3: All caps increase sentiment strength (but check length) caps_words = [w for w in text.split() if w.isupper() and len(w) > 2] if len(caps_words) > 0: caps_multiplier = 1 + (len(caps_words) * 0.1) result['positive_score'] *= caps_multiplier result['negative_score'] *= caps_multiplier # Rule 4: Enhanced emoticon detection emoji_pos, emoji_neg = self._detect_emoticons(text) result['positive_score'] += emoji_pos result['negative_score'] += emoji_neg # Rule 5: Contrast words handling result['positive_score'], result['negative_score'] = self._handle_contrast_words( text, tokens, result['positive_score'], result['negative_score'] ) # Rule 6: Comparatives and superlatives comp_super_mult = self._detect_comparatives_superlatives(tokens) result['positive_score'] *= comp_super_mult result['negative_score'] *= comp_super_mult # Rule 7: Repetition detection rep_mult = self._detect_repetition(text) result['positive_score'] *= rep_mult result['negative_score'] *= rep_mult # Rule 8: Sentiment shifters shifter_factor = self._detect_sentiment_shifters(text) if shifter_factor < 1.0: # Reduce earlier sentiment result['positive_score'] *= shifter_factor result['negative_score'] *= shifter_factor # Rule 9: Ellipsis may indicate uncertainty or trailing off if '...' in text or '…' in text: result['confidence'] *= 0.9 # Rule 10: Multiple punctuation (e.g., "!!!") increases emphasis multi_punct = re.findall(r'[!?]{2,}', text) if multi_punct: punct_mult = 1 + (len(multi_punct) * 0.1) result['positive_score'] *= punct_mult result['negative_score'] *= punct_mult # Rule 11: Hashtags in social media context hashtags = re.findall(r'#\w+', text) if hashtags: # Check if hashtags contain sentiment words for tag in hashtags: tag_lower = tag.lower() if any(word in tag_lower for word in self.lexicon.positive_words): result['positive_score'] += 0.5 if any(word in tag_lower for word in self.lexicon.negative_words): result['negative_score'] += 0.5 # Rule 12: URL presence may indicate spam or less personal content if re.search(r'http[s]?://', text): result['confidence'] *= 0.95 # Rule 13: Length-based confidence adjustment word_count = len(text.split()) if word_count < 3: result['confidence'] *= 0.8 # Very short texts are less reliable elif word_count > 100: result['confidence'] *= 0.95 # Very long texts may have mixed sentiment # Recalculate polarity total = result['positive_score'] + result['negative_score'] if total > 0: if result['positive_score'] > result['negative_score']: result['polarity'] = 'positive' result['confidence'] = result['positive_score'] / total else: result['polarity'] = 'negative' result['confidence'] = result['negative_score'] / total else: result['polarity'] = 'neutral' result['confidence'] = 0.0 result['method'] = 'rule-based' return result class HybridAnalyzer: """Hybrid approach combining lexicon, rules, and simple ML features""" def __init__(self, language: str): self.language = language self.lexicon_analyzer = LexiconBasedAnalyzer(language) self.rule_analyzer = RuleBasedAnalyzer(language) def analyze(self, text: str) -> Dict: """Analyze sentiment using hybrid approach""" # Get results from both methods lexicon_result = self.lexicon_analyzer.analyze(text) rule_result = self.rule_analyzer.analyze(text) # Combine scores with weights lexicon_weight = 0.4 rule_weight = 0.6 combined_positive = (lexicon_result['positive_score'] * lexicon_weight + rule_result['positive_score'] * rule_weight) combined_negative = (lexicon_result['negative_score'] * lexicon_weight + rule_result['negative_score'] * rule_weight) total = combined_positive + combined_negative if total == 0: polarity = 'neutral' confidence = 0.0 elif combined_positive > combined_negative: polarity = 'positive' confidence = combined_positive / total else: polarity = 'negative' confidence = combined_negative / total return { 'polarity': polarity, 'confidence': round(confidence, 3), 'positive_score': round(combined_positive, 3), 'negative_score': round(combined_negative, 3), 'lexicon_result': lexicon_result, 'rule_result': rule_result, 'method': 'hybrid' } class MultilingualSentimentAnalyzer: """Main sentiment analyzer supporting multiple languages and methods""" def __init__(self, language: str = 'english', method: str = 'hybrid'): """ Initialize sentiment analyzer Args: language: 'english', 'turkish', or 'persian' method: 'lexicon', 'rule', or 'hybrid' """ self.language = language.lower() self.method = method.lower() if method == 'lexicon': self.analyzer = LexiconBasedAnalyzer(self.language) elif method == 'rule': self.analyzer = RuleBasedAnalyzer(self.language) else: # hybrid self.analyzer = HybridAnalyzer(self.language) def analyze(self, text: str) -> Dict: """Analyze sentiment of input text""" if not text or not text.strip(): return { 'polarity': 'neutral', 'confidence': 0.0, 'error': 'Empty text provided' } try: result = self.analyzer.analyze(text) result['language'] = self.language result['text_length'] = len(text) result['word_count'] = len(text.split()) return result except Exception as e: return { 'polarity': 'neutral', 'confidence': 0.0, 'error': str(e) } def analyze_batch(self, texts: List[str]) -> List[Dict]: """Analyze multiple texts""" return [self.analyze(text) for text in texts] def get_statistics(self, texts: List[str]) -> Dict: """Get aggregate statistics for a batch of texts""" results = self.analyze_batch(texts) polarity_counts = Counter([r['polarity'] for r in results]) total = len(results) avg_confidence = sum([r.get('confidence', 0) for r in results]) / total if total > 0 else 0 return { 'total_texts': total, 'polarity_distribution': dict(polarity_counts), 'polarity_percentages': { k: round(v / total * 100, 2) for k, v in polarity_counts.items() }, 'average_confidence': round(avg_confidence, 3) }