# Sentiment / sentiment_analyzer.py (upload header removed; was web-page residue)
"""
Multilingual Sentiment Analysis Tool
Supports Turkish, Persian, and English using lexicon-based and machine learning approaches
"""
import re
import json
import os
from typing import Dict, List, Tuple, Optional
from collections import Counter
import math
class SentimentLexicon:
    """Sentiment lexicon for one language.

    Loads word lists from ``lexicons/<language>_lexicon.json`` (relative to
    the current working directory) when present; otherwise falls back to a
    small built-in English lexicon.
    """

    def __init__(self, language: str):
        self.language = language
        self.positive_words = set()   # single words with positive polarity
        self.negative_words = set()   # single words with negative polarity
        self.intensifiers = {}        # word -> strength multiplier (> 1.0)
        self.negation_words = set()   # words that flip polarity in scope
        self.diminishers = {}         # word -> damping multiplier (< 1.0)
        self.contrast_words = set()   # e.g. "but": down-weights earlier text
        self.idioms_positive = []     # multi-word positive expressions
        self.idioms_negative = []     # multi-word negative expressions
        self._load_lexicon()

    def _load_lexicon(self):
        """Load the language-specific lexicon from JSON, if it exists.

        Falls back to the built-in English defaults when the file is missing.
        """
        lexicon_file = os.path.join('lexicons', f'{self.language}_lexicon.json')
        if os.path.exists(lexicon_file):
            with open(lexicon_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Missing keys default to empty collections so a partial lexicon
            # file still loads cleanly.
            self.positive_words = set(data.get('positive', []))
            self.negative_words = set(data.get('negative', []))
            self.intensifiers = data.get('intensifiers', {})
            self.negation_words = set(data.get('negation', []))
            self.diminishers = data.get('diminishers', {})
            self.contrast_words = set(data.get('contrast_words', []))
            self.idioms_positive = data.get('idioms_positive', [])
            self.idioms_negative = data.get('idioms_negative', [])
        else:
            # Default English lexicon
            self._load_default_english()

    def _load_default_english(self):
        """Populate the built-in default English sentiment vocabulary."""
        self.positive_words = {
            'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic',
            'love', 'like', 'best', 'perfect', 'beautiful', 'nice', 'happy',
            'pleased', 'satisfied', 'awesome', 'brilliant', 'outstanding'
        }
        # Note: the original literal listed 'awful' twice; duplicate removed
        # (a set holds it once either way).
        self.negative_words = {
            'bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'dislike',
            'poor', 'disappointed', 'sad', 'angry', 'frustrated', 'annoying',
            'boring', 'ugly', 'disgusting', 'pathetic'
        }
        self.intensifiers = {
            'very': 1.5, 'extremely': 2.0, 'really': 1.3, 'quite': 1.2,
            'too': 1.4, 'so': 1.3, 'absolutely': 1.8, 'completely': 1.5
        }
        self.negation_words = {
            'not', 'no', 'never', 'none', 'nobody', 'nothing', 'nowhere',
            'neither', 'cannot', "can't", "won't", "don't", "doesn't"
        }
        self.diminishers = {}
        self.contrast_words = set()
        self.idioms_positive = []
        self.idioms_negative = []
class TextPreprocessor:
    """Language-aware text preprocessing utilities."""

    # Mapping of Turkish-specific letters to their ASCII counterparts.
    _TURKISH_MAP = str.maketrans({
        'ı': 'i', 'İ': 'I',
        'ğ': 'g', 'Ğ': 'G',
        'ü': 'u', 'Ü': 'U',
        'ş': 's', 'Ş': 'S',
        'ö': 'o', 'Ö': 'O',
        'ç': 'c', 'Ç': 'C',
    })

    def __init__(self, language: str):
        self.language = language

    def preprocess(self, text: str) -> List[str]:
        """Lower-case, strip URLs/e-mails/odd symbols, and tokenize *text*."""
        cleaned = text.lower()
        # Strip URLs first so their fragments never reach the tokenizer.
        cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)
        # Strip e-mail addresses.
        cleaned = re.sub(r'\S+@\S+', '', cleaned)
        # Drop special characters, keeping sentiment-relevant punctuation.
        cleaned = re.sub(r'[^\w\s\.,!?;:()\-\']', '', cleaned)
        # Words and individual punctuation marks become separate tokens.
        return re.findall(r'\b\w+\b|[.,!?;:()]', cleaned)

    def normalize_turkish(self, text: str) -> str:
        """Replace Turkish-specific characters with ASCII equivalents."""
        return text.translate(self._TURKISH_MAP)

    def normalize_persian(self, text: str) -> str:
        """Normalize Persian text (handle different character forms).

        Simplified placeholder: currently returns the text unchanged.
        """
        return text
class LexiconBasedAnalyzer:
    """Lexicon-based sentiment analysis with enhanced features.

    Scores each token against the lexicon, honouring negation scope,
    intensifiers, diminishers, and multi-word idioms.
    """

    def __init__(self, language: str):
        self.language = language
        self.lexicon = SentimentLexicon(language)
        self.preprocessor = TextPreprocessor(language)

    def _check_idioms(self, text: str) -> Tuple[float, float]:
        """Return (positive, negative) idiom scores found in *text*."""
        pos_score = 0.0
        neg_score = 0.0
        text_lower = text.lower()
        for idiom in self.lexicon.idioms_positive:
            if idiom.lower() in text_lower:
                pos_score += 2.0  # Idioms carry stronger sentiment
        for idiom in self.lexicon.idioms_negative:
            if idiom.lower() in text_lower:
                neg_score += 2.0
        return pos_score, neg_score

    def _is_negated(self, tokens: List[str], i: int, window_size: int) -> bool:
        """True if token *i* is preceded, within *window_size* tokens, by a
        negation word whose scope is not cut off by punctuation.

        Fix: the original scanned the window on BOTH sides of the token but
        only positions before it could ever set the flag, so the forward half
        was dead work; only the preceding window is scanned now.
        """
        clause_breaks = {'.', '!', '?', ';', ','}
        for j in range(max(0, i - window_size), i):
            if tokens[j] in self.lexicon.negation_words:
                # Punctuation between the negation and the token ends its scope.
                if not any(tokens[k] in clause_breaks for k in range(j + 1, i)):
                    return True
        return False

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment of *text* using the lexicon-based approach.

        Returns a dict with polarity ('positive'/'negative'/'neutral'),
        confidence in [0, 1], raw scores, up to 10 matched sentiment words,
        and the method name.
        """
        tokens = self.preprocessor.preprocess(text)
        positive_score = 0.0
        negative_score = 0.0
        sentiment_words = []
        window_size = 4  # negation look-back window (tokens)

        # Idioms are matched on the raw text, before tokenization.
        idiom_pos, idiom_neg = self._check_idioms(text)
        positive_score += idiom_pos
        negative_score += idiom_neg

        for i, token in enumerate(tokens):
            is_positive = token in self.lexicon.positive_words
            is_negative = token in self.lexicon.negative_words
            if not (is_positive or is_negative):
                # Negation/intensifier context only matters for sentiment
                # words, so skip the window scans for everything else
                # (the original ran them for every token, including
                # punctuation, and tracked an unused negation counter).
                continue

            is_negated = self._is_negated(tokens, i, window_size)

            # Intensifiers/diminishers: look back up to 2 tokens.
            intensifier_strength = 1.0
            diminisher_strength = 1.0
            for k in range(max(0, i - 2), i):
                prev = tokens[k]
                if prev in self.lexicon.intensifiers:
                    intensifier_strength = max(intensifier_strength,
                                               self.lexicon.intensifiers[prev])
                if prev in self.lexicon.diminishers:
                    diminisher_strength = min(diminisher_strength,
                                              self.lexicon.diminishers[prev])

            score = 1.0 * intensifier_strength * diminisher_strength
            # A negated sentiment word contributes to the OPPOSITE polarity.
            if is_positive:
                if is_negated:
                    negative_score += score
                    sentiment_words.append(('negative', token, is_negated))
                else:
                    positive_score += score
                    sentiment_words.append(('positive', token, is_negated))
            else:
                if is_negated:
                    positive_score += score
                    sentiment_words.append(('positive', token, is_negated))
                else:
                    negative_score += score
                    sentiment_words.append(('negative', token, is_negated))

        # Confidence is the winning polarity's share of the total score.
        total_raw = positive_score + negative_score
        if total_raw == 0:
            polarity, confidence = 'neutral', 0.0
        elif positive_score > negative_score:
            polarity, confidence = 'positive', positive_score / total_raw
        else:
            # Ties deliberately resolve to 'negative', as in the original.
            polarity, confidence = 'negative', negative_score / total_raw

        return {
            'polarity': polarity,
            'confidence': round(confidence, 3),
            'positive_score': round(positive_score, 3),
            'negative_score': round(negative_score, 3),
            'sentiment_words': sentiment_words[:10],  # Limit to first 10
            'method': 'lexicon-based'
        }
class RuleBasedAnalyzer:
    """Rule-based sentiment analysis with advanced linguistic rules.

    Starts from a lexicon-based score and adjusts it with punctuation,
    emoticon, contrast, comparative, repetition, and shifter rules.
    """

    def __init__(self, language: str):
        self.language = language
        self.lexicon = SentimentLexicon(language)
        self.preprocessor = TextPreprocessor(language)
        # Fix: build the base analyzer once; the original constructed a new
        # LexiconBasedAnalyzer (re-loading the lexicon) on every analyze() call.
        self._base_analyzer = LexiconBasedAnalyzer(language)

    def _detect_emoticons(self, text: str) -> Tuple[float, float]:
        """Return (positive, negative) scores from emoticons/emoji in *text*.

        Each occurrence is worth 1.5 points on its side.
        """
        pos_score = 0.0
        neg_score = 0.0
        # Positive emoticons
        positive_emoticons = [
            ':)', ':-)', '=)', ';)', ';-)', '=D', ':D', ':-D',
            '😊', '😀', '😁', '😂', '🤣', '😃', '😄', '😆', '😍', '🥰',
            '😎', '🤗', '👍', '👏', '🎉', '❤️', '💕', '💖', '💗', '💓'
        ]
        # Negative emoticons.  Fix: the original listed '>:(' twice, which
        # double-counted every occurrence of that emoticon.
        negative_emoticons = [
            ':(', ':-(', '=(', ':/', ':-/', ':|', ':-|', '>:(',
            '😢', '😞', '😠', '😡', '😤', '😭', '😰', '😨', '😱', '😖',
            '😣', '😫', '😩', '👎', '💔', '😒', '😔', '😕', '🙁'
        ]
        for emoji in positive_emoticons:
            pos_score += text.count(emoji) * 1.5
        for emoji in negative_emoticons:
            neg_score += text.count(emoji) * 1.5
        return pos_score, neg_score

    def _handle_contrast_words(self, text: str, tokens: List[str],
                               pos_score: float, neg_score: float) -> Tuple[float, float]:
        """Down-weight both scores by 30% when a contrast word is present."""
        contrast_positions = [
            i for i, token in enumerate(tokens)
            if token.lower() in self.lexicon.contrast_words
        ]
        if contrast_positions:
            # Simple heuristic: reduce sentiment weight by 30%
            reduction_factor = 0.7
            return pos_score * reduction_factor, neg_score * reduction_factor
        return pos_score, neg_score

    def _detect_comparatives_superlatives(self, tokens: List[str]) -> float:
        """Return an intensity multiplier for comparative/superlative forms."""
        multiplier = 1.0
        superlative_indicators = ['most', 'best', 'worst', 'least', 'greatest']
        comparative_patterns = ['more', 'less', 'better', 'worse', 'greater', 'smaller']
        for token in tokens:
            lowered = token.lower()
            if lowered in superlative_indicators:
                multiplier = max(multiplier, 1.4)
            if lowered in comparative_patterns:
                multiplier = max(multiplier, 1.2)
        return multiplier

    def _detect_repetition(self, text: str) -> float:
        """Return an emphasis multiplier for repeated chars/words, capped at 1.5."""
        multiplier = 1.0
        # Repeated characters (e.g., "soooo good")
        repeated_chars = re.findall(r'(\w)\1{2,}', text.lower())
        if repeated_chars:
            multiplier += len(repeated_chars) * 0.1
        # Repeated words (e.g., "good good good")
        words = text.lower().split()
        if len(words) > 2:
            for i in range(len(words) - 2):
                if words[i] == words[i + 1] == words[i + 2]:
                    multiplier += 0.2
                    break
        return min(multiplier, 1.5)  # Cap at 1.5x

    def _detect_sentiment_shifters(self, text: str) -> float:
        """Return a damping factor (< 1.0) when a shifter word is present."""
        shifters = {
            'but': 0.6, 'however': 0.6, 'although': 0.7, 'though': 0.7,
            'yet': 0.6, 'still': 0.7, 'nevertheless': 0.6, 'nonetheless': 0.6
        }
        text_lower = text.lower()
        for shifter, factor in shifters.items():
            if shifter in text_lower:
                return factor
        return 1.0

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment using the rule-based approach.

        Applies score-adjusting rules (1, 3-8, 10, 11), recalculates polarity
        from the adjusted scores, then applies the confidence penalties
        (rules 2, 9, 12, 13).  Fix: the original multiplied
        result['confidence'] in place for those rules and then clobbered it
        by recomputing confidence from the scores, so the penalties never
        had any effect; they are now accumulated and applied afterwards.
        """
        result = self._base_analyzer.analyze(text)
        tokens = self.preprocessor.preprocess(text)
        confidence_factor = 1.0  # accumulated penalty from rules 2, 9, 12, 13

        # Rule 1: Exclamation marks increase sentiment strength
        exclamation_count = text.count('!')
        if exclamation_count > 0:
            multiplier = 1 + min(exclamation_count * 0.15, 0.5)  # Cap at +50%
            result['positive_score'] *= multiplier
            result['negative_score'] *= multiplier

        # Rule 2: Multiple question marks suggest uncertainty or sarcasm
        question_count = text.count('?')
        if question_count > 1:
            confidence_factor *= max(0.7, 1 - (question_count * 0.1))

        # Rule 3: All-caps words (length > 2) increase sentiment strength
        caps_words = [w for w in text.split() if w.isupper() and len(w) > 2]
        if caps_words:
            caps_multiplier = 1 + (len(caps_words) * 0.1)
            result['positive_score'] *= caps_multiplier
            result['negative_score'] *= caps_multiplier

        # Rule 4: Enhanced emoticon detection
        emoji_pos, emoji_neg = self._detect_emoticons(text)
        result['positive_score'] += emoji_pos
        result['negative_score'] += emoji_neg

        # Rule 5: Contrast words handling
        result['positive_score'], result['negative_score'] = self._handle_contrast_words(
            text, tokens, result['positive_score'], result['negative_score']
        )

        # Rule 6: Comparatives and superlatives
        comp_super_mult = self._detect_comparatives_superlatives(tokens)
        result['positive_score'] *= comp_super_mult
        result['negative_score'] *= comp_super_mult

        # Rule 7: Repetition detection
        rep_mult = self._detect_repetition(text)
        result['positive_score'] *= rep_mult
        result['negative_score'] *= rep_mult

        # Rule 8: Sentiment shifters damp both scores
        shifter_factor = self._detect_sentiment_shifters(text)
        if shifter_factor < 1.0:
            result['positive_score'] *= shifter_factor
            result['negative_score'] *= shifter_factor

        # Rule 9: Ellipsis may indicate uncertainty or trailing off
        if '...' in text or '…' in text:
            confidence_factor *= 0.9

        # Rule 10: Multiple punctuation (e.g., "!!!") increases emphasis
        multi_punct = re.findall(r'[!?]{2,}', text)
        if multi_punct:
            punct_mult = 1 + (len(multi_punct) * 0.1)
            result['positive_score'] *= punct_mult
            result['negative_score'] *= punct_mult

        # Rule 11: Hashtags containing sentiment words add a small bonus
        hashtags = re.findall(r'#\w+', text)
        for tag in hashtags:
            tag_lower = tag.lower()
            if any(word in tag_lower for word in self.lexicon.positive_words):
                result['positive_score'] += 0.5
            if any(word in tag_lower for word in self.lexicon.negative_words):
                result['negative_score'] += 0.5

        # Rule 12: URL presence may indicate spam or less personal content
        if re.search(r'http[s]?://', text):
            confidence_factor *= 0.95

        # Rule 13: Length-based confidence adjustment
        word_count = len(text.split())
        if word_count < 3:
            confidence_factor *= 0.8   # Very short texts are less reliable
        elif word_count > 100:
            confidence_factor *= 0.95  # Very long texts may mix sentiment

        # Recalculate polarity from the adjusted scores...
        total = result['positive_score'] + result['negative_score']
        if total > 0:
            if result['positive_score'] > result['negative_score']:
                result['polarity'] = 'positive'
                result['confidence'] = result['positive_score'] / total
            else:
                result['polarity'] = 'negative'
                result['confidence'] = result['negative_score'] / total
        else:
            result['polarity'] = 'neutral'
            result['confidence'] = 0.0
        # ...then apply the accumulated confidence penalties.
        result['confidence'] *= confidence_factor
        result['method'] = 'rule-based'
        return result
class HybridAnalyzer:
    """Hybrid approach combining lexicon, rules, and simple ML features"""

    # Relative contribution of each underlying analyzer: (lexicon, rule).
    _WEIGHTS = (0.4, 0.6)

    def __init__(self, language: str):
        self.language = language
        self.lexicon_analyzer = LexiconBasedAnalyzer(language)
        self.rule_analyzer = RuleBasedAnalyzer(language)

    def analyze(self, text: str) -> Dict:
        """Blend lexicon-based and rule-based scores into a single verdict."""
        lexicon_result = self.lexicon_analyzer.analyze(text)
        rule_result = self.rule_analyzer.analyze(text)

        w_lex, w_rule = self._WEIGHTS
        blended_pos = (lexicon_result['positive_score'] * w_lex
                       + rule_result['positive_score'] * w_rule)
        blended_neg = (lexicon_result['negative_score'] * w_lex
                       + rule_result['negative_score'] * w_rule)

        # Confidence is the winning polarity's share of the blended total.
        total = blended_pos + blended_neg
        if total == 0:
            polarity, confidence = 'neutral', 0.0
        elif blended_pos > blended_neg:
            polarity, confidence = 'positive', blended_pos / total
        else:
            polarity, confidence = 'negative', blended_neg / total

        return {
            'polarity': polarity,
            'confidence': round(confidence, 3),
            'positive_score': round(blended_pos, 3),
            'negative_score': round(blended_neg, 3),
            'lexicon_result': lexicon_result,
            'rule_result': rule_result,
            'method': 'hybrid'
        }
class MultilingualSentimentAnalyzer:
    """Main sentiment analyzer supporting multiple languages and methods."""

    def __init__(self, language: str = 'english', method: str = 'hybrid'):
        """
        Initialize sentiment analyzer

        Args:
            language: 'english', 'turkish', or 'persian'
            method: 'lexicon', 'rule', or 'hybrid'
        """
        self.language = language.lower()
        self.method = method.lower()
        # Fix: dispatch on the normalized name; the original compared the raw
        # argument, so e.g. method='Lexicon' silently fell through to hybrid
        # even though self.method was 'lexicon'.
        if self.method == 'lexicon':
            self.analyzer = LexiconBasedAnalyzer(self.language)
        elif self.method == 'rule':
            self.analyzer = RuleBasedAnalyzer(self.language)
        else:  # hybrid (default for any unrecognized value)
            self.analyzer = HybridAnalyzer(self.language)

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment of *text*; never raises.

        Empty/whitespace-only input and analyzer failures both yield a
        neutral result carrying an 'error' key.
        """
        if not text or not text.strip():
            return {
                'polarity': 'neutral',
                'confidence': 0.0,
                'error': 'Empty text provided'
            }
        try:
            result = self.analyzer.analyze(text)
            result['language'] = self.language
            result['text_length'] = len(text)
            result['word_count'] = len(text.split())
            return result
        except Exception as e:
            # Deliberate top-level boundary: surface the failure in the
            # result instead of propagating it to batch callers.
            return {
                'polarity': 'neutral',
                'confidence': 0.0,
                'error': str(e)
            }

    def analyze_batch(self, texts: List[str]) -> List[Dict]:
        """Analyze each text in *texts*, returning one result dict per text."""
        return [self.analyze(text) for text in texts]

    def get_statistics(self, texts: List[str]) -> Dict:
        """Return aggregate polarity distribution and average confidence."""
        results = self.analyze_batch(texts)
        polarity_counts = Counter(r['polarity'] for r in results)
        total = len(results)
        avg_confidence = (
            sum(r.get('confidence', 0) for r in results) / total if total > 0 else 0
        )
        return {
            'total_texts': total,
            'polarity_distribution': dict(polarity_counts),
            # Safe when total == 0: an empty Counter yields no items.
            'polarity_percentages': {
                k: round(v / total * 100, 2)
                for k, v in polarity_counts.items()
            },
            'average_confidence': round(avg_confidence, 3)
        }