|
|
""" |
|
|
Multilingual Sentiment Analysis Tool |
|
|
Supports Turkish, Persian, and English using lexicon-based and machine learning approaches |
|
|
""" |
|
|
|
|
|
import re |
|
|
import json |
|
|
import os |
|
|
from typing import Dict, List, Tuple, Optional |
|
|
from collections import Counter |
|
|
import math |
|
|
|
|
|
|
|
|
class SentimentLexicon:
    """Sentiment lexicon for a single language.

    Holds positive/negative word sets, intensifier/diminisher weight maps,
    negation and contrast word sets, and sentiment idiom lists.  Data is
    loaded from ``lexicons/<language>_lexicon.json``; when that file is
    missing, a small built-in English lexicon is used as a fallback.
    NOTE(review): the fallback is English even for Turkish/Persian — ship
    the JSON lexicon files for non-English languages.
    """

    def __init__(self, language: str):
        self.language = language
        # Word-membership sets consulted by the analyzers.
        self.positive_words: set = set()
        self.negative_words: set = set()
        # Modifier word -> multiplicative weight (>1 amplifies, <1 dampens).
        self.intensifiers: Dict[str, float] = {}
        self.diminishers: Dict[str, float] = {}
        self.negation_words: set = set()
        self.contrast_words: set = set()
        # Multi-word expressions matched as raw-text substrings.
        self.idioms_positive: List[str] = []
        self.idioms_negative: List[str] = []
        self._load_lexicon()

    def _load_lexicon(self):
        """Load the language-specific lexicon JSON, or the English fallback."""
        lexicon_file = f"lexicons/{self.language}_lexicon.json"
        if os.path.exists(lexicon_file):
            with open(lexicon_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self.positive_words = set(data.get('positive', []))
            self.negative_words = set(data.get('negative', []))
            self.intensifiers = data.get('intensifiers', {})
            self.negation_words = set(data.get('negation', []))
            self.diminishers = data.get('diminishers', {})
            self.contrast_words = set(data.get('contrast_words', []))
            self.idioms_positive = data.get('idioms_positive', [])
            self.idioms_negative = data.get('idioms_negative', [])
        else:
            # No lexicon file on disk: fall back to built-in English data.
            self._load_default_english()

    def _load_default_english(self):
        """Populate the built-in default English sentiment words."""
        self.positive_words = {
            'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic',
            'love', 'like', 'best', 'perfect', 'beautiful', 'nice', 'happy',
            'pleased', 'satisfied', 'awesome', 'brilliant', 'outstanding'
        }
        # ('awful' was listed twice in the original literal; duplicates are
        # harmless in a set but misleading to maintainers.)
        self.negative_words = {
            'bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'dislike',
            'poor', 'disappointed', 'sad', 'angry', 'frustrated', 'annoying',
            'boring', 'ugly', 'disgusting', 'pathetic'
        }
        self.intensifiers = {
            'very': 1.5, 'extremely': 2.0, 'really': 1.3, 'quite': 1.2,
            'too': 1.4, 'so': 1.3, 'absolutely': 1.8, 'completely': 1.5
        }
        self.negation_words = {
            'not', 'no', 'never', 'none', 'nobody', 'nothing', 'nowhere',
            'neither', 'cannot', "can't", "won't", "don't", "doesn't"
        }
        # The default English lexicon ships no diminishers, contrast words
        # or idioms; these features activate only via a JSON lexicon.
        self.diminishers = {}
        self.contrast_words = set()
        self.idioms_positive = []
        self.idioms_negative = []
|
|
|
|
|
|
|
class TextPreprocessor:
    """Language-aware text preprocessing: lowercasing, noise removal,
    tokenization, and per-language character normalization."""

    # Turkish-specific letters mapped to their closest ASCII equivalents.
    _TURKISH_TABLE = str.maketrans({
        'ı': 'i', 'İ': 'I',
        'ğ': 'g', 'Ğ': 'G',
        'ü': 'u', 'Ü': 'U',
        'ş': 's', 'Ş': 'S',
        'ö': 'o', 'Ö': 'O',
        'ç': 'c', 'Ç': 'C',
    })

    # Arabic code points commonly typed in place of their Persian
    # counterparts (Arabic yeh/kaf, teh marbuta, Arabic-Indic digits 4-6,
    # which differ from the Extended Arabic-Indic forms used in Persian).
    _PERSIAN_TABLE = str.maketrans({
        '\u064a': '\u06cc',  # ي -> ی (Arabic yeh -> Farsi yeh)
        '\u0643': '\u06a9',  # ك -> ک (Arabic kaf -> keheh)
        '\u0629': '\u0647',  # ة -> ه (teh marbuta -> heh)
        '\u0664': '\u06f4',  # ٤ -> ۴
        '\u0665': '\u06f5',  # ٥ -> ۵
        '\u0666': '\u06f6',  # ٦ -> ۶
    })

    def __init__(self, language: str):
        self.language = language

    def preprocess(self, text: str) -> List[str]:
        """Lowercase, strip URLs/e-mails/odd symbols, and tokenize *text*.

        Returns word tokens plus retained sentence punctuation; analyzers
        use the punctuation tokens as clause boundaries for negation scope.
        """
        text = text.lower()
        # URLs and e-mail addresses carry no lexical sentiment — drop them.
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\S+@\S+', '', text)
        # Keep word characters, whitespace and basic punctuation only.
        text = re.sub(r'[^\w\s\.,!?;:()\-\']', '', text)
        # Words, or single punctuation marks, become separate tokens.
        return re.findall(r'\b\w+\b|[.,!?;:()]', text)

    def normalize_turkish(self, text: str) -> str:
        """Replace Turkish-specific letters with ASCII look-alikes."""
        return text.translate(self._TURKISH_TABLE)

    def normalize_persian(self, text: str) -> str:
        """Map Arabic character variants to their canonical Persian forms.

        (The original implementation was a stub that returned *text*
        unchanged despite its documented purpose.)
        """
        return text.translate(self._PERSIAN_TABLE)
|
|
|
|
|
|
|
class LexiconBasedAnalyzer:
    """Lexicon-based sentiment analysis with negation scoping, intensifier
    and diminisher weighting, and idiom matching."""

    # Punctuation that ends a clause and therefore a negation's scope.
    _CLAUSE_BREAKS = ('.', '!', '?', ';', ',')
    # How many preceding tokens a negation word can reach.
    _NEGATION_WINDOW = 4
    # How many preceding tokens an intensifier/diminisher can reach.
    _MODIFIER_WINDOW = 2
    # Flat score contributed by each matched idiom.
    _IDIOM_WEIGHT = 2.0

    def __init__(self, language: str):
        self.language = language
        self.lexicon = SentimentLexicon(language)
        self.preprocessor = TextPreprocessor(language)

    def _check_idioms(self, text: str) -> Tuple[float, float]:
        """Return (positive, negative) scores for idioms found in *text*.

        Idioms are matched as case-insensitive substrings of the raw text,
        each contributing a flat _IDIOM_WEIGHT.
        """
        text_lower = text.lower()
        pos_score = sum(self._IDIOM_WEIGHT
                        for idiom in self.lexicon.idioms_positive
                        if idiom.lower() in text_lower)
        neg_score = sum(self._IDIOM_WEIGHT
                        for idiom in self.lexicon.idioms_negative
                        if idiom.lower() in text_lower)
        return pos_score, neg_score

    def _is_negated(self, tokens: List[str], i: int) -> bool:
        """True if token *i* is preceded, within the negation window and in
        the same clause (no punctuation in between), by a negation word.

        (The original scanned a symmetric window around *i* but only ever
        acted on preceding tokens; only those are scanned now.)
        """
        for j in range(max(0, i - self._NEGATION_WINDOW), i):
            if tokens[j] in self.lexicon.negation_words:
                same_clause = not any(
                    tokens[k] in self._CLAUSE_BREAKS for k in range(j + 1, i)
                )
                if same_clause:
                    return True
        return False

    def _modifier_strength(self, tokens: List[str], i: int) -> float:
        """Combined intensifier/diminisher multiplier for token *i*, taken
        from the preceding tokens (strongest intensifier, strongest
        diminisher; the two multiply together)."""
        intensifier = 1.0
        diminisher = 1.0
        for k in range(max(0, i - self._MODIFIER_WINDOW), i):
            word = tokens[k]
            if word in self.lexicon.intensifiers:
                intensifier = max(intensifier, self.lexicon.intensifiers[word])
            if word in self.lexicon.diminishers:
                diminisher = min(diminisher, self.lexicon.diminishers[word])
        return intensifier * diminisher

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment of *text* using the lexicon-based approach.

        Returns a dict with 'polarity' ('positive'/'negative'/'neutral'),
        'confidence' (winning score's share of the total, 0..1), raw
        positive and negative scores (rounded to 3 places), up to 10
        matched sentiment words as (label, token, negated) tuples, and
        'method'.
        """
        tokens = self.preprocessor.preprocess(text)

        positive_score = 0.0
        negative_score = 0.0
        sentiment_words = []

        # Idioms are matched on the raw text, not the token stream.
        idiom_pos, idiom_neg = self._check_idioms(text)
        positive_score += idiom_pos
        negative_score += idiom_neg

        for i, token in enumerate(tokens):
            is_positive = token in self.lexicon.positive_words
            # A word in both sets counts as positive (matches the original
            # if/elif ordering).
            is_negative = (not is_positive) and token in self.lexicon.negative_words
            if not (is_positive or is_negative):
                continue  # modifier/negation checks only for sentiment words

            score = 1.0 * self._modifier_strength(tokens, i)
            negated = self._is_negated(tokens, i)

            # A preceding negation flips the word's polarity entirely.
            if is_positive != negated:
                positive_score += score
                sentiment_words.append(('positive', token, negated))
            else:
                negative_score += score
                sentiment_words.append(('negative', token, negated))

        total_raw = positive_score + negative_score
        if total_raw == 0:
            polarity = 'neutral'
            confidence = 0.0
        elif positive_score > negative_score:
            polarity = 'positive'
            confidence = positive_score / total_raw
        else:
            # Ties resolve to 'negative' (historical behaviour, preserved).
            polarity = 'negative'
            confidence = negative_score / total_raw

        return {
            'polarity': polarity,
            'confidence': round(confidence, 3),
            'positive_score': round(positive_score, 3),
            'negative_score': round(negative_score, 3),
            'sentiment_words': sentiment_words[:10],
            'method': 'lexicon-based'
        }
|
|
|
|
|
|
|
|
class RuleBasedAnalyzer:
    """Rule-based sentiment analysis layered on top of the lexicon
    analyzer, adding rules for punctuation, capitalization, emoticons,
    contrast words, comparatives, repetition, and sentiment shifters."""

    def __init__(self, language: str):
        self.language = language
        self.lexicon = SentimentLexicon(language)
        self.preprocessor = TextPreprocessor(language)
        # Reused across calls (previously a fresh LexiconBasedAnalyzer —
        # and hence a fresh lexicon load — was built on every analyze()).
        self.base_analyzer = LexiconBasedAnalyzer(language)

    def _detect_emoticons(self, text: str) -> Tuple[float, float]:
        """Return (positive, negative) emoticon/emoji scores, 1.5 per hit.

        URLs are stripped first so that 'http://' is not mis-counted as
        the ':/' sad-face emoticon.  (The original also listed '>:(' twice,
        double-counting every angry face.)
        """
        text = re.sub(r'http\S+|www\S+', '', text)
        pos_score = 0.0
        neg_score = 0.0

        positive_emoticons = [
            ':)', ':-)', '=)', ';)', ';-)', '=D', ':D', ':-D',
            '😊', '😀', '😁', '😂', '🤣', '😃', '😄', '😆', '😍', '🥰',
            '😎', '🤗', '👍', '👏', '🎉', '❤️', '💕', '💖', '💗', '💓'
        ]

        negative_emoticons = [
            ':(', ':-(', '=(', ':/', ':-/', ':|', ':-|', '>:(',
            '😢', '😞', '😠', '😡', '😤', '😭', '😰', '😨', '😱', '😖',
            '😣', '😫', '😩', '👎', '💔', '😒', '😔', '😕', '🙁'
        ]

        # NOTE: counts are substring-based, so overlapping symbols (e.g.
        # ':(' inside '>:(') each contribute — preserved behaviour.
        for symbol in positive_emoticons:
            pos_score += text.count(symbol) * 1.5
        for symbol in negative_emoticons:
            neg_score += text.count(symbol) * 1.5

        return pos_score, neg_score

    def _handle_contrast_words(self, text: str, tokens: List[str],
                               pos_score: float, neg_score: float) -> Tuple[float, float]:
        """Dampen both scores by 30% when any contrast word is present,
        since contrasts usually qualify the stated sentiment.

        *text* is unused but kept for signature compatibility.
        """
        if any(t.lower() in self.lexicon.contrast_words for t in tokens):
            reduction_factor = 0.7
            return pos_score * reduction_factor, neg_score * reduction_factor
        return pos_score, neg_score

    def _detect_comparatives_superlatives(self, tokens: List[str]) -> float:
        """Return an intensity multiplier (>= 1.0): 1.4 for superlatives,
        1.2 for comparatives, 1.0 otherwise (strongest wins)."""
        superlative_indicators = {'most', 'best', 'worst', 'least', 'greatest'}
        comparative_indicators = {'more', 'less', 'better', 'worse', 'greater', 'smaller'}
        multiplier = 1.0
        for token in tokens:
            word = token.lower()
            if word in superlative_indicators:
                multiplier = max(multiplier, 1.4)
            elif word in comparative_indicators:
                multiplier = max(multiplier, 1.2)
        return multiplier

    def _detect_repetition(self, text: str) -> float:
        """Return an emphasis multiplier (capped at 1.5) for elongated
        words ('soooo') and a word repeated three times in a row."""
        multiplier = 1.0
        # Each run of 3+ identical word characters adds 0.1.
        repeated_chars = re.findall(r'(\w)\1{2,}', text.lower())
        if repeated_chars:
            multiplier += len(repeated_chars) * 0.1
        # One flat bonus for the first triple word repetition found.
        words = text.lower().split()
        if len(words) > 2:
            for i in range(len(words) - 2):
                if words[i] == words[i + 1] == words[i + 2]:
                    multiplier += 0.2
                    break
        return min(multiplier, 1.5)

    def _detect_sentiment_shifters(self, text: str) -> float:
        """Return a dampening factor (< 1.0) when a polarity-shifting
        conjunction such as 'but' or 'however' appears as a whole word.

        (The original used substring matching, so 'butter' triggered
        'but'; whole-word matching fixes that.)
        """
        shifters = {
            'but': 0.6, 'however': 0.6, 'although': 0.7, 'though': 0.7,
            'yet': 0.6, 'still': 0.7, 'nevertheless': 0.6, 'nonetheless': 0.6
        }
        text_lower = text.lower()
        for shifter, factor in shifters.items():
            if re.search(rf'\b{shifter}\b', text_lower):
                return factor
        return 1.0

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment using the rule-based approach.

        Starts from the lexicon result, applies multiplicative/additive
        rules to the scores, recomputes polarity and confidence from the
        adjusted scores, then applies the accumulated uncertainty
        penalties to the confidence.  (Previously the penalty rules
        multiplied result['confidence'] in place, and the final
        recomputation overwrote confidence whenever any score was nonzero
        — making every penalty dead code.)
        """
        result = self.base_analyzer.analyze(text)
        tokens = self.preprocessor.preprocess(text)

        # Confidence penalties are collected here and applied after the
        # final polarity recomputation so they are not clobbered.
        confidence_penalty = 1.0

        # Rule 1: exclamation marks amplify both scores (capped at +50%).
        exclamation_count = text.count('!')
        if exclamation_count > 0:
            multiplier = 1 + min(exclamation_count * 0.15, 0.5)
            result['positive_score'] *= multiplier
            result['negative_score'] *= multiplier

        # Rule 2: multiple question marks signal uncertainty.
        question_count = text.count('?')
        if question_count > 1:
            confidence_penalty *= max(0.7, 1 - (question_count * 0.1))

        # Rule 3: ALL-CAPS words (3+ chars) amplify intensity.
        caps_words = [w for w in text.split() if w.isupper() and len(w) > 2]
        if caps_words:
            caps_multiplier = 1 + (len(caps_words) * 0.1)
            result['positive_score'] *= caps_multiplier
            result['negative_score'] *= caps_multiplier

        # Rule 4: emoticons/emojis add flat score.
        emoji_pos, emoji_neg = self._detect_emoticons(text)
        result['positive_score'] += emoji_pos
        result['negative_score'] += emoji_neg

        # Rule 5: contrast words dampen both scores.
        result['positive_score'], result['negative_score'] = self._handle_contrast_words(
            text, tokens, result['positive_score'], result['negative_score']
        )

        # Rule 6: comparatives/superlatives intensify.
        comp_super_mult = self._detect_comparatives_superlatives(tokens)
        result['positive_score'] *= comp_super_mult
        result['negative_score'] *= comp_super_mult

        # Rule 7: character/word repetition signals emphasis.
        rep_mult = self._detect_repetition(text)
        result['positive_score'] *= rep_mult
        result['negative_score'] *= rep_mult

        # Rule 8: shifters ('but', 'however', ...) dampen both scores.
        shifter_factor = self._detect_sentiment_shifters(text)
        if shifter_factor < 1.0:
            result['positive_score'] *= shifter_factor
            result['negative_score'] *= shifter_factor

        # Rule 9: ellipses suggest hesitation.
        if '...' in text or '…' in text:
            confidence_penalty *= 0.9

        # Rule 10: runs of !!/?? amplify both scores.
        multi_punct = re.findall(r'[!?]{2,}', text)
        if multi_punct:
            punct_mult = 1 + (len(multi_punct) * 0.1)
            result['positive_score'] *= punct_mult
            result['negative_score'] *= punct_mult

        # Rule 11: hashtags containing lexicon words add a small bonus.
        for tag in re.findall(r'#\w+', text):
            tag_lower = tag.lower()
            if any(word in tag_lower for word in self.lexicon.positive_words):
                result['positive_score'] += 0.5
            if any(word in tag_lower for word in self.lexicon.negative_words):
                result['negative_score'] += 0.5

        # Rule 12: links slightly reduce confidence (content lives elsewhere).
        if re.search(r'http[s]?://', text):
            confidence_penalty *= 0.95

        # Rule 13: very short or very long texts are less reliable.
        word_count = len(text.split())
        if word_count < 3:
            confidence_penalty *= 0.8
        elif word_count > 100:
            confidence_penalty *= 0.95

        # Recompute polarity/confidence from the adjusted scores, then
        # apply the accumulated uncertainty penalties.
        total = result['positive_score'] + result['negative_score']
        if total > 0:
            if result['positive_score'] > result['negative_score']:
                result['polarity'] = 'positive'
                result['confidence'] = result['positive_score'] / total
            else:
                result['polarity'] = 'negative'
                result['confidence'] = result['negative_score'] / total
        else:
            result['polarity'] = 'neutral'
            result['confidence'] = 0.0
        result['confidence'] = round(result['confidence'] * confidence_penalty, 3)

        result['method'] = 'rule-based'
        return result
|
|
|
|
|
|
|
|
class HybridAnalyzer:
    """Hybrid sentiment analysis: a weighted blend of the lexicon-based
    and rule-based analyzers (rules weigh slightly more)."""

    def __init__(self, language: str):
        self.language = language
        self.lexicon_analyzer = LexiconBasedAnalyzer(language)
        self.rule_analyzer = RuleBasedAnalyzer(language)

    def analyze(self, text: str) -> Dict:
        """Run both analyzers on *text* and blend their scores.

        Returns the blended polarity/confidence/scores plus both
        sub-results under 'lexicon_result' and 'rule_result'.
        """
        lexicon_result = self.lexicon_analyzer.analyze(text)
        rule_result = self.rule_analyzer.analyze(text)

        # Fixed blend weights: the rule analyzer dominates slightly.
        lexicon_weight, rule_weight = 0.4, 0.6

        def blend(key: str) -> float:
            """Weighted average of one score field across both results."""
            return (lexicon_result[key] * lexicon_weight
                    + rule_result[key] * rule_weight)

        combined_positive = blend('positive_score')
        combined_negative = blend('negative_score')
        total = combined_positive + combined_negative

        polarity, confidence = 'neutral', 0.0
        if total:
            if combined_positive > combined_negative:
                polarity, confidence = 'positive', combined_positive / total
            else:
                polarity, confidence = 'negative', combined_negative / total

        return {
            'polarity': polarity,
            'confidence': round(confidence, 3),
            'positive_score': round(combined_positive, 3),
            'negative_score': round(combined_negative, 3),
            'lexicon_result': lexicon_result,
            'rule_result': rule_result,
            'method': 'hybrid'
        }
|
|
|
|
|
|
|
|
class MultilingualSentimentAnalyzer:
    """Main sentiment analyzer supporting multiple languages and methods."""

    def __init__(self, language: str = 'english', method: str = 'hybrid'):
        """
        Initialize sentiment analyzer.

        Args:
            language: 'english', 'turkish', or 'persian'
            method: 'lexicon', 'rule', or 'hybrid' (case-insensitive;
                unrecognized values fall back to 'hybrid')
        """
        self.language = language.lower()
        self.method = method.lower()

        # Compare the lowercased value so 'Lexicon'/'RULE' select the
        # documented analyzer (previously the raw argument was compared,
        # silently falling back to hybrid for non-lowercase input).
        if self.method == 'lexicon':
            self.analyzer = LexiconBasedAnalyzer(self.language)
        elif self.method == 'rule':
            self.analyzer = RuleBasedAnalyzer(self.language)
        else:
            self.analyzer = HybridAnalyzer(self.language)

    def analyze(self, text: str) -> Dict:
        """Analyze sentiment of input text.

        Returns the underlying analyzer's result dict enriched with
        'language', 'text_length' and 'word_count'.  Never raises:
        empty input and analyzer failures are reported via an 'error'
        key with a neutral polarity.
        """
        if not text or not text.strip():
            return {
                'polarity': 'neutral',
                'confidence': 0.0,
                'error': 'Empty text provided'
            }

        try:
            result = self.analyzer.analyze(text)
            result['language'] = self.language
            result['text_length'] = len(text)
            result['word_count'] = len(text.split())
            return result
        except Exception as e:
            # Boundary-level catch: callers get a neutral result carrying
            # the error message instead of a propagated exception.
            return {
                'polarity': 'neutral',
                'confidence': 0.0,
                'error': str(e)
            }

    def analyze_batch(self, texts: List[str]) -> List[Dict]:
        """Analyze multiple texts; one result dict per input text."""
        return [self.analyze(text) for text in texts]

    def get_statistics(self, texts: List[str]) -> Dict:
        """Aggregate polarity distribution and mean confidence for *texts*.

        Safe for an empty input list: counts and percentages are empty
        and the average confidence is 0.
        """
        results = self.analyze_batch(texts)

        polarity_counts = Counter(r['polarity'] for r in results)
        total = len(results)
        avg_confidence = (sum(r.get('confidence', 0) for r in results) / total
                          if total > 0 else 0)

        return {
            'total_texts': total,
            'polarity_distribution': dict(polarity_counts),
            'polarity_percentages': {
                k: round(v / total * 100, 2)
                for k, v in polarity_counts.items()
            },
            'average_confidence': round(avg_confidence, 3)
        }
|
|
|
|
|
|