amirrezaa commited on
Commit
6b1a584
·
verified ·
1 Parent(s): 508f6c2

Upload 7 files

Browse files
README.md CHANGED
@@ -1,13 +1,52 @@
1
  ---
2
- title: Sentiment
3
- emoji: 🔥
4
- colorFrom: indigo
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 6.2.0
8
- app_file: app.py
9
  pinned: false
10
- short_description: A test simple sentiment analyzer for Turkish,English,Persian
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Multilingual Sentiment Analysis
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.0.0
8
+ app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # Multilingual Sentiment Analysis Tool
14
+
15
+ A comprehensive sentiment analysis tool supporting **English**, **Turkish**, and **Persian** languages using non-deep-learning approaches (lexicon-based, rule-based, and hybrid methods).
16
+
17
+ ## Features
18
+
19
+ - 🌍 **Multilingual Support**: English, Turkish, and Persian
20
+ - 🔧 **Multiple Methods**: Lexicon-based, rule-based, and hybrid approaches
21
+ - 📊 **Batch Processing**: Analyze multiple texts at once
22
+ - ✨ **Advanced Rules**:
23
+ - Comprehensive sentiment lexicons (200+ words per language)
24
+ - Idiom detection
25
+ - Emoticon and emoji support
26
+ - Negation scope detection
27
+ - Intensifier and diminisher handling
28
+ - Contrast word detection
29
+ - And much more!
30
+
31
+ ## Usage
32
+
33
+ 1. Select your language (English, Turkish, or Persian)
34
+ 2. Choose analysis method (Lexicon, Rule-based, or Hybrid)
35
+ 3. Enter text and click "Analyze Sentiment"
36
+ 4. View detailed results with polarity, confidence, and scores
37
+
38
+ ## Methods
39
+
40
+ - **Lexicon-based**: Uses predefined sentiment dictionaries
41
+ - **Rule-based**: Extends lexicon with linguistic rules
42
+ - **Hybrid** (Recommended): Combines both approaches for best results
43
+
44
+ ## Citation
45
+
46
+ If you use this tool in your research, please cite:
47
+
48
+ ```
49
+ Multilingual Sentiment Analysis Tool (2024)
50
+ Non-Deep-Learning Approaches for Sentiment Analysis
51
+ ```
52
+
app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio App for Multilingual Sentiment Analysis
3
+ Deploy this to Hugging Face Spaces
4
+ """
5
+
6
+ import gradio as gr
7
+ from sentiment_analyzer import MultilingualSentimentAnalyzer
8
+
9
def analyze_sentiment(text, language, method):
    """Run sentiment analysis on a single text and return a Markdown report.

    Args:
        text: Raw input text from the UI textbox.
        language: One of "english", "turkish", "persian".
        method: One of "lexicon", "rule", "hybrid".

    Returns:
        A Markdown string: a prompt message for empty input, a formatted
        result block on success, or an "Error: ..." string on failure, so
        the Gradio Markdown component always has something to render.
    """
    # Guard clause: whitespace-only input is treated the same as empty.
    if not text or not text.strip():
        return "Please enter some text to analyze."

    try:
        # A fresh analyzer per call keeps the handler stateless.
        engine = MultilingualSentimentAnalyzer(language=language, method=method)
        result = engine.analyze(text)

        # NOTE(review): assumes analyze() returns a dict with these keys;
        # 'word_count' is optional — TODO confirm against sentiment_analyzer.
        return f"""
## Sentiment Analysis Results

**Polarity:** {result['polarity'].upper()}
**Confidence:** {result['confidence']*100:.1f}%

**Scores:**
- Positive: {result['positive_score']:.2f}
- Negative: {result['negative_score']:.2f}

**Details:**
- Method: {result['method']}
- Language: {result['language']}
- Words analyzed: {result.get('word_count', 0)}
"""
    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return f"Error: {str(e)}"
38
+
39
def batch_analyze(texts, language, method):
    """Analyze multiple newline-separated texts and return a Markdown report.

    Args:
        texts: Multi-line string, one text per line; blank lines are skipped.
        language: One of "english", "turkish", "persian".
        method: One of "lexicon", "rule", "hybrid".

    Returns:
        A Markdown string with aggregate statistics, polarity distribution,
        and a per-text result line; a prompt message for empty input; or an
        "Error: ..." string on failure.
    """
    if not texts:
        return "Please enter texts to analyze (one per line)."

    text_list = [t.strip() for t in texts.split('\n') if t.strip()]
    if not text_list:
        return "No valid texts found."

    try:
        analyzer = MultilingualSentimentAnalyzer(language=language, method=method)
        results = analyzer.analyze_batch(text_list)
        stats = analyzer.get_statistics(text_list)

        output = f"""
## Batch Analysis Results

**Statistics:**
- Total texts: {stats['total_texts']}
- Average confidence: {stats['average_confidence']*100:.1f}%

**Polarity Distribution:**
"""
        for polarity, percentage in stats['polarity_percentages'].items():
            output += f"- {polarity.capitalize()}: {percentage}%\n"

        output += "\n**Individual Results:**\n"
        for i, (text, result) in enumerate(zip(text_list, results), 1):
            # Fix: only truncate (and mark with "...") texts longer than 50
            # characters; previously every text got a trailing "...", which
            # falsely suggested truncation of short inputs.
            preview = text if len(text) <= 50 else text[:50] + "..."
            output += f"\n{i}. \"{preview}\" → {result['polarity']} ({result['confidence']*100:.1f}%)\n"

        return output
    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return f"Error: {str(e)}"
72
+
73
# Build the Gradio UI: a single-text tab, a batch tab, and a static
# examples tab, wired to the handler functions above.
with gr.Blocks(title="Multilingual Sentiment Analysis", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🌍 Multilingual Sentiment Analysis Tool

Analyze sentiment in **English**, **Turkish**, and **Persian** text using non-deep-learning approaches.

This tool uses lexicon-based, rule-based, and hybrid methods for interpretable sentiment analysis.
""")

    with gr.Tabs():
        with gr.TabItem("Single Text Analysis"):
            with gr.Row():
                with gr.Column():
                    # Input controls for one text at a time.
                    single_text = gr.Textbox(
                        label="Enter Text",
                        placeholder="Type your text here...",
                        lines=5,
                    )
                    single_language = gr.Dropdown(
                        choices=["english", "turkish", "persian"],
                        value="english",
                        label="Language",
                    )
                    single_method = gr.Dropdown(
                        choices=["lexicon", "rule", "hybrid"],
                        value="hybrid",
                        label="Analysis Method",
                    )
                    single_btn = gr.Button("Analyze Sentiment", variant="primary")

                with gr.Column():
                    single_output = gr.Markdown(label="Results")

            single_btn.click(
                fn=analyze_sentiment,
                inputs=[single_text, single_language, single_method],
                outputs=single_output,
            )

        with gr.TabItem("Batch Analysis"):
            with gr.Row():
                with gr.Column():
                    # Input controls for many texts, one per line.
                    batch_texts = gr.Textbox(
                        label="Enter Texts (one per line)",
                        placeholder="Enter multiple texts, one per line...",
                        lines=10,
                    )
                    batch_language = gr.Dropdown(
                        choices=["english", "turkish", "persian"],
                        value="english",
                        label="Language",
                    )
                    batch_method = gr.Dropdown(
                        choices=["lexicon", "rule", "hybrid"],
                        value="hybrid",
                        label="Analysis Method",
                    )
                    batch_btn = gr.Button("Analyze Batch", variant="primary")

                with gr.Column():
                    batch_output = gr.Markdown(label="Batch Results")

            batch_btn.click(
                fn=batch_analyze,
                inputs=[batch_texts, batch_language, batch_method],
                outputs=batch_output,
            )

        with gr.TabItem("Examples"):
            # Static tab: sample inputs for each supported language.
            gr.Markdown("""
### Example Texts to Try:

**English:**
- "I love this product! It's absolutely amazing!!! 😊"
- "This is terrible. I hate it."
- "Not bad, actually it's quite good!"

**Turkish:**
- "Bu ürünü çok seviyorum! Harika!"
- "Berbat bir deneyim. Hiç beğenmedim."

**Persian:**
- "این محصول عالی است!"
- "خیلی بد بود"
""")

    gr.Markdown("""
---
**About:** This tool uses lexicon-based, rule-based, and hybrid approaches (without deep learning)
for interpretable sentiment analysis. Supports English, Turkish, and Persian languages.
""")

if __name__ == "__main__":
    demo.launch()
168
+
lexicons/english_lexicon.json ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "positive": [
3
+ "good", "great", "excellent", "amazing", "wonderful", "fantastic", "superb", "marvelous",
4
+ "love", "like", "adore", "cherish", "treasure", "appreciate", "enjoy", "delight",
5
+ "best", "perfect", "ideal", "flawless", "impeccable", "outstanding", "remarkable",
6
+ "beautiful", "gorgeous", "stunning", "lovely", "charming", "attractive", "appealing",
7
+ "nice", "pleasant", "agreeable", "satisfying", "pleasing", "gratifying",
8
+ "happy", "joyful", "cheerful", "glad", "pleased", "delighted", "thrilled", "ecstatic",
9
+ "satisfied", "content", "fulfilled", "gratified", "comfortable", "at ease",
10
+ "awesome", "brilliant", "magnificent", "splendid", "fabulous", "terrific", "incredible",
11
+ "successful", "triumphant", "victorious", "winning", "prosperous", "flourishing",
12
+ "optimistic", "hopeful", "confident", "positive", "upbeat", "encouraging",
13
+ "grateful", "thankful", "appreciative", "obliged", "indebted",
14
+ "excited", "enthusiastic", "eager", "passionate", "zealous", "ardent",
15
+ "proud", "honored", "privileged", "blessed", "fortunate", "lucky",
16
+ "impressive", "admirable", "praiseworthy", "commendable", "laudable",
17
+ "refreshing", "invigorating", "energizing", "uplifting", "inspiring",
18
+ "reliable", "trustworthy", "dependable", "solid", "steady", "consistent",
19
+ "valuable", "precious", "worthwhile", "beneficial", "advantageous", "profitable",
20
+ "smooth", "easy", "effortless", "seamless", "convenient", "user-friendly",
21
+ "innovative", "creative", "original", "unique", "distinctive", "special",
22
+ "professional", "expert", "skilled", "competent", "capable", "proficient",
23
+ "helpful", "supportive", "beneficial", "useful", "practical", "effective",
24
+ "clear", "transparent", "honest", "genuine", "authentic", "sincere",
25
+ "peaceful", "calm", "serene", "tranquil", "relaxing", "soothing",
26
+ "fun", "entertaining", "enjoyable", "amusing", "delightful", "pleasurable",
27
+ "fast", "quick", "rapid", "swift", "efficient", "speedy",
28
+ "affordable", "reasonable", "fair", "economical", "budget-friendly",
29
+ "modern", "contemporary", "up-to-date", "current", "fresh", "new",
30
+ "clean", "tidy", "organized", "neat", "orderly", "pristine",
31
+ "safe", "secure", "protected", "reliable", "stable", "sound",
32
+ "flexible", "adaptable", "versatile", "multipurpose", "all-purpose",
33
+ "recommend", "suggest", "endorse", "approve", "support", "back",
34
+ "exceed", "surpass", "outperform", "outshine", "beat", "top"
35
+ ],
36
+ "negative": [
37
+ "bad", "terrible", "awful", "horrible", "dreadful", "atrocious", "appalling",
38
+ "worst", "poorest", "lowest", "inferior", "substandard", "unacceptable",
39
+ "hate", "loathe", "despise", "detest", "abhor", "disgust", "repulse",
40
+ "dislike", "disapprove", "reject", "refuse", "decline", "deny",
41
+ "poor", "inadequate", "insufficient", "deficient", "lacking", "wanting",
42
+ "disappointed", "let down", "disillusioned", "disheartened", "discouraged",
43
+ "sad", "unhappy", "miserable", "depressed", "down", "blue", "gloomy",
44
+ "angry", "mad", "furious", "enraged", "irritated", "annoyed", "upset",
45
+ "frustrated", "exasperated", "aggravated", "bothered", "irked", "vexed",
46
+ "annoying", "irritating", "bothersome", "troublesome", "pesky", "nagging",
47
+ "boring", "tedious", "dull", "monotonous", "repetitive", "tiresome",
48
+ "ugly", "unattractive", "hideous", "repulsive", "revolting", "disgusting",
49
+ "pathetic", "pitiful", "lamentable", "deplorable", "regrettable",
50
+ "miserable", "wretched", "unfortunate", "unlucky", "hapless",
51
+ "depressing", "disheartening", "discouraging", "demoralizing", "daunting",
52
+ "unpleasant", "disagreeable", "offensive", "repugnant", "repellent",
53
+ "disappointing", "unsatisfactory", "inadequate", "subpar", "below expectations",
54
+ "frustrating", "infuriating", "maddening", "exasperating", "aggravating",
55
+ "confusing", "bewildering", "perplexing", "puzzling", "mystifying",
56
+ "difficult", "hard", "challenging", "tough", "arduous", "strenuous",
57
+ "slow", "sluggish", "slack", "delayed", "late", "behind schedule",
58
+ "expensive", "costly", "pricey", "overpriced", "unaffordable", "exorbitant",
59
+ "outdated", "obsolete", "old-fashioned", "archaic", "antiquated",
60
+ "dirty", "filthy", "unclean", "messy", "disorganized", "cluttered",
61
+ "unsafe", "dangerous", "risky", "hazardous", "perilous", "precarious",
62
+ "broken", "damaged", "defective", "faulty", "malfunctioning", "flawed",
63
+ "unreliable", "untrustworthy", "undependable", "inconsistent", "unstable",
64
+ "useless", "worthless", "pointless", "futile", "ineffective", "inefficient",
65
+ "complicated", "complex", "convoluted", "intricate", "elaborate",
66
+ "waste", "squander", "throw away", "lose", "miss", "fail",
67
+ "problem", "issue", "trouble", "difficulty", "hardship", "obstacle",
68
+ "error", "mistake", "fault", "flaw", "defect", "bug",
69
+ "complaint", "grievance", "objection", "protest", "criticism",
70
+ "regret", "remorse", "sorrow", "grief", "anguish", "distress",
71
+ "worry", "concern", "anxiety", "stress", "tension", "pressure",
72
+ "fear", "dread", "terror", "panic", "alarm", "apprehension",
73
+ "pain", "ache", "hurt", "suffering", "agony", "torment",
74
+ "weak", "feeble", "frail", "fragile", "delicate", "vulnerable",
75
+ "stupid", "foolish", "silly", "ridiculous", "absurd", "nonsensical",
76
+ "lazy", "sluggish", "inactive", "idle", "indolent", "lethargic",
77
+ "rude", "impolite", "discourteous", "ill-mannered", "offensive",
78
+ "selfish", "greedy", "self-centered", "egotistical", "narcissistic",
79
+ "dishonest", "deceptive", "misleading", "fraudulent", "deceitful",
80
+ "unfair", "unjust", "biased", "prejudiced", "discriminatory",
81
+ "reject", "refuse", "decline", "deny", "dismiss", "turn down"
82
+ ],
83
+ "intensifiers": {
84
+ "very": 1.5,
85
+ "extremely": 2.0,
86
+ "really": 1.3,
87
+ "quite": 1.2,
88
+ "too": 1.4,
89
+ "so": 1.3,
90
+ "absolutely": 1.8,
91
+ "completely": 1.5,
92
+ "totally": 1.6,
93
+ "incredibly": 1.7,
94
+ "amazingly": 1.6,
95
+ "exceptionally": 1.7,
96
+ "particularly": 1.4,
97
+ "especially": 1.4,
98
+ "highly": 1.6,
99
+ "greatly": 1.5,
100
+ "significantly": 1.5,
101
+ "substantially": 1.5,
102
+ "considerably": 1.4,
103
+ "tremendously": 1.8,
104
+ "immensely": 1.7,
105
+ "enormously": 1.7,
106
+ "hugely": 1.6,
107
+ "massively": 1.6,
108
+ "dramatically": 1.6,
109
+ "remarkably": 1.6,
110
+ "extraordinarily": 1.8,
111
+ "unbelievably": 1.7,
112
+ "incredibly": 1.7,
113
+ "surprisingly": 1.4,
114
+ "unusually": 1.4,
115
+ "remarkably": 1.6,
116
+ "deeply": 1.5,
117
+ "profoundly": 1.6,
118
+ "thoroughly": 1.5,
119
+ "utterly": 1.7,
120
+ "entirely": 1.5,
121
+ "fully": 1.4,
122
+ "perfectly": 1.6,
123
+ "purely": 1.4,
124
+ "simply": 1.3,
125
+ "just": 1.2,
126
+ "even": 1.2,
127
+ "more": 1.3,
128
+ "most": 1.5,
129
+ "much": 1.4,
130
+ "many": 1.3,
131
+ "most": 1.5,
132
+ "super": 1.5,
133
+ "mega": 1.6,
134
+ "ultra": 1.7,
135
+ "hyper": 1.6,
136
+ "overly": 1.4,
137
+ "excessively": 1.5,
138
+ "intensely": 1.6,
139
+ "severely": 1.5,
140
+ "badly": 1.4,
141
+ "terribly": 1.6,
142
+ "awfully": 1.6,
143
+ "horribly": 1.6,
144
+ "dreadfully": 1.6
145
+ },
146
+ "negation": [
147
+ "not", "no", "never", "none", "nobody", "nothing", "nowhere",
148
+ "neither", "cannot", "can't", "won't", "don't", "doesn't",
149
+ "didn't", "isn't", "aren't", "wasn't", "weren't", "hasn't",
150
+ "haven't", "hadn't", "wouldn't", "couldn't", "shouldn't",
151
+ "mustn't", "mightn't", "mayn't", "shan't", "ain't",
152
+ "without", "lacking", "missing", "absent", "devoid",
153
+ "neither", "nor", "nobody", "nowhere", "nothing",
154
+ "nowhere", "nevermore", "nohow", "nowise", "noways",
155
+ "barely", "hardly", "scarcely", "rarely", "seldom",
156
+ "little", "few", "less", "least", "minimal",
157
+ "refuse", "reject", "deny", "decline", "dismiss"
158
+ ],
159
+ "diminishers": {
160
+ "slightly": 0.7,
161
+ "somewhat": 0.8,
162
+ "a bit": 0.7,
163
+ "a little": 0.7,
164
+ "kind of": 0.8,
165
+ "sort of": 0.8,
166
+ "rather": 0.9,
167
+ "pretty": 0.9,
168
+ "fairly": 0.9,
169
+ "relatively": 0.85,
170
+ "moderately": 0.85,
171
+ "reasonably": 0.9,
172
+ "barely": 0.6,
173
+ "hardly": 0.6,
174
+ "scarcely": 0.6,
175
+ "almost": 0.8,
176
+ "nearly": 0.8,
177
+ "partially": 0.7,
178
+ "partly": 0.7
179
+ },
180
+ "contrast_words": [
181
+ "but", "however", "although", "though", "yet", "still",
182
+ "nevertheless", "nonetheless", "despite", "in spite of",
183
+ "whereas", "while", "on the other hand", "conversely"
184
+ ],
185
+ "idioms_positive": [
186
+ "over the moon", "on cloud nine", "thrilled to bits", "tickled pink",
187
+ "walking on air", "in seventh heaven", "feeling great", "top notch",
188
+ "second to none", "head and shoulders above", "out of this world",
189
+ "worth its weight in gold", "the bee's knees", "the cat's pajamas"
190
+ ],
191
+ "idioms_negative": [
192
+ "down in the dumps", "feeling blue", "under the weather", "out of sorts",
193
+ "at the end of one's rope", "at wit's end", "in a pickle", "in hot water",
194
+ "the last straw", "the final nail in the coffin", "hit rock bottom",
195
+ "go from bad to worse", "go downhill", "go to pieces", "fall apart"
196
+ ]
197
+ }
lexicons/persian_lexicon.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "positive": [
3
+ "خوب", "عالی", "عالیه", "ممتاز", "برجسته", "فوق‌العاده",
4
+ "دوست دارم", "خوشم میاد", "پسندیدم", "علاقه دارم", "عاشق", "محبوب",
5
+ "زیبا", "قشنگ", "خوب", "نیک", "خوب", "عالی",
6
+ "موفق", "کامیاب", "پیروز", "فاتح", "برنده", "کامیاب",
7
+ "راضی", "خوشحال", "شاد", "خوش", "مسرور", "خوشنود",
8
+ "لذت بخش", "خوشایند", "مطلوب", "مثبت", "امیدوار", "خوش بین",
9
+ "راضی کننده", "قانع کننده", "رضایت بخش", "خوشایند",
10
+ "ممتاز", "عالی", "برجسته", "فوق‌العاده", "استثنایی",
11
+ "خوب", "نیک", "خوب", "عالی", "ممتاز", "برجسته",
12
+ "زیبا", "قشنگ", "خوب", "نیک", "خوب", "عالی",
13
+ "خوشحال", "شاد", "خوش", "مسرور", "خوشنود", "شادمان",
14
+ "مفید", "سودمند", "کارآمد", "عملی", "موثر", "کاربردی",
15
+ "با کیفیت", "مرغوب", "عالی", "برتر", "عالی", "ممتاز",
16
+ "سریع", "تند", "چابک", "عاجل", "فوری", "سریع",
17
+ "ارزان", "اقتصادی", "مقرون به صرفه", "مناسب", "جذاب",
18
+ "تمیز", "پاک", "منظم", "مرتب", "منظم", "پاکیزه",
19
+ "امن", "ایمن", "مطمئن", "قابل اعتماد", "پایدار", "مستحکم",
20
+ "راحت", "آسوده", "آرام", "ساکت", "آرام", "آرامش",
21
+ "سرگرم کننده", "جالب", "خوشایند", "لذت بخش", "خوش", "شاد",
22
+ "جدید", "نو", "مدرن", "معاصر", "جاری", "تازه",
23
+ "آسان", "ساده", "راحت", "قابل فهم", "روشن", "واضح",
24
+ "توصیه می‌کنم", "پیشنهاد می‌کنم", "توصیه می‌کنم", "پیشنهاد می‌دهم"
25
+ ],
26
+ "negative": [
27
+ "بد", "زشت", "ناگوار", "ناخوشایند", "نفرت", "نپسندیدم",
28
+ "بد", "بد", "بد", "زشت", "ناگوار", "ناخوشایند",
29
+ "نپسندیدم", "خوشم نیامد", "دوست ندارم", "نفرت دارم", "متنفرم",
30
+ "غمگین", "عصبانی", "ناراحت", "ناامید", "مایوس", "دلگیر",
31
+ "ناامیدی", "یأس", "ناامیدی", "نومیدی", "ناامیدی",
32
+ "خسته کننده", "کسل کننده", "خسته‌کننده", "ملال‌آور", "خسته‌کننده",
33
+ "ناگوار", "ناخوشایند", "ناپسند", "منفور", "متنفر",
34
+ "ناراضی", "غمگین", "اندوهگین", "غمناک", "دردناک", "اندوهناک",
35
+ "غم", "اندوه", "درد", "رنج", "الم", "غم", "اندوه",
36
+ "عصبانیت", "خشم", "غضب", "خشم", "عصبانیت", "خشم",
37
+ "ناراحت", "ناخوشنود", "ناراضی", "ناراضی", "ناراضی",
38
+ "کند", "آهسته", "دیر", "تاخیر", "تنبل", "کند",
39
+ "گران", "پرهزینه", "گران", "گران قیمت", "پرهزینه",
40
+ "کثیف", "آلوده", "ناپاک", "کثیف", "آلوده", "ناپاک",
41
+ "خطرناک", "ریسکی", "مضر", "خطرناک", "ناامن", "خطرناک",
42
+ "خراب", "معیوب", "ناقص", "ناقص", "ناقص", "ناقص",
43
+ "غیر قابل اعتماد", "غیر قابل اعتماد", "ناپایدار", "ناپایدار", "نامطمئن",
44
+ "بی‌فایده", "بی‌فایده", "بی‌فایده", "بی‌معنی", "بی‌معنی",
45
+ "پیچیده", "مشکل", "نامفهوم", "مبهم", "مبهم", "مبهم",
46
+ "شکایت", "درد", "مشکل", "مسئله", "ناراحتی", "درد",
47
+ "اشتباه", "خطا", "نقص", "کمبود", "کمبود", "نقص",
48
+ "پشیمانی", "ندامت", "غم", "اندوه", "درد", "الم",
49
+ "نگرانی", "اضطراب", "استرس", "فشار", "تنش", "نگرانی",
50
+ "ترس", "وحشت", "هراس", "هشدار", "نگرانی", "اضطراب",
51
+ "درد", "الم", "رنج", "الم", "الم", "رنج",
52
+ "ضعیف", "ناتوان", "ضعیف", "شکننده", "حساس", "شکننده",
53
+ "احمق", "نادان", "احمق", "بی‌معنی", "بی‌معنی", "بی‌منطق",
54
+ "تنبل", "کند", "بیکار", "خالی", "بی‌حرکت", "ساکن",
55
+ "بی‌ادب", "ناسزاگو", "زخم‌زبان", "ناعادلانه", "ناعادلانه",
56
+ "خودخواه", "حریص", "خودخواه", "خودخواه", "خودخواه",
57
+ "نادرست", "فریبنده", "گمراه‌کننده", "جعلی", "دروغگو",
58
+ "ناعادلانه", "ناعادلانه", "متعصب", "تبعیض‌آمیز", "نابرابر",
59
+ "رد", "انکار", "نفی", "رد", "نپذیرفتن"
60
+ ],
61
+ "intensifiers": {
62
+ "خیلی": 1.5,
63
+ "بسیار": 1.6,
64
+ "فوق العاده": 2.0,
65
+ "کاملا": 1.8,
66
+ "و��قعا": 1.3,
67
+ "نسبتا": 1.2,
68
+ "زیاد": 1.4,
69
+ "تمام": 1.5,
70
+ "حتما": 1.7,
71
+ "شدیدا": 1.6,
72
+ "به شدت": 1.7,
73
+ "بسیار زیاد": 1.8,
74
+ "خیلی زیاد": 1.7,
75
+ "بسیار": 1.6,
76
+ "به کرات": 1.5,
77
+ "بیش از حد": 1.6,
78
+ "به طور کامل": 1.5,
79
+ "کاملا": 1.8,
80
+ "تماما": 1.5,
81
+ "به طور کامل": 1.5
82
+ },
83
+ "negation": [
84
+ "نیست", "نیست", "نه", "هیچ", "هیچ وقت", "هرگز",
85
+ "نمی", "نمی‌کنم", "نکردم", "نخواهم کرد",
86
+ "نمی‌خواهم", "نپسندیدم", "خوشم نمی‌آید",
87
+ "نیستم", "نیستی", "نیست", "نیستیم", "نیستید", "نیستند",
88
+ "نیست", "نیست", "نیست", "نیست", "نیست", "نیست",
89
+ "نه", "نیست", "نیست", "نیست", "هرگز", "هیچ",
90
+ "نه...نه", "نه هم", "هیچ", "هیچ وقت", "هرگز"
91
+ ],
92
+ "diminishers": {
93
+ "کمی": 0.7,
94
+ "کم": 0.6,
95
+ "کمی": 0.7,
96
+ "کوچک": 0.7,
97
+ "سبک": 0.8,
98
+ "نسبتا": 0.85,
99
+ "نسبتا": 0.9,
100
+ "تقریبا": 0.8,
101
+ "تقریبا": 0.8,
102
+ "تا حدی": 0.7,
103
+ "جزئی": 0.7
104
+ },
105
+ "contrast_words": [
106
+ "اما", "ولی", "لیکن", "با این حال", "با این وجود",
107
+ "با این حال", "با این وجود", "با این حال",
108
+ "در حالی که", "در حالی که", "در حالی که", "بر خلاف", "بر خلاف"
109
+ ]
110
+ }
lexicons/turkish_lexicon.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "positive": [
3
+ "iyi", "güzel", "harika", "mükemmel", "muhteşem", "süper", "müthiş", "olağanüstü",
4
+ "seviyorum", "beğendim", "beğeniyorum", "hoşlanıyorum", "sevdim", "aşığım",
5
+ "hoş", "güzel", "şahane", "nefis", "leziz", "tatlı", "sevimli", "hoş",
6
+ "başarılı", "başarılı", "başarı", "zafer", "galibiyet", "kazanç",
7
+ "memnun", "mutlu", "sevinçli", "neşeli", "keyifli", "zevkli", "hoşnut",
8
+ "tatmin", "memnuniyet", "beğeni", "hoşnutluk", "razı", "kabul",
9
+ "övgü", "takdir", "alkış", "bravo", "aferin", "tebrik", "kutlama",
10
+ "mükemmel", "kusursuz", "mükemmeliyet", "mükemmellik", "mükemmel",
11
+ "harika", "muhteşem", "olağanüstü", "fevkalade", "sıra dışı",
12
+ "güzel", "hoş", "şirin", "sevimli", "çekici", "cazibeli", "alımlı",
13
+ "mutlu", "sevinçli", "neşeli", "keyifli", "şen", "sevinç dolu",
14
+ "başarılı", "başarılı", "başarılı", "başarılı", "başarılı",
15
+ "faydalı", "yararlı", "kullanışlı", "pratik", "etkili", "verimli",
16
+ "kaliteli", "nitelikli", "üstün", "yüksek kalite", "premium",
17
+ "hızlı", "çabuk", "süratli", "tez", "ivedi", "acele",
18
+ "ucuz", "ekonomik", "uygun fiyatlı", "makul", "cazip",
19
+ "temiz", "düzenli", "tertipli", "derli toplu", "düzenli",
20
+ "güvenli", "emniyetli", "güvenilir", "sağlam", "istikrarlı",
21
+ "rahat", "konforlu", "huzurlu", "sakin", "dingin", "sükunetli",
22
+ "eğlenceli", "keyifli", "zevkli", "hoş", "neşeli", "şen",
23
+ "yeni", "modern", "çağdaş", "güncel", "aktüel", "fresh",
24
+ "kolay", "basit", "sade", "anlaşılır", "açık", "net",
25
+ "öneriyorum", "tavsiye ediyorum", "öneririm", "tavsiye ederim"
26
+ ],
27
+ "negative": [
28
+ "kötü", "berbat", "çirkin", "iğrenç", "nefret", "beğenmedim", "hoşlanmadım",
29
+ "kötü", "fena", "berbat", "rezil", "korkunç", "dehşet", "felaket",
30
+ "beğenmedim", "hoşlanmadım", "sevmedim", "nefret ediyorum", "tiksinme",
31
+ "üzgün", "kızgın", "sinirli", "öfkeli", "hiddetli", "kızgın",
32
+ "hayal kırıklığı", "hayal kırıklığı", "umutsuzluk", "çaresizlik",
33
+ "can sıkıcı", "sıkıcı", "bıktırıcı", "usandırıcı", "bezdirici",
34
+ "tatsız", "hoş olmayan", "nahoş", "itici", "tiksindirici",
35
+ "mutsuz", "üzgün", "kederli", "hüzünlü", "acılı", "elemli",
36
+ "üzüntü", "keder", "acı", "elem", "hüzün", "gam", "kaygı",
37
+ "kızgınlık", "öfke", "sinir", "hiddet", "gazap", "kızgınlık",
38
+ "rahatsız", "hoşnutsuz", "memnuniyetsiz", "beğenmeme", "razı olmama",
39
+ "yavaş", "ağır", "gecikmiş", "gecikmeli", "tembel", "atıl",
40
+ "pahalı", "masraflı", "maliyetli", "pahalı", "yüksek fiyatlı",
41
+ "kirli", "pis", "murdar", "kirli", "pasaklı", "dağınık",
42
+ "tehlikeli", "riskli", "zararlı", "tehlikeli", "güvensiz",
43
+ "bozuk", "arızalı", "hatalı", "kusurlu", "eksik", "noksan",
44
+ "güvensiz", "güvenilmez", "istikrarsız", "kararsız", "belirsiz",
45
+ "işe yaramaz", "faydasız", "yararsız", "boş", "anlamsız",
46
+ "karmaşık", "karışık", "anlaşılmaz", "belirsiz", "muğlak",
47
+ "şikayet", "dert", "sorun", "problem", "sıkıntı", "dert",
48
+ "hata", "yanlış", "kusur", "eksiklik", "noksanlık", "arız",
49
+ "pişmanlık", "nedamet", "üzüntü", "keder", "acı", "elem",
50
+ "endişe", "kaygı", "stres", "baskı", "gerilim", "tedirginlik",
51
+ "korku", "dehşet", "panik", "alarm", "endişe", "kaygı",
52
+ "ağrı", "acı", "sancı", "sızı", "elem", "ızdırap",
53
+ "zayıf", "güçsüz", "cılız", "narin", "hassas", "kırılgan",
54
+ "aptal", "ahmak", "budala", "saçma", "anlamsız", "mantıksız",
55
+ "tembel", "atalet", "işsiz", "boş", "hareketsiz", "durgun",
56
+ "kaba", "nezaketsiz", "kırıcı", "incitici", "haksız", "adil olmayan",
57
+ "bencil", "açgözlü", "kendini düşünen", "egoist", "narsist",
58
+ "dürüst olmayan", "aldatıcı", "yanıltıcı", "sahte", "yalancı",
59
+ "haksız", "adil olmayan", "önyargılı", "ayrımcı", "eşitsiz",
60
+ "reddet", "ret", "inkar", "yadsıma", "kabul etmeme"
61
+ ],
62
+ "intensifiers": {
63
+ "çok": 1.5,
64
+ "çok fazla": 1.6,
65
+ "aşırı": 2.0,
66
+ "son derece": 1.8,
67
+ "gerçekten": 1.3,
68
+ "oldukça": 1.2,
69
+ "fazlasıyla": 1.4,
70
+ "tamamen": 1.5,
71
+ "kesinlikle": 1.7,
72
+ "müthiş": 1.6,
73
+ "fevkalade": 1.8,
74
+ "olağanüstü": 1.9,
75
+ "son derece": 1.8,
76
+ "hayli": 1.4,
77
+ "epey": 1.3,
78
+ "bir hayli": 1.4,
79
+ "oldukça": 1.2,
80
+ "epeyce": 1.3,
81
+ "hayli": 1.4,
82
+ "daha": 1.3,
83
+ "en": 1.5,
84
+ "pek": 1.4,
85
+ "gayet": 1.3,
86
+ "iyice": 1.4,
87
+ "iyiden iyiye": 1.5
88
+ },
89
+ "negation": [
90
+ "değil", "yok", "hayır", "hiç", "hiçbir", "hiçbir şey",
91
+ "hiçbir zaman", "asla", "bir daha", "olmaz", "olmayacak",
92
+ "yapmam", "yapmıyorum", "yapmadım", "yapmayacağım",
93
+ "istemiyorum", "beğenmedim", "hoşlanmıyorum",
94
+ "değilim", "değilsin", "değil", "değiliz", "değilsiniz", "değiller",
95
+ "yok", "yoktur", "yoktur", "yok", "yok", "yok",
96
+ "hayır", "olmaz", "yok", "değil", "asla", "hiç",
97
+ "ne...ne", "ne de", "hiç de", "hiç değil", "asla değil"
98
+ ],
99
+ "diminishers": {
100
+ "biraz": 0.7,
101
+ "az": 0.6,
102
+ "birazcık": 0.7,
103
+ "küçük": 0.7,
104
+ "hafif": 0.8,
105
+ "nispeten": 0.85,
106
+ "oldukça": 0.9,
107
+ "epey": 0.9,
108
+ "hayli": 0.9,
109
+ "neredeyse": 0.8,
110
+ "hemen hemen": 0.8,
111
+ "kısmen": 0.7,
112
+ "kısmi": 0.7
113
+ },
114
+ "contrast_words": [
115
+ "ama", "fakat", "lakin", "ancak", "yalnız", "sadece",
116
+ "buna rağmen", "yine de", "gene de", "bununla birlikte",
117
+ "oysa", "oysaki", "halbuki", "buna karşın", "bunun aksine"
118
+ ]
119
+ }
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio>=4.0.0
2
+
sentiment_analyzer.py ADDED
@@ -0,0 +1,555 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Multilingual Sentiment Analysis Tool
3
+ Supports Turkish, Persian, and English using lexicon-based and machine learning approaches
4
+ """
5
+
6
+ import re
7
+ import json
8
+ import os
9
+ from typing import Dict, List, Tuple, Optional
10
+ from collections import Counter
11
+ import math
12
+
13
+
14
class SentimentLexicon:
    """Sentiment vocabulary for one language.

    Holds positive/negative word sets, intensifier and diminisher weight
    maps, negation and contrast word sets, and idiom lists. The data is
    loaded from ``lexicons/<language>_lexicon.json`` when available,
    otherwise a small built-in English default is used.
    """

    def __init__(self, language: str):
        self.language = language
        # Word sets used for O(1) membership tests during analysis.
        self.positive_words = set()
        self.negative_words = set()
        # word -> multiplier weight (e.g. "very": 1.5).
        self.intensifiers = {}
        self.negation_words = set()
        # word -> dampening weight (e.g. "slightly": 0.7).
        self.diminishers = {}
        self.contrast_words = set()
        # Multi-word phrases matched by substring search.
        self.idioms_positive = []
        self.idioms_negative = []
        self._load_lexicon()

    def _load_lexicon(self):
        """Load the language-specific lexicon JSON, falling back to defaults.

        Fix: the original resolved ``lexicons/...`` relative to the current
        working directory, so launching the app from any other directory
        silently fell back to the default English word list regardless of
        the selected language. The path is now resolved against this
        module's directory first, keeping the old CWD-relative lookup as a
        backward-compatible fallback.
        """
        filename = f"{self.language}_lexicon.json"
        candidates = [
            os.path.join(os.path.dirname(os.path.abspath(__file__)), "lexicons", filename),
            os.path.join("lexicons", filename),  # original CWD-relative behavior
        ]
        for lexicon_file in candidates:
            if os.path.exists(lexicon_file):
                with open(lexicon_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                self.positive_words = set(data.get('positive', []))
                self.negative_words = set(data.get('negative', []))
                self.intensifiers = data.get('intensifiers', {})
                self.negation_words = set(data.get('negation', []))
                self.diminishers = data.get('diminishers', {})
                self.contrast_words = set(data.get('contrast_words', []))
                self.idioms_positive = data.get('idioms_positive', [])
                self.idioms_negative = data.get('idioms_negative', [])
                return
        # No lexicon file found for this language: use built-in English words.
        self._load_default_english()

    def _load_default_english(self):
        """Populate a minimal built-in English lexicon (last-resort fallback)."""
        self.positive_words = {
            'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic',
            'love', 'like', 'best', 'perfect', 'beautiful', 'nice', 'happy',
            'pleased', 'satisfied', 'awesome', 'brilliant', 'outstanding',
        }
        self.negative_words = {
            'bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'dislike',
            'poor', 'disappointed', 'sad', 'angry', 'frustrated', 'annoying',
            'boring', 'ugly', 'disgusting', 'pathetic',
        }
        self.intensifiers = {
            'very': 1.5, 'extremely': 2.0, 'really': 1.3, 'quite': 1.2,
            'too': 1.4, 'so': 1.3, 'absolutely': 1.8, 'completely': 1.5,
        }
        self.negation_words = {
            'not', 'no', 'never', 'none', 'nobody', 'nothing', 'nowhere',
            'neither', 'cannot', "can't", "won't", "don't", "doesn't",
        }
        # The default lexicon carries no weighted diminishers, contrast
        # words, or idioms — only the JSON lexicons provide those.
        self.diminishers = {}
        self.contrast_words = set()
        self.idioms_positive = []
        self.idioms_negative = []
71
+
72
+
73
class TextPreprocessor:
    """Language-aware text cleanup and tokenization helpers."""

    def __init__(self, language: str):
        self.language = language

    def preprocess(self, text: str) -> List[str]:
        """Lowercase, strip URLs/emails/odd symbols, and tokenize *text*.

        Returns a list of word tokens plus the punctuation marks that are
        kept because they matter for sentiment (e.g. "!", "?").
        """
        cleaned = text.lower()
        # Remove URLs first, then email addresses.
        cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)
        cleaned = re.sub(r'\S+@\S+', '', cleaned)
        # Drop everything except word characters, whitespace, and
        # sentiment-relevant punctuation.
        cleaned = re.sub(r'[^\w\s\.,!?;:()\-\']', '', cleaned)
        # Words and retained punctuation become separate tokens.
        return re.findall(r'\b\w+\b|[.,!?;:()]', cleaned)

    def normalize_turkish(self, text: str) -> str:
        """Map Turkish-specific letters to their ASCII look-alikes."""
        # Single-pass character translation (same mapping as per-char replace).
        table = str.maketrans('ıİğĞüÜşŞöÖçÇ', 'iIgGuUsSoOcC')
        return text.translate(table)

    def normalize_persian(self, text: str) -> str:
        """Normalize Persian text (placeholder; returns input unchanged).

        A complete implementation would unify Arabic/Persian character
        variants — this simplified version is intentionally a no-op.
        """
        return text
118
+
119
+
120
class LexiconBasedAnalyzer:
    """Sentiment scoring driven purely by the language lexicon.

    Combines word-level polarity lookups with idiom matching, negation
    scope detection, and intensifier/diminisher weighting.
    """

    def __init__(self, language: str):
        self.language = language
        self.lexicon = SentimentLexicon(language)
        self.preprocessor = TextPreprocessor(language)

    def _check_idioms(self, text: str) -> Tuple[float, float]:
        """Return (positive, negative) idiom scores found in *text*.

        Idioms carry stronger sentiment than single words, so each
        matching phrase contributes 2.0 points.
        """
        lowered = text.lower()
        pos = 2.0 * sum(1 for phrase in self.lexicon.idioms_positive
                        if phrase.lower() in lowered)
        neg = 2.0 * sum(1 for phrase in self.lexicon.idioms_negative
                        if phrase.lower() in lowered)
        return pos, neg

    def _is_negated(self, tokens: List[str], idx: int, window: int) -> bool:
        """True when a negation word precedes ``tokens[idx]`` within
        *window* tokens and no clause punctuation breaks the scope."""
        clause_breaks = {'.', '!', '?', ';', ','}
        for j in range(max(0, idx - window), idx):
            if tokens[j] in self.lexicon.negation_words:
                if not any(t in clause_breaks for t in tokens[j + 1:idx]):
                    return True
        return False

    def analyze(self, text: str) -> Dict:
        """Score *text* and return a lexicon-based sentiment summary."""
        tokens = self.preprocessor.preprocess(text)
        window = 4  # context window for negation scope

        # Idiom phrases seed the two score buckets.
        pos_total, neg_total = self._check_idioms(text)
        found_words = []

        for idx, word in enumerate(tokens):
            hit_positive = word in self.lexicon.positive_words
            hit_negative = word in self.lexicon.negative_words
            if not (hit_positive or hit_negative):
                continue

            # Weight from an intensifier/diminisher immediately before the word.
            weight = 1.0
            prev = tokens[idx - 1] if idx > 0 else None
            if prev in self.lexicon.intensifiers:
                weight *= self.lexicon.intensifiers[prev]
            if prev in self.lexicon.diminishers:
                weight *= self.lexicon.diminishers[prev]

            negated = self._is_negated(tokens, idx, window)

            # Negation flips which bucket the word's weight lands in
            # (positive-word precedence when a word is in both sets).
            if hit_positive != negated if hit_positive or not hit_negative else False:
                pass  # placeholder never used; see below
            lands_positive = (hit_positive and not negated) or (not hit_positive and negated)
            if lands_positive:
                pos_total += weight
                found_words.append(('positive', word, negated))
            else:
                neg_total += weight
                found_words.append(('negative', word, negated))

        combined = pos_total + neg_total
        if combined == 0:
            label, certainty = 'neutral', 0.0
        elif pos_total > neg_total:
            label, certainty = 'positive', pos_total / combined
        else:
            label, certainty = 'negative', neg_total / combined

        return {
            'polarity': label,
            'confidence': round(certainty, 3),
            'positive_score': round(pos_total, 3),
            'negative_score': round(neg_total, 3),
            'sentiment_words': found_words[:10],  # cap the detail list
            'method': 'lexicon-based'
        }
231
+
232
+
233
class RuleBasedAnalyzer:
    """Rule-based sentiment analysis layered on top of the lexicon.

    Takes the lexicon-based result as a baseline and applies
    punctuation, emoticon, contrast, emphasis, and context heuristics
    (rules 1-13) on top of it.
    """

    def __init__(self, language: str):
        self.language = language
        self.lexicon = SentimentLexicon(language)
        self.preprocessor = TextPreprocessor(language)
        # Build the base analyzer once; the original constructed a fresh
        # LexiconBasedAnalyzer (re-reading the lexicon file) per call.
        self._base_analyzer = LexiconBasedAnalyzer(language)

    def _detect_emoticons(self, text: str) -> Tuple[float, float]:
        """Count emoticons/emoji and return (positive, negative) scores.

        Each occurrence contributes 1.5 points. ASCII variants can
        overlap (':(' is a substring of '>:('), so such faces score on
        both patterns — unchanged from the original behavior.
        """
        positive_emoticons = [
            ':)', ':-)', '=)', ';)', ';-)', '=D', ':D', ':-D',
            '😊', '😀', '😁', '😂', '🤣', '😃', '😄', '😆', '😍', '🥰',
            '😎', '🤗', '👍', '👏', '🎉', '❤️', '💕', '💖', '💗', '💓'
        ]
        # BUG FIX: '>:(' was listed twice, double-counting each
        # occurrence (3.0 points instead of 1.5).
        negative_emoticons = [
            ':(', ':-(', '=(', ':/', ':-/', ':|', ':-|', '>:(',
            '😢', '😞', '😠', '😡', '😤', '😭', '😰', '😨', '😱', '😖',
            '😣', '😫', '😩', '👎', '💔', '😒', '😔', '😕', '🙁'
        ]
        pos_score = 1.5 * sum(text.count(e) for e in positive_emoticons)
        neg_score = 1.5 * sum(text.count(e) for e in negative_emoticons)
        return pos_score, neg_score

    def _handle_contrast_words(self, text: str, tokens: List[str],
                               pos_score: float, neg_score: float) -> Tuple[float, float]:
        """Reduce both scores by 30% when a contrast word is present.

        *text* is unused but kept for interface stability. The simple
        heuristic: a contrast word ('but', 'ama', ...) weakens the
        sentiment expressed around it.
        """
        has_contrast = any(token.lower() in self.lexicon.contrast_words
                           for token in tokens)
        if has_contrast:
            reduction_factor = 0.7
            return pos_score * reduction_factor, neg_score * reduction_factor
        return pos_score, neg_score

    def _detect_comparatives_superlatives(self, tokens: List[str]) -> float:
        """Return a multiplier (>= 1.0) when comparative/superlative
        forms intensify the sentiment (superlatives 1.4x, comparatives 1.2x)."""
        multiplier = 1.0

        superlative_indicators = ['most', 'best', 'worst', 'least', 'greatest']
        for token in tokens:
            if token.lower() in superlative_indicators:
                multiplier = max(multiplier, 1.4)

        comparative_patterns = ['more', 'less', 'better', 'worse', 'greater', 'smaller']
        for token in tokens:
            if token.lower() in comparative_patterns:
                multiplier = max(multiplier, 1.2)

        return multiplier

    def _detect_repetition(self, text: str) -> float:
        """Return an emphasis multiplier for repeated characters
        ("soooo good") or repeated words ("good good good"), capped at 1.5x."""
        multiplier = 1.0

        # Characters repeated 3+ times in a row.
        repeated_chars = re.findall(r'(\w)\1{2,}', text.lower())
        if repeated_chars:
            multiplier += len(repeated_chars) * 0.1

        # The same word three times in a row (first run only).
        words = text.lower().split()
        if len(words) > 2:
            for i in range(len(words) - 2):
                if words[i] == words[i + 1] == words[i + 2]:
                    multiplier += 0.2
                    break

        return min(multiplier, 1.5)

    def _detect_sentiment_shifters(self, text: str) -> float:
        """Return a dampening factor (< 1.0) when a sentiment-shifting
        conjunction ('but', 'however', ...) is present; 1.0 otherwise."""
        shifters = {
            'but': 0.6, 'however': 0.6, 'although': 0.7, 'though': 0.7,
            'yet': 0.6, 'still': 0.7, 'nevertheless': 0.6, 'nonetheless': 0.6
        }

        text_lower = text.lower()
        for shifter, factor in shifters.items():
            if shifter in text_lower:
                return factor

        return 1.0

    def analyze(self, text: str) -> Dict:
        """Analyze *text* with the full rule set.

        Returns the lexicon-based result dict with its scores adjusted
        by rules 1-11 and its confidence dampened by rules 2/9/12/13.
        """
        result = self._base_analyzer.analyze(text)
        tokens = self.preprocessor.preprocess(text)

        # BUG FIX: the original multiplied result['confidence'] inside
        # rules 2/9/12/13 but then recomputed confidence from the scores
        # at the end, silently discarding those adjustments. We now
        # accumulate the dampening and apply it after the recalculation.
        confidence_factor = 1.0

        # Rule 1: exclamation marks strengthen sentiment (capped at +50%).
        exclamation_count = text.count('!')
        if exclamation_count > 0:
            boost = 1 + min(exclamation_count * 0.15, 0.5)
            result['positive_score'] *= boost
            result['negative_score'] *= boost

        # Rule 2: repeated question marks suggest uncertainty or sarcasm.
        question_count = text.count('?')
        if question_count > 1:
            confidence_factor *= max(0.7, 1 - (question_count * 0.1))

        # Rule 3: ALL-CAPS words (longer than 2 chars) add emphasis.
        caps_words = [w for w in text.split() if w.isupper() and len(w) > 2]
        if len(caps_words) > 0:
            caps_multiplier = 1 + (len(caps_words) * 0.1)
            result['positive_score'] *= caps_multiplier
            result['negative_score'] *= caps_multiplier

        # Rule 4: emoticons and emoji.
        emoji_pos, emoji_neg = self._detect_emoticons(text)
        result['positive_score'] += emoji_pos
        result['negative_score'] += emoji_neg

        # Rule 5: contrast words reduce the weight of surrounding sentiment.
        result['positive_score'], result['negative_score'] = self._handle_contrast_words(
            text, tokens, result['positive_score'], result['negative_score']
        )

        # Rule 6: comparatives/superlatives intensify sentiment.
        comp_super_mult = self._detect_comparatives_superlatives(tokens)
        result['positive_score'] *= comp_super_mult
        result['negative_score'] *= comp_super_mult

        # Rule 7: character/word repetition signals emphasis.
        rep_mult = self._detect_repetition(text)
        result['positive_score'] *= rep_mult
        result['negative_score'] *= rep_mult

        # Rule 8: sentiment shifters ('but', 'however', ...) soften both sides.
        shifter_factor = self._detect_sentiment_shifters(text)
        if shifter_factor < 1.0:
            result['positive_score'] *= shifter_factor
            result['negative_score'] *= shifter_factor

        # Rule 9: ellipsis hints at uncertainty or trailing off.
        if '...' in text or '…' in text:
            confidence_factor *= 0.9

        # Rule 10: runs of '!'/'?' ("!!!", "?!?") add emphasis.
        multi_punct = re.findall(r'[!?]{2,}', text)
        if multi_punct:
            punct_mult = 1 + (len(multi_punct) * 0.1)
            result['positive_score'] *= punct_mult
            result['negative_score'] *= punct_mult

        # Rule 11: hashtags containing lexicon words carry sentiment.
        for tag in re.findall(r'#\w+', text):
            tag_lower = tag.lower()
            if any(word in tag_lower for word in self.lexicon.positive_words):
                result['positive_score'] += 0.5
            if any(word in tag_lower for word in self.lexicon.negative_words):
                result['negative_score'] += 0.5

        # Rule 12: URLs suggest less personal (possibly spam) content.
        if re.search(r'http[s]?://', text):
            confidence_factor *= 0.95

        # Rule 13: very short or very long texts are less reliable.
        word_count = len(text.split())
        if word_count < 3:
            confidence_factor *= 0.8
        elif word_count > 100:
            confidence_factor *= 0.95

        # Recompute polarity from the adjusted scores, then apply the
        # accumulated confidence dampening (previously dead code).
        total = result['positive_score'] + result['negative_score']
        if total > 0:
            if result['positive_score'] > result['negative_score']:
                result['polarity'] = 'positive'
                confidence = result['positive_score'] / total
            else:
                result['polarity'] = 'negative'
                confidence = result['negative_score'] / total
            result['confidence'] = round(confidence * confidence_factor, 3)
        else:
            result['polarity'] = 'neutral'
            result['confidence'] = 0.0

        result['method'] = 'rule-based'
        return result
443
+
444
+
445
class HybridAnalyzer:
    """Blend the lexicon-based and rule-based analyzers into one verdict."""

    def __init__(self, language: str):
        self.language = language
        self.lexicon_analyzer = LexiconBasedAnalyzer(language)
        self.rule_analyzer = RuleBasedAnalyzer(language)

    def analyze(self, text: str) -> Dict:
        """Run both analyzers on *text* and merge their weighted scores."""
        lex = self.lexicon_analyzer.analyze(text)
        rules = self.rule_analyzer.analyze(text)

        # The rule-based output is trusted slightly more (0.6 vs 0.4)
        # because it already incorporates the lexicon signal.
        lex_w, rule_w = 0.4, 0.6
        merged_pos = lex['positive_score'] * lex_w + rules['positive_score'] * rule_w
        merged_neg = lex['negative_score'] * lex_w + rules['negative_score'] * rule_w

        overall = merged_pos + merged_neg
        if overall == 0:
            verdict, certainty = 'neutral', 0.0
        elif merged_pos > merged_neg:
            verdict, certainty = 'positive', merged_pos / overall
        else:
            verdict, certainty = 'negative', merged_neg / overall

        return {
            'polarity': verdict,
            'confidence': round(certainty, 3),
            'positive_score': round(merged_pos, 3),
            'negative_score': round(merged_neg, 3),
            'lexicon_result': lex,    # full per-method breakdown kept
            'rule_result': rules,     # for transparency/debugging
            'method': 'hybrid'
        }
488
+
489
+
490
class MultilingualSentimentAnalyzer:
    """Main sentiment analyzer supporting multiple languages and methods."""

    def __init__(self, language: str = 'english', method: str = 'hybrid'):
        """
        Initialize sentiment analyzer.

        Args:
            language: 'english', 'turkish', or 'persian' (case-insensitive)
            method: 'lexicon', 'rule', or 'hybrid' (case-insensitive);
                unknown values fall back to 'hybrid'
        """
        self.language = language.lower()
        self.method = method.lower()

        # BUG FIX: the original dispatched on the raw ``method`` argument
        # even though it stored the lowercased value, so inputs like
        # 'Lexicon' or 'RULE' silently fell through to the hybrid analyzer.
        if self.method == 'lexicon':
            self.analyzer = LexiconBasedAnalyzer(self.language)
        elif self.method == 'rule':
            self.analyzer = RuleBasedAnalyzer(self.language)
        else:  # hybrid (default)
            self.analyzer = HybridAnalyzer(self.language)

    def analyze(self, text: str) -> Dict:
        """Analyze one text; never raises — errors are reported in the dict."""
        if not text or not text.strip():
            return {
                'polarity': 'neutral',
                'confidence': 0.0,
                'error': 'Empty text provided'
            }

        try:
            result = self.analyzer.analyze(text)
            result['language'] = self.language
            result['text_length'] = len(text)
            result['word_count'] = len(text.split())
            return result
        except Exception as e:
            # Degrade gracefully: callers always receive a well-formed dict.
            return {
                'polarity': 'neutral',
                'confidence': 0.0,
                'error': str(e)
            }

    def analyze_batch(self, texts: List[str]) -> List[Dict]:
        """Analyze multiple texts, returning one result dict per text."""
        return [self.analyze(text) for text in texts]

    def get_statistics(self, texts: List[str]) -> Dict:
        """Aggregate polarity distribution and average confidence over *texts*."""
        results = self.analyze_batch(texts)

        polarity_counts = Counter(r['polarity'] for r in results)
        total = len(results)

        # Guard against an empty batch (avoids division by zero).
        avg_confidence = (
            sum(r.get('confidence', 0) for r in results) / total if total > 0 else 0
        )

        return {
            'total_texts': total,
            'polarity_distribution': dict(polarity_counts),
            'polarity_percentages': {
                k: round(v / total * 100, 2)
                for k, v in polarity_counts.items()
            },
            'average_confidence': round(avg_confidence, 3)
        }
555
+