chariscait committed on
Commit
82d2d36
·
verified ·
1 Parent(s): 2170a8d

Add Greek/Spanish/French/German multilingual keywords + non-English keyword priority

Browse files
Files changed (1) hide show
  1. text_detector.py +112 -4
text_detector.py CHANGED
@@ -137,6 +137,99 @@ KEYWORDS: dict[EmotionLabel, list[str]] = {
137
  ],
138
  }
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  # Emoji patterns
141
  EMOJI_PATTERNS: dict[EmotionLabel, re.Pattern] = {
142
  EmotionLabel.JOY: re.compile(r'[\U0001F600-\U0001F606\U0001F609\U0001F60A\U0001F60B\U0001F60E\U0001F929\U0001F973\U0001F389\U0001F38A]'),
@@ -188,11 +281,16 @@ class TextEmotionDetector:
188
  scores: dict[EmotionLabel, float] = {label: 0.0 for label in EMOTION_LABELS}
189
  scores[EmotionLabel.NEUTRAL] = 0.08 # baseline
190
 
191
- # Keyword matching
192
  for label, keywords in KEYWORDS.items():
193
  count = sum(1 for kw in keywords if kw in lower)
194
  scores[label] += count * 0.12
195
 
 
 
 
 
 
196
  # Emoji matching
197
  for label, pattern in EMOJI_PATTERNS.items():
198
  matches = pattern.findall(text)
@@ -272,9 +370,19 @@ class TextEmotionDetector:
272
  # Blend with keyword analysis for robustness
273
  if self.model_type == "transformer" and text.strip():
274
  kw_scores = self._keyword_analysis(text)
275
- # 80% model, 20% keywords for better emoji/punctuation handling
276
- for label in EMOTION_LABELS:
277
- scores[label] = scores[label] * 0.8 + kw_scores[label] * 0.2
 
 
 
 
 
 
 
 
 
 
278
  total = sum(scores.values())
279
  if total > 0:
280
  scores = {k: v / total for k, v in scores.items()}
 
137
  ],
138
  }
139
 
140
+ # ── Multilingual Keywords (Greek, Spanish, French; German for JOY only) ─────
141
+ MULTILINGUAL_KEYWORDS: dict[EmotionLabel, list[str]] = {
142
+ EmotionLabel.JOY: [
143
+ # Greek
144
+ "χαρά", "χαρούμενος", "χαρούμενη", "ευτυχισμένος", "ευτυχισμένη",
145
+ "ευτυχία", "χαίρομαι", "υπέροχα", "τέλεια", "φανταστικά", "γέλιο",
146
+ "γελάω", "χαμογελώ", "χαμόγελο", "ωραία", "εξαιρετικά",
147
+ # Spanish
148
+ "feliz", "alegre", "contento", "maravilloso", "genial", "risa",
149
+ # French
150
+ "heureux", "heureuse", "joie", "magnifique", "formidable",
151
+ # German
152
+ "glücklich", "froh", "wunderbar", "fantastisch", "freude",
153
+ ],
154
+ EmotionLabel.SADNESS: [
155
+ # Greek
156
+ "λυπημένος", "λυπημένη", "λύπη", "στεναχωρημένος", "στεναχώρια",
157
+ "κλαίω", "δάκρυα", "πόνος", "μοναξιά", "μόνος", "μόνη",
158
+ "θλίψη", "απογοητευμένος", "δυστυχισμένος", "απελπισία",
159
+ # Spanish
160
+ "triste", "tristeza", "llorar", "dolor", "soledad",
161
+ # French
162
+ "triste", "tristesse", "pleurer", "douleur", "chagrin",
163
+ ],
164
+ EmotionLabel.SURPRISE: [
165
+ # Greek
166
+ "έκπληξη", "εκπληκτικό", "εκπληκτικός", "εκπληκτική", "εκπλήσσομαι",
167
+ "απίστευτο", "αναπάντεχο", "ξαφνικά", "δεν το περίμενα", "σοκ",
168
+ "εντυπωσιακό", "παράξενο", "εκπληκτη",
169
+ # Spanish
170
+ "sorpresa", "sorprendido", "increíble", "inesperado",
171
+ # French
172
+ "surprise", "surpris", "incroyable", "inattendu",
173
+ ],
174
+ EmotionLabel.FEAR: [
175
+ # Greek
176
+ "φόβος", "φοβάμαι", "τρομαγμένος", "τρομαγμένη", "ανησυχία",
177
+ "ανήσυχος", "αγχωμένος", "άγχος", "πανικός", "τρόμος",
178
+ "φοβερό", "ανησυχώ", "στρες",
179
+ # Spanish
180
+ "miedo", "asustado", "nervioso", "ansiedad", "pánico",
181
+ # French
182
+ "peur", "effrayé", "anxieux", "angoisse", "panique",
183
+ ],
184
+ EmotionLabel.ANGER: [
185
+ # Greek
186
+ "θυμός", "θυμωμένος", "θυμωμένη", "εκνευρισμένος", "εκνευρισμένη",
187
+ "οργή", "εξοργισμένος", "νεύρα", "μίσος", "μισώ",
188
+ "αγανακτισμένος", "εξαγριωμένος", "τσαντίλα",
189
+ # Spanish
190
+ "enojado", "furioso", "rabia", "odio", "ira",
191
+ # French
192
+ "colère", "furieux", "enragé", "haine", "irrité",
193
+ ],
194
+ EmotionLabel.DISGUST: [
195
+ # Greek
196
+ "αηδία", "αηδιαστικό", "αποκρουστικό", "φρικτό", "απαίσιο",
197
+ "σιχαμερό", "αρρωστημένο", "χάλια",
198
+ # Spanish
199
+ "asco", "asqueroso", "repugnante", "horrible",
200
+ # French
201
+ "dégoût", "dégoûtant", "horrible", "répugnant",
202
+ ],
203
+ EmotionLabel.LOVE: [
204
+ # Greek
205
+ "αγάπη", "αγαπώ", "αγαπημένος", "αγαπημένη", "ερωτευμένος",
206
+ "ερωτευμένη", "τρυφερότητα", "αγκαλιά", "φιλί", "καρδιά",
207
+ "λατρεύω", "στοργή", "αφοσίωση",
208
+ # Spanish
209
+ "amor", "te quiero", "cariño", "corazón", "ternura",
210
+ # French
211
+ "amour", "aimer", "tendresse", "coeur", "chéri",
212
+ ],
213
+ EmotionLabel.CALM: [
214
+ # Greek
215
+ "ηρεμία", "ήρεμος", "ήρεμη", "χαλαρός", "χαλαρή",
216
+ "γαλήνη", "ήσυχος", "ειρηνικός", "ξεκούραση", "ψυχραιμία",
217
+ # Spanish
218
+ "calma", "tranquilo", "relajado", "sereno", "paz",
219
+ # French
220
+ "calme", "tranquille", "détendu", "serein", "paix",
221
+ ],
222
+ EmotionLabel.NEUTRAL: [
223
+ # Greek
224
+ "εντάξει", "μια χαρά", "κανονικά", "συνήθως", "απλά",
225
+ "τίποτα", "ουδέτερο",
226
+ # Spanish
227
+ "bien", "normal", "regular",
228
+ # French
229
+ "bien", "normal", "ordinaire",
230
+ ],
231
+ }
232
+
233
  # Emoji patterns
234
  EMOJI_PATTERNS: dict[EmotionLabel, re.Pattern] = {
235
  EmotionLabel.JOY: re.compile(r'[\U0001F600-\U0001F606\U0001F609\U0001F60A\U0001F60B\U0001F60E\U0001F929\U0001F973\U0001F389\U0001F38A]'),
 
281
  scores: dict[EmotionLabel, float] = {label: 0.0 for label in EMOTION_LABELS}
282
  scores[EmotionLabel.NEUTRAL] = 0.08 # baseline
283
 
284
+ # Keyword matching (English)
285
  for label, keywords in KEYWORDS.items():
286
  count = sum(1 for kw in keywords if kw in lower)
287
  scores[label] += count * 0.12
288
 
289
+ # Multilingual keyword matching (Greek, Spanish, French; German for JOY only)
290
+ for label, keywords in MULTILINGUAL_KEYWORDS.items():
291
+ count = sum(1 for kw in keywords if kw in lower)
292
+ scores[label] += count * 0.15 # slightly higher weight than the 0.12 used for English keywords; note this is a substring match, not an exact-word match
293
+
294
  # Emoji matching
295
  for label, pattern in EMOJI_PATTERNS.items():
296
  matches = pattern.findall(text)
 
370
  # Blend with keyword analysis for robustness
371
  if self.model_type == "transformer" and text.strip():
372
  kw_scores = self._keyword_analysis(text)
373
+ # Detect whether the text is mostly non-Latin script (Greek, Arabic, Chinese, etc.): more than 30% of its letters lie above U+024F
374
+ non_latin_chars = sum(1 for c in text if ord(c) > 0x024F and c.isalpha())
375
+ total_alpha = sum(1 for c in text if c.isalpha()) or 1
376
+ is_non_english = (non_latin_chars / total_alpha) > 0.3
377
+
378
+ if is_non_english:
379
+ # For mostly non-Latin-script text: 30% model, 70% keywords (model is English-only)
380
+ for label in EMOTION_LABELS:
381
+ scores[label] = scores[label] * 0.3 + kw_scores[label] * 0.7
382
+ else:
383
+ # For Latin-script text (English, but also Spanish/French/German, which the script check cannot tell apart): 75% model, 25% keywords
384
+ for label in EMOTION_LABELS:
385
+ scores[label] = scores[label] * 0.75 + kw_scores[label] * 0.25
386
  total = sum(scores.values())
387
  if total > 0:
388
  scores = {k: v / total for k, v in scores.items()}