Spaces:
Running
Running
Add Greek/Spanish/French/German multilingual keywords + non-English keyword priority
Browse files- text_detector.py +112 -4
text_detector.py
CHANGED
|
@@ -137,6 +137,99 @@ KEYWORDS: dict[EmotionLabel, list[str]] = {
|
|
| 137 |
],
|
| 138 |
}
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
# Emoji patterns
|
| 141 |
EMOJI_PATTERNS: dict[EmotionLabel, re.Pattern] = {
|
| 142 |
EmotionLabel.JOY: re.compile(r'[\U0001F600-\U0001F606\U0001F609\U0001F60A\U0001F60B\U0001F60E\U0001F929\U0001F973\U0001F389\U0001F38A]'),
|
|
@@ -188,11 +281,16 @@ class TextEmotionDetector:
|
|
| 188 |
scores: dict[EmotionLabel, float] = {label: 0.0 for label in EMOTION_LABELS}
|
| 189 |
scores[EmotionLabel.NEUTRAL] = 0.08 # baseline
|
| 190 |
|
| 191 |
-
# Keyword matching
|
| 192 |
for label, keywords in KEYWORDS.items():
|
| 193 |
count = sum(1 for kw in keywords if kw in lower)
|
| 194 |
scores[label] += count * 0.12
|
| 195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
# Emoji matching
|
| 197 |
for label, pattern in EMOJI_PATTERNS.items():
|
| 198 |
matches = pattern.findall(text)
|
|
@@ -272,9 +370,19 @@ class TextEmotionDetector:
|
|
| 272 |
# Blend with keyword analysis for robustness
|
| 273 |
if self.model_type == "transformer" and text.strip():
|
| 274 |
kw_scores = self._keyword_analysis(text)
|
| 275 |
-
#
|
| 276 |
-
for
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
total = sum(scores.values())
|
| 279 |
if total > 0:
|
| 280 |
scores = {k: v / total for k, v in scores.items()}
|
|
|
|
| 137 |
],
|
| 138 |
}
|
| 139 |
|
| 140 |
+
# ── Multilingual Keywords (Greek, Spanish, French, German, etc.) ─────
|
| 141 |
+
# Non-English emotion keywords, grouped per label and then per language.
#
# The scoring loop in this file counts one hit for every list entry that
# occurs as a substring of the text (`kw in lower`), so:
#   * every entry must be unique within its list -- a repeated entry would
#     score the same word twice.  Words spelled identically in Spanish and
#     French ("triste", "horrible", "bien", "normal") are therefore listed
#     only once, under the first language that uses them;
#   * entries are written in lower case (presumably `lower` is the
#     lower-cased input -- confirm against the scoring method).
#
# NOTE(review): German entries currently exist only under JOY; the other
# labels cover Greek, Spanish and French only.
MULTILINGUAL_KEYWORDS: dict[EmotionLabel, list[str]] = {
    EmotionLabel.JOY: [
        # Greek
        "χαρά", "χαρούμενος", "χαρούμενη", "ευτυχισμένος", "ευτυχισμένη",
        "ευτυχία", "χαίρομαι", "υπέροχα", "τέλεια", "φανταστικά", "γέλιο",
        "γελάω", "χαμογελώ", "χαμόγελο", "ωραία", "εξαιρετικά",
        # Spanish
        "feliz", "alegre", "contento", "maravilloso", "genial", "risa",
        # French
        "heureux", "heureuse", "joie", "magnifique", "formidable",
        # German
        "glücklich", "froh", "wunderbar", "fantastisch", "freude",
    ],
    EmotionLabel.SADNESS: [
        # Greek
        "λυπημένος", "λυπημένη", "λύπη", "στεναχωρημένος", "στεναχώρια",
        "κλαίω", "δάκρυα", "πόνος", "μοναξιά", "μόνος", "μόνη",
        "θλίψη", "απογοητευμένος", "δυστυχισμένος", "απελπισία",
        # Spanish ("triste" is also French; listed once to avoid a double hit)
        "triste", "tristeza", "llorar", "dolor", "soledad",
        # French
        "tristesse", "pleurer", "douleur", "chagrin",
    ],
    EmotionLabel.SURPRISE: [
        # Greek
        "έκπληξη", "εκπληκτικό", "εκπληκτικός", "εκπληκτική", "εκπλήσσομαι",
        "απίστευτο", "αναπάντεχο", "ξαφνικά", "δεν το περίμενα", "σοκ",
        "εντυπωσιακό", "παράξενο",
        # "έκπληκτη" is the correctly accented spelling; the unaccented
        # variant is kept so text that previously matched still matches.
        "έκπληκτη", "εκπληκτη",
        # Spanish
        "sorpresa", "sorprendido", "increíble", "inesperado",
        # French
        "surprise", "surpris", "incroyable", "inattendu",
    ],
    EmotionLabel.FEAR: [
        # Greek
        "φόβος", "φοβάμαι", "τρομαγμένος", "τρομαγμένη", "ανησυχία",
        "ανήσυχος", "αγχωμένος", "άγχος", "πανικός", "τρόμος",
        "φοβερό", "ανησυχώ", "στρες",
        # Spanish
        "miedo", "asustado", "nervioso", "ansiedad", "pánico",
        # French
        "peur", "effrayé", "anxieux", "angoisse", "panique",
    ],
    EmotionLabel.ANGER: [
        # Greek
        "θυμός", "θυμωμένος", "θυμωμένη", "εκνευρισμένος", "εκνευρισμένη",
        "οργή", "εξοργισμένος", "νεύρα", "μίσος", "μισώ",
        "αγανακτισμένος", "εξαγριωμένος", "τσαντίλα",
        # Spanish
        "enojado", "furioso", "rabia", "odio", "ira",
        # French
        "colère", "furieux", "enragé", "haine", "irrité",
    ],
    EmotionLabel.DISGUST: [
        # Greek
        "αηδία", "αηδιαστικό", "αποκρουστικό", "φρικτό", "απαίσιο",
        "σιχαμερό", "αρρωστημένο", "χάλια",
        # Spanish ("horrible" is also French; listed once to avoid a double hit)
        "asco", "asqueroso", "repugnante", "horrible",
        # French
        "dégoût", "dégoûtant", "répugnant",
    ],
    EmotionLabel.LOVE: [
        # Greek
        "αγάπη", "αγαπώ", "αγαπημένος", "αγαπημένη", "ερωτευμένος",
        "ερωτευμένη", "τρυφερότητα", "αγκαλιά", "φιλί", "καρδιά",
        "λατρεύω", "στοργή", "αφοσίωση",
        # Spanish
        "amor", "te quiero", "cariño", "corazón", "ternura",
        # French
        "amour", "aimer", "tendresse", "coeur", "chéri",
    ],
    EmotionLabel.CALM: [
        # Greek
        "ηρεμία", "ήρεμος", "ήρεμη", "χαλαρός", "χαλαρή",
        "γαλήνη", "ήσυχος", "ειρηνικός", "ξεκούραση", "ψυχραιμία",
        # Spanish
        "calma", "tranquilo", "relajado", "sereno", "paz",
        # French
        "calme", "tranquille", "détendu", "serein", "paix",
    ],
    EmotionLabel.NEUTRAL: [
        # Greek
        "εντάξει", "μια χαρά", "κανονικά", "συνήθως", "απλά",
        "τίποτα", "ουδέτερο",
        # Spanish ("bien" and "normal" are also French; listed once to
        # avoid a double hit)
        "bien", "normal", "regular",
        # French
        "ordinaire",
    ],
}
|
| 232 |
+
|
| 233 |
# Emoji patterns
|
| 234 |
EMOJI_PATTERNS: dict[EmotionLabel, re.Pattern] = {
|
| 235 |
EmotionLabel.JOY: re.compile(r'[\U0001F600-\U0001F606\U0001F609\U0001F60A\U0001F60B\U0001F60E\U0001F929\U0001F973\U0001F389\U0001F38A]'),
|
|
|
|
| 281 |
scores: dict[EmotionLabel, float] = {label: 0.0 for label in EMOTION_LABELS}
|
| 282 |
scores[EmotionLabel.NEUTRAL] = 0.08 # baseline
|
| 283 |
|
| 284 |
+
# Keyword matching (English)
|
| 285 |
for label, keywords in KEYWORDS.items():
|
| 286 |
count = sum(1 for kw in keywords if kw in lower)
|
| 287 |
scores[label] += count * 0.12
|
| 288 |
|
| 289 |
+
# Multilingual keyword matching (Greek, Spanish, French, German, etc.)
|
| 290 |
+
for label, keywords in MULTILINGUAL_KEYWORDS.items():
|
| 291 |
+
count = sum(1 for kw in keywords if kw in lower)
|
| 292 |
+
scores[label] += count * 0.15 # slightly higher weight for exact multilingual match
|
| 293 |
+
|
| 294 |
# Emoji matching
|
| 295 |
for label, pattern in EMOJI_PATTERNS.items():
|
| 296 |
matches = pattern.findall(text)
|
|
|
|
| 370 |
# Blend with keyword analysis for robustness
|
| 371 |
if self.model_type == "transformer" and text.strip():
|
| 372 |
kw_scores = self._keyword_analysis(text)
|
| 373 |
+
# Detect if text is non-Latin (Greek, Arabic, Chinese, etc.)
|
| 374 |
+
non_latin_chars = sum(1 for c in text if ord(c) > 0x024F and c.isalpha())
|
| 375 |
+
total_alpha = sum(1 for c in text if c.isalpha()) or 1
|
| 376 |
+
is_non_english = (non_latin_chars / total_alpha) > 0.3
|
| 377 |
+
|
| 378 |
+
if is_non_english:
|
| 379 |
+
# For non-Latin-script text (e.g. Greek; more than 30% of letters above U+024F): 30% model, 70% keywords (model is English-only)
|
| 380 |
+
for label in EMOTION_LABELS:
|
| 381 |
+
scores[label] = scores[label] * 0.3 + kw_scores[label] * 0.7
|
| 382 |
+
else:
|
| 383 |
+
# For Latin-script text (English, but also Spanish/French/German, which the check above does not flag): 75% model, 25% keywords
|
| 384 |
+
for label in EMOTION_LABELS:
|
| 385 |
+
scores[label] = scores[label] * 0.75 + kw_scores[label] * 0.25
|
| 386 |
total = sum(scores.values())
|
| 387 |
if total > 0:
|
| 388 |
scores = {k: v / total for k, v in scores.items()}
|