Spaces:
Sleeping
Sleeping
File size: 18,210 Bytes
c024705 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 |
"""
Professional Multilingual Chatbot Translation Service
Supports English, French, Kiswahili, and Kinyarwanda
Features:
- Automatic language detection from user input
- Exclusively responds in the detected language
- Uses GoogleTranslator from deep_translator for accurate translation
- Maintains natural tone, accuracy, and clarity in all supported languages
"""
import logging
import re
from typing import Dict, List, Optional, Tuple

from deep_translator import GoogleTranslator
from langdetect import detect, detect_langs, DetectorFactory

# Optional, higher-quality detectors/translators
try:
    import langid
    # Lightweight, fast language id
except Exception:  # pragma: no cover
    langid = None
try:
    import pycld3
    # Google Compact Language Detector v3
except Exception:  # pragma: no cover
    pycld3 = None
# Set seed for consistent language detection: with a fixed seed, detecting the
# same text repeatedly gives the same answer.
DetectorFactory.seed = 0
class TranslationService:
    """Detect the language of a user message and translate English replies into it.

    Supported language codes are ``'en'``, ``'fr'``, ``'sw'`` (Kiswahili) and
    ``'rw'`` (Kinyarwanda).  Detection first scores hand-written keyword
    patterns and only falls back to ``langdetect`` when those patterns are
    silent or ambiguous.  Translation goes through
    ``deep_translator.GoogleTranslator`` followed by light per-language
    post-processing.

    Instance attributes:
        translator: A ``GoogleTranslator`` instance, or ``None`` when it could
            not be created.  Only its truthiness is used: ``None`` disables
            translation and makes ``translate_text`` return its input.
        language_codes: Mapping of language names to two-letter codes.
        supported_languages: List of the codes this service answers in.
        rw_glossary: ``(regex, replacement)`` pairs applied to Kinyarwanda
            output by ``normalize_kinyarwanda``.
    """

    # Diagnostics go through ``logging`` rather than ``print`` so the host
    # application decides where (and whether) they are emitted.
    _logger = logging.getLogger(__name__)

    # Language returned whenever detection cannot decide.
    _DEFAULT_LANGUAGE = 'en'

    # Keyword patterns scored by ``_detect_by_patterns``.  Each pattern that
    # matches adds one point to its language, so a word listed in several
    # patterns deliberately weighs more.  Tokens that are also everyday English
    # words ('we' for Kinyarwanda; 'car' and 'mental' for French) are left out
    # on purpose: they made plain English sentences score as rw/fr.
    _KEYWORD_PATTERNS = {
        'rw': (
            r'\b(muraho|murakaza|murabe|murakoze|mwiriwe|mwaramutse|murakaza neza|muraho rwose|muraho neza)\b',
            r'\b(ndabizi|ntabwo|ndabishaka|ndabishimira|ndabishimye|ndabishimye cyane|ndumva)\b',
            r'\b(umunsi|umunsi mwiza|umunsi mubi|ejo|ejo hazaza|ejo hashize|uyu munsi)\b',
            r'\b(amahoro|amahoro yose|amahoro yanyu|amahoro yanjye)\b',
            r'\b(ubwoba|ubwoba bubabaje|ubwoba bunyuma|ubwoba bwinshi|umutwe|umereye|nabi)\b',
            r'\b(umutima|umutima wanjye|umutima wanyu|umutima wanjye)\b',
            r'\b(ubuzima|ubuzima bwiza|ubuzima bubi|ubuzima bwinshi)\b',
            r'\b(nshaka|ntabwo|ndabizi|ndabishimira|ndabishimye|ndumva|ndabishimye)\b',
            r'\b(jewe|wewe|twebwe|mwebwe|bo)\b',
            r'\b(murakoze|murakoze cyane|murakoze cane|murakoze rwose)\b',
            r"\b(ntabwo|ntabwo bimeze|ntabwo bimeze nk'uko)\b",
            r'\b(umutwe|umereye|nabi|ndumva|cyane|rwose|neza)\b',
        ),
        'fr': (
            r'\b(bonjour|bonsoir|salut|bonne journée|bonne soirée)\b',
            r'\b(merci|merci beaucoup|merci bien|de rien)\b',
            r'\b(comment allez-vous|comment ça va|ça va bien|ça va mal)\b',
            r'\b(je suis|je vais|je peux|je veux|je dois|je fais)\b',
            r'\b(très bien|très mal|pas mal|comme ci comme ça|ça va)\b',
            r'\b(anxieux|anxieuse|déprimé|déprimée|stressé|stressée)\b',
            r"\b(depuis|pendant|maintenant|hier|demain|aujourd'hui)\b",
            r'\b(problème|difficulté|souci|inquiétude|santé mentale)\b',
            r'\b(santé|psychologique|émotionnel|psychologue)\b',
            r'\b(avec|sans|pour|dans|sur|sous|entre|parmi)\b',
            r'\b(et|ou|mais|donc|ni|puis)\b',
        ),
        'sw': (
            r'\b(hujambo|hamjambo|habari|habari yako|habari za asubuhi|habari za mchana)\b',
            r'\b(asante|asante sana|karibu|pole|pole sana|pole kwa ajili)\b',
            r'\b(sijambo|hajambo|hatujambo|hamjambo|hawajambo)\b',
            r'\b(mimi|wewe|yeye|sisi|nyinyi|wao)\b',
            r'\b(nina|una|ana|tuna|mna|wana|niko|uko|ako|tuko|mko|wako)\b',
            r'\b(shida|matatizo|huzuni|furaha|wasiwasi|msongo wa mawazo)\b',
            r'\b(afya ya akili|moyo|roho|hisia|mawazo)\b',
            r'\b(rafiki|mpenzi|mama|baba|mtoto|mzee|mke|mume)\b',
            r'\b(leo|jana|kesho|sasa|zamani|baadaye)\b',
            r'\b(naomba|tafadhali|samahani|pole|pole sana)\b',
        ),
        'en': (
            r'\b(hello|hi|hey|good morning|good afternoon|good evening)\b',
            r'\b(thank you|thanks|please|sorry|excuse me)\b',
            r"\b(i am|i'm|i have|i can|i will|i would)\b",
            r'\b(help|support|assistance|mental health|anxiety|depression)\b',
            r'\b(how are you|how do you|what is|where is|when is)\b',
        ),
    }

    # Compiled once at class creation instead of on every detection call.
    _LANGUAGE_PATTERNS = {
        lang: tuple(re.compile(raw) for raw in raws)
        for lang, raws in _KEYWORD_PATTERNS.items()
    }

    # Tokens checked by the ``_has_strong_*_tokens`` helpers.
    _STRONG_TOKENS = {
        'rw': (
            'muraho', 'mwiriwe', 'mwaramutse', 'murakoze', 'ndumva',
            'ubwoba', 'umutwe', 'umereye', 'nabi', 'amahoro', 'ubuzima',
            'ndabizi', 'ntabwo', 'ndabishaka', 'ndabishimira', 'cyane', 'rwose',
        ),
        'fr': (
            'bonjour', 'bonsoir', 'merci', 'comment', 'allez-vous', 'ça va',
            'je suis', 'je vais', 'je peux', 'très bien', 'très mal',
            'anxieux', 'déprimé', 'stressé', 'santé mentale', 'problème',
        ),
        'sw': (
            'hujambo', 'hamjambo', 'habari', 'asante', 'karibu', 'pole',
            'sijambo', 'hajambo', 'mimi', 'wewe', 'yeye', 'sisi', 'nyinyi',
            'nina', 'una', 'ana', 'tuna', 'mna', 'wana', 'shida', 'matatizo',
        ),
    }

    # Whole-token matchers: the lookarounds stop short tokens such as 'ana'
    # from matching inside unrelated words such as 'banana'.
    _STRONG_TOKEN_RES = {
        lang: re.compile(r'(?<!\w)(?:' + '|'.join(map(re.escape, tokens)) + r')(?!\w)')
        for lang, tokens in _STRONG_TOKENS.items()
    }

    _GREETINGS = frozenset(
        ['hello', 'hi', 'hey', 'good morning', 'good afternoon', 'good evening']
    )

    # French fragments removed from Kinyarwanda output.  The connectives are
    # matched as whole words only, so letters inside Kinyarwanda words and
    # names (e.g. the 'car' in 'CARAES') are never touched.
    _FRENCH_LEAK_RES = (
        re.compile(r"ligne d'assistance en santé mentale", re.IGNORECASE),
        re.compile(r'\b(?:pour|avec|sans|dans|sur|entre|car|donc|mais|ou)\b', re.IGNORECASE),
    )

    # Immediate word repetitions collapsed after translation.  Replacing with
    # the first captured occurrence keeps its original capitalisation.
    _FRENCH_DUPLICATE_RE = re.compile(r'\b(je suis|très|de|du|des)\s+\1\b', re.IGNORECASE)
    _KISWAHILI_DUPLICATE_RE = re.compile(
        r'\b(mimi|wewe|yeye|sisi|nyinyi|wao)\s+\1\b', re.IGNORECASE
    )

    # Detector output codes -> supported codes.
    _DETECTOR_CODES = {
        'en': 'en', 'eng': 'en',
        'fr': 'fr', 'fra': 'fr', 'fre': 'fr',
        'sw': 'sw', 'swa': 'sw', 'swc': 'sw',
        'rw': 'rw', 'kin': 'rw',
    }

    # Language names / aliases -> codes passed to GoogleTranslator.
    _TRANSLATOR_CODES = {
        'en': 'en', 'english': 'en',
        'fr': 'fr', 'french': 'fr', 'français': 'fr',
        'sw': 'sw', 'kiswahili': 'sw', 'swahili': 'sw',
        'rw': 'rw', 'kinyarwanda': 'rw', 'kin': 'rw', 'ikinyarwanda': 'rw',
    }

    def __init__(self):
        """Create the translator handle and the language tables."""
        # Best effort: a failure here disables translation instead of crashing
        # the application that imports this module.
        try:
            self.translator = GoogleTranslator()
        except Exception as exc:
            self._logger.warning("Failed to initialize GoogleTranslator: %s", exc)
            self.translator = None

        # Language mappings for supported languages
        self.language_codes = {
            'kinyarwanda': 'rw',
            'french': 'fr',
            'kiswahili': 'sw',
            'english': 'en',
        }

        # Supported language codes for detection
        self.supported_languages = ['en', 'fr', 'sw', 'rw']

        # Domain glossary for consistent Kinyarwanda phrasing.
        # Maps common English/French mental health phrases to preferred
        # Kinyarwanda; the last two entries delete unwanted fragments.
        self.rw_glossary = [
            (r"(?i)mental health hotline\s*:?\s*105", "Umurongo wa telefone w'ubufasha mu by'ubuzima bwo mu mutwe: 105"),
            (r"(?i)ligne d'assistance en santé mentale\s*:?\s*105", "Umurongo wa telefone w'ubufasha mu by'ubuzima bwo mu mutwe: 105"),
            (r"(?i)call\s*112", "Hamagara 112 mu gihe cy'ibyago byihutirwa"),
            (r"(?i)emergency", "ibyago byihutirwa"),
            (r"(?i)caraes\s*ndera\s*hospital", "CARAES Ndera"),
            (r"(?i)hdi\s*rwanda\s*counseling", "HDI Rwanda (Inama n'Ubujyanama)"),
            (r"(?i)arct\s*ruhuka", "ARCT Ruhuka"),
            (r"(?i)mental health", "ubuzima bwo mu mutwe"),
            (r"(?i)anxiety", "impungenge"),
            (r"(?i)depression", "agahinda kenshi"),
            (r"(?i)stress", "umunaniro w'ubwonko"),
            (r"(?i)coping strategies", "uburyo bwo kwifasha"),
            (r"(?i)ku bihano[,\s]*", ""),
            (r"(?i)komeza amajwi make ariko akunze", ""),
        ]

    # ------------------------------------------------------------------
    # Language detection
    # ------------------------------------------------------------------
    def detect_language(self, text: str) -> str:
        """Return the detected language of *text* as 'en', 'fr', 'sw' or 'rw'.

        Keyword patterns are tried first.  When they give no unambiguous
        answer, ``langdetect.detect`` is consulted and its code is mapped onto
        the supported set.  Empty input, input with fewer than two
        non-punctuation characters, and any detector failure yield 'en'.

        Args:
            text: The user's message.

        Returns:
            One of the supported language codes.
        """
        if not text or not text.strip():
            return self._DEFAULT_LANGUAGE

        # Punctuation is ignored when deciding whether there is anything to
        # classify at all.
        if len(re.sub(r'[^\w\s]', '', text.strip().lower())) < 2:
            return self._DEFAULT_LANGUAGE

        pattern_lang = self._detect_by_patterns(text)
        if pattern_lang:
            return pattern_lang

        try:
            mapped = self._map_code(detect(text))
        except Exception as exc:
            # Detection is best effort; any failure falls back to English.
            self._logger.warning("Language detection error: %s", exc)
            return self._DEFAULT_LANGUAGE
        return mapped if mapped in self.supported_languages else self._DEFAULT_LANGUAGE

    def _detect_by_patterns(self, text: str) -> Optional[str]:
        """Score *text* against the keyword patterns of each language.

        Returns:
            The code of the single best-scoring language, or ``None`` when no
            pattern matches or when several languages share the top score.
            Returning ``None`` on a tie lets the caller fall back to the
            statistical detector instead of favouring whichever language
            happens to be listed first.
        """
        text_lower = text.lower().strip()
        scores = {
            lang: sum(1 for regex in regexes if regex.search(text_lower))
            for lang, regexes in self._LANGUAGE_PATTERNS.items()
        }
        best = max(scores.values())
        if best == 0:
            return None
        leaders = [lang for lang, score in scores.items() if score == best]
        return leaders[0] if len(leaders) == 1 else None

    def _map_code(self, code: str) -> str:
        """Map various detector codes into our set {en, fr, sw, rw}.

        Case, surrounding whitespace and a region suffix ('fr-FR', 'sw_KE')
        are ignored.  Unknown or empty codes map to 'en'.
        """
        if not code:
            return self._DEFAULT_LANGUAGE
        base = re.split(r'[-_]', code.strip().lower())[0]
        return self._DETECTOR_CODES.get(base, self._DEFAULT_LANGUAGE)

    def _has_strong_kinyarwanda_tokens(self, text_lower: str) -> bool:
        """Return True if *text_lower* contains a strong Kinyarwanda token as a whole word."""
        return self._STRONG_TOKEN_RES['rw'].search(text_lower) is not None

    def _has_strong_french_tokens(self, text_lower: str) -> bool:
        """Return True if *text_lower* contains a strong French token as a whole word."""
        return self._STRONG_TOKEN_RES['fr'].search(text_lower) is not None

    def _has_strong_kiswahili_tokens(self, text_lower: str) -> bool:
        """Return True if *text_lower* contains a strong Kiswahili token as a whole word."""
        return self._STRONG_TOKEN_RES['sw'].search(text_lower) is not None

    def _is_common_greeting(self, text: str) -> bool:
        """Return True if *text* is, in its entirety, a common English greeting."""
        return text.lower().strip() in self._GREETINGS

    # ------------------------------------------------------------------
    # Translation
    # ------------------------------------------------------------------
    def translate_text(self, text: str, target_language: str) -> str:
        """Translate *text* into *target_language* using GoogleTranslator.

        Args:
            text: Text to translate.
            target_language: Target language code ('en', 'fr', 'sw', 'rw') or
                one of the names accepted by ``_normalize_language_code``.

        Returns:
            The translated, post-processed text.  *text* is returned unchanged
            when it is empty, when the target is 'en' or missing, when no
            translator is available, when the translator returns nothing, or
            when translation raises.
        """
        if not text or not text.strip():
            return text
        if not target_language or target_language == 'en':
            return text
        if not self.translator:
            return text

        try:
            target_code = self._normalize_language_code(target_language)
            translated = self._google_translate(text, target_code)
            if not translated:
                return text
            # Dispatch on the normalised code so aliases such as
            # 'kinyarwanda' receive the same post-processing as 'rw'.
            post_process = {
                'rw': self.normalize_kinyarwanda,
                'fr': self.normalize_french,
                'sw': self.normalize_kiswahili,
            }.get(target_code)
            return post_process(translated) if post_process else translated
        except Exception as exc:
            # Best effort: an untranslated reply is better than no reply.
            self._logger.warning("Translation error: %s", exc)
            return text

    def _google_translate(self, text: str, target_code: str) -> str:
        """Translate *text* to *target_code* with automatic source detection."""
        return GoogleTranslator(source='auto', target=target_code).translate(text)

    def _normalize_language_code(self, lang: str) -> str:
        """Normalize a language code or name to GoogleTranslator format.

        Unknown or empty values map to 'en'.
        """
        if not lang:
            return self._DEFAULT_LANGUAGE
        return self._TRANSLATOR_CODES.get(lang.strip().lower(), self._DEFAULT_LANGUAGE)

    @staticmethod
    def _tidy_spacing(text: str) -> str:
        """Collapse whitespace runs and drop spaces before commas and full stops."""
        text = re.sub(r"\s+", " ", text).strip()
        text = re.sub(r"\s+,", ",", text)
        return re.sub(r"\s+\.", ".", text)

    def normalize_kinyarwanda(self, text: str) -> str:
        """Post-process Kinyarwanda output.

        Applies the domain glossary, then removes leaked French fragments,
        then tidies spacing.  The glossary runs first so that its French
        hotline entry can still match before that phrase is stripped.
        """
        if not text:
            return text
        normalized = text
        for pattern, replacement in self.rw_glossary:
            normalized = re.sub(pattern, replacement, normalized)
        for leak_re in self._FRENCH_LEAK_RES:
            normalized = leak_re.sub("", normalized)
        return self._tidy_spacing(normalized)

    def normalize_french(self, text: str) -> str:
        """Post-process French output: collapse duplicated words and tidy spacing."""
        if not text:
            return text
        return self._tidy_spacing(self._FRENCH_DUPLICATE_RE.sub(r'\1', text))

    def normalize_kiswahili(self, text: str) -> str:
        """Post-process Kiswahili output: collapse duplicated pronouns and tidy spacing."""
        if not text:
            return text
        return self._tidy_spacing(self._KISWAHILI_DUPLICATE_RE.sub(r'\1', text))

    # ------------------------------------------------------------------
    # Public helpers
    # ------------------------------------------------------------------
    def get_appropriate_response(self, english_response: str, user_language: str) -> str:
        """Return *english_response* in *user_language*.

        The English text is returned as-is when the language is 'en' or
        missing, or when translation raises.
        """
        if user_language == 'en' or not user_language:
            return english_response
        try:
            return self.translate_text(english_response, user_language)
        except Exception as exc:
            self._logger.warning("Translation failed: %s", exc)
            return english_response

    def process_user_message(self, user_message: str, english_response: str) -> str:
        """Detect the language of *user_message* and answer in that language.

        Args:
            user_message: The user's input message.
            english_response: The AI-generated response in English.

        Returns:
            The response translated to the detected language, or
            *english_response* unchanged when either argument is empty.
        """
        if not user_message or not english_response:
            return english_response
        detected_language = self.detect_language(user_message)
        # Only the length is logged: the message itself may be sensitive.
        self._logger.debug(
            "User message language detected: %s (%d characters)",
            detected_language, len(user_message),
        )
        return self.get_appropriate_response(english_response, detected_language)

    def get_multilingual_response(self, english_response: str, user_language: str) -> Dict[str, str]:
        """Return *english_response* in English plus every other supported language.

        The result always has an 'en' entry and one entry for each of 'fr',
        'sw' and 'rw' that differs from *user_language*.

        NOTE(review): the user's own language is skipped here; confirm with
        callers that they do not expect it in the result.
        """
        responses = {'en': english_response}
        for lang in ('fr', 'sw', 'rw'):
            if lang != user_language:
                responses[lang] = self.translate_text(english_response, lang)
        return responses

    def get_language_name(self, lang_code: str) -> str:
        """Return the English display name for *lang_code* ('English' if unknown)."""
        names = {'en': 'English', 'fr': 'French', 'sw': 'Kiswahili', 'rw': 'Kinyarwanda'}
        return names.get(lang_code, 'English')

    def is_supported_language(self, lang_code: str) -> bool:
        """Return True if *lang_code* is one of the supported codes."""
        return lang_code in self.supported_languages

    def get_supported_languages(self) -> List[str]:
        """Return a copy of the supported codes, so callers cannot mutate internal state."""
        return list(self.supported_languages)
# Global translation service instance, created once when this module is
# imported and used by translate_chatbot_response().
translation_service = TranslationService()
# Convenience function for easy integration
def translate_chatbot_response(user_message: str, english_response: str) -> str:
    """Translate an English chatbot reply into the language of the user's message.

    Thin module-level entry point that forwards to the shared
    ``translation_service`` instance.

    Args:
        user_message: The user's input message; its language is detected.
        english_response: The AI-generated response in English.

    Returns:
        Whatever ``translation_service.process_user_message`` returns for the
        two arguments.
    """
    localized_reply = translation_service.process_user_message(
        user_message, english_response
    )
    return localized_reply
|