"""
Professional Multilingual Chatbot Translation Service.

Supports English, French, Kiswahili, and Kinyarwanda.

Features:
- Automatic language detection from user input
- Exclusively responds in the detected language
- Uses GoogleTranslator from deep_translator for accurate translation
- Maintains natural tone, accuracy, and clarity in all supported languages
"""

from typing import Dict, List, Optional, Tuple
import re

# langdetect / deep_translator are guarded the same way as the optional
# detectors below: the service already has a "translator unavailable" code
# path, so a missing package degrades to pattern-only detection and
# untranslated (English) responses instead of breaking the import.
try:
    from langdetect import detect, detect_langs, DetectorFactory
except ImportError:  # pragma: no cover
    detect = detect_langs = DetectorFactory = None

try:
    from deep_translator import GoogleTranslator
except ImportError:  # pragma: no cover
    GoogleTranslator = None

# Optional, higher-quality detectors/translators
try:
    import langid  # Lightweight, fast language id
except Exception:  # pragma: no cover
    langid = None

try:
    import pycld3  # Google Compact Language Detector v3
except Exception:  # pragma: no cover
    pycld3 = None

# Set seed for consistent language detection
if DetectorFactory is not None:
    DetectorFactory.seed = 0


# Keyword patterns per language. Insertion order (rw, fr, sw, en) is the
# tie-break order among the non-English languages in _detect_by_patterns.
_RAW_LANGUAGE_PATTERNS = {
    # Kinyarwanda patterns
    'rw': (
        r'\b(muraho|murakaza|murabe|murakoze|mwiriwe|mwaramutse|murakaza neza|muraho rwose|muraho neza)\b',
        r'\b(ndabizi|ntabwo|ndabishaka|ndabishimira|ndabishimye|ndabishimye cyane|ndumva)\b',
        r'\b(umunsi|umunsi mwiza|umunsi mubi|ejo|ejo hazaza|ejo hashize|uyu munsi)\b',
        r'\b(amahoro|amahoro yose|amahoro yanyu|amahoro yanjye)\b',
        r'\b(ubwoba|ubwoba bubabaje|ubwoba bunyuma|ubwoba bwinshi|umutwe|umereye|nabi)\b',
        r'\b(umutima|umutima wanjye|umutima wanyu|umutima wanjye)\b',
        r'\b(ubuzima|ubuzima bwiza|ubuzima bubi|ubuzima bwinshi)\b',
        r'\b(nshaka|ntabwo|ndabizi|ndabishimira|ndabishimye|ndumva|ndabishimye)\b',
        # The bare pronoun "we" is deliberately NOT listed here: it is also
        # the English word "we" and made English sentences score as
        # Kinyarwanda.
        r'\b(jewe|wewe|twebwe|mwebwe|bo)\b',
        r'\b(murakoze|murakoze cyane|murakoze cane|murakoze rwose)\b',
        r"\b(ntabwo|ntabwo bimeze|ntabwo bimeze nk'uko)\b",
        r'\b(umutwe|umereye|nabi|ndumva|cyane|rwose|neza)\b',
    ),
    # French patterns
    'fr': (
        r'\b(bonjour|bonsoir|salut|bonne journée|bonne soirée)\b',
        r'\b(merci|merci beaucoup|merci bien|de rien)\b',
        r'\b(comment allez-vous|comment ça va|ça va bien|ça va mal)\b',
        r'\b(je suis|je vais|je peux|je veux|je dois|je fais)\b',
        r'\b(très bien|très mal|pas mal|comme ci comme ça|ça va)\b',
        r'\b(anxieux|anxieuse|déprimé|déprimée|stressé|stressée)\b',
        r"\b(depuis|pendant|maintenant|hier|demain|aujourd'hui)\b",
        r'\b(problème|difficulté|souci|inquiétude|santé mentale)\b',
        r'\b(santé|mental|psychologique|émotionnel|psychologue)\b',
        r'\b(avec|sans|pour|dans|sur|sous|entre|parmi)\b',
        r'\b(et|ou|mais|donc|car|ni|puis)\b',
    ),
    # Kiswahili patterns
    'sw': (
        r'\b(hujambo|hamjambo|habari|habari yako|habari za asubuhi|habari za mchana)\b',
        r'\b(asante|asante sana|karibu|pole|pole sana|pole kwa ajili)\b',
        r'\b(sijambo|hajambo|hatujambo|hamjambo|hawajambo)\b',
        r'\b(mimi|wewe|yeye|sisi|nyinyi|wao)\b',
        r'\b(nina|una|ana|tuna|mna|wana|niko|uko|ako|tuko|mko|wako)\b',
        r'\b(shida|matatizo|huzuni|furaha|wasiwasi|msongo wa mawazo)\b',
        r'\b(afya ya akili|moyo|roho|hisia|mawazo)\b',
        r'\b(rafiki|mpenzi|mama|baba|mtoto|mzee|mke|mume)\b',
        r'\b(leo|jana|kesho|sasa|zamani|baadaye)\b',
        r'\b(naomba|tafadhali|samahani|pole|pole sana)\b',
    ),
    # English patterns - to distinguish from other languages
    'en': (
        r'\b(hello|hi|hey|good morning|good afternoon|good evening)\b',
        r'\b(thank you|thanks|please|sorry|excuse me)\b',
        r"\b(i am|i'm|i have|i can|i will|i would)\b",
        r'\b(help|support|assistance|mental health|anxiety|depression)\b',
        r'\b(how are you|how do you|what is|where is|when is)\b',
        # Function words, mirroring the French function-word patterns, so
        # ordinary English sentences register an English signal at all.
        r'\b(the|and|is|are|was|my|your|with|this|that|have|feel|feeling|need|want|not)\b',
    ),
}

# Compiled once at import time instead of on every detection call.
_LANGUAGE_PATTERNS = {
    lang: tuple(re.compile(pattern) for pattern in patterns)
    for lang, patterns in _RAW_LANGUAGE_PATTERNS.items()
}

# French fragments that sometimes leak into Kinyarwanda output. The
# connectives are matched as WHOLE WORDS only; without the word boundaries
# they were deleted from inside unrelated words (e.g. "car" out of "CARAES").
_FRENCH_LEAK_PATTERNS = (
    re.compile(r"(?i)ligne d'assistance en santé mentale"),
    re.compile(r"(?i)\b(?:pour|avec|sans|dans|sur|entre|car|donc|mais|ou)\b"),
)


class TranslationService:
    """Detect the user's language and translate English responses into it.

    Supported language codes are 'en', 'fr', 'sw' and 'rw'. Every public
    method is best-effort: on any failure the English/original text is
    returned instead of raising.
    """

    def __init__(self):
        # Initialize GoogleTranslator for all translations. The instance is
        # only used as an "is translation available" flag; translate_text
        # builds a fresh translator per call with the requested target.
        try:
            if GoogleTranslator is None:
                raise ImportError("deep_translator is not installed")
            self.translator = GoogleTranslator()
        except Exception as e:
            print(f"Warning: Failed to initialize GoogleTranslator: {e}")
            self.translator = None

        # Language mappings for supported languages
        self.language_codes = {
            'kinyarwanda': 'rw',
            'french': 'fr',
            'kiswahili': 'sw',
            'english': 'en'
        }

        # Supported language codes for detection
        self.supported_languages = ['en', 'fr', 'sw', 'rw']

        # Domain glossary for consistent Kinyarwanda phrasing.
        # Maps common English/French mental health phrases to preferred
        # Kinyarwanda. Order matters: longer, more specific phrases come
        # before the shorter phrases they contain.
        self.rw_glossary = [
            (r"(?i)mental health hotline\s*:?\s*105", "Umurongo wa telefone w'ubufasha mu by'ubuzima bwo mu mutwe: 105"),
            (r"(?i)ligne d'assistance en santé mentale\s*:?\s*105", "Umurongo wa telefone w'ubufasha mu by'ubuzima bwo mu mutwe: 105"),
            (r"(?i)call\s*112", "Hamagara 112 mu gihe cy'ibyago byihutirwa"),
            (r"(?i)emergency", "ibyago byihutirwa"),
            (r"(?i)caraes\s*ndera\s*hospital", "CARAES Ndera"),
            (r"(?i)hdi\s*rwanda\s*counseling", "HDI Rwanda (Inama n'Ubujyanama)"),
            (r"(?i)arct\s*ruhuka", "ARCT Ruhuka"),
            (r"(?i)mental health", "ubuzima bwo mu mutwe"),
            (r"(?i)anxiety", "impungenge"),
            (r"(?i)depression", "agahinda kenshi"),
            (r"(?i)stress", "umunaniro w'ubwonko"),
            (r"(?i)coping strategies", "uburyo bwo kwifasha"),
            (r"(?i)ku bihano[,\s]*", ""),
            (r"(?i)komeza amajwi make ariko akunze", ""),
        ]

    def detect_language(self, text: str) -> str:
        """
        Detect the language of user input.

        Keyword patterns are tried first; if they give no signal, langdetect
        is used (when installed). Anything empty, too short, unsupported or
        failing falls back to English.

        Args:
            text: The user's raw message

        Returns:
            One of 'en', 'fr', 'sw', 'rw'
        """
        if not text or not text.strip():
            return 'en'

        # Strip punctuation only to decide whether there is enough content
        # to classify; detection itself runs on the original text.
        cleaned_text = re.sub(r'[^\w\s]', '', text.strip().lower())
        if len(cleaned_text) < 2:
            return 'en'

        try:
            # Primary detection using pattern matching
            pattern_lang = self._detect_by_patterns(text)
            if pattern_lang:
                return pattern_lang

            # Secondary detection using langdetect (skipped when missing).
            # NOTE(review): langdetect's documented language list does not
            # appear to include Kinyarwanda, so 'rw' is presumably only
            # reachable through the patterns above - confirm.
            if detect is None:
                return 'en'
            mapped = self._map_code(detect(text))

            if mapped in self.supported_languages:
                return mapped
            return 'en'
        except Exception as e:
            print(f"Language detection error: {e}")
            return 'en'

    def _detect_by_patterns(self, text: str) -> Optional[str]:
        """
        Detect language by counting keyword-pattern hits per language.

        Each pattern contributes at most one point to its language. English
        wins whenever it is among the top scorers: several keywords in the
        other lists are also English words (e.g. French "mental", "car"),
        and staying in English is the least harmful mistake because the
        response is generated in English.

        Args:
            text: The user's raw message

        Returns:
            'rw', 'fr', 'sw' or 'en', or None when no pattern matched
        """
        text_lower = text.lower().strip()

        language_scores = {
            lang: sum(1 for pattern in patterns if pattern.search(text_lower))
            for lang, patterns in _LANGUAGE_PATTERNS.items()
        }

        best = max(language_scores.values())
        if best == 0:
            return None
        if language_scores['en'] == best:
            return 'en'
        # Remaining ties resolve in dict order: rw, fr, sw.
        return max(language_scores, key=language_scores.get)

    def _map_code(self, code: str) -> str:
        """Map various detector codes into our set {en, fr, sw, rw}.

        Unknown codes map to 'en'.
        """
        mapping = {
            'en': 'en', 'eng': 'en',
            'fr': 'fr', 'fra': 'fr', 'fre': 'fr',
            'sw': 'sw', 'swa': 'sw', 'swc': 'sw',
            'rw': 'rw', 'kin': 'rw',
        }
        return mapping.get(code, 'en')

    def _has_strong_kinyarwanda_tokens(self, text_lower: str) -> bool:
        """Check for strong Kinyarwanda indicators.

        Matching is by substring, so a token also matches inside a longer
        word. Expects already lower-cased text.
        """
        tokens = [
            'muraho', 'mwiriwe', 'mwaramutse', 'murakoze', 'ndumva', 'ubwoba',
            'umutwe', 'umereye', 'nabi', 'amahoro', 'ubuzima', 'ndabizi',
            'ntabwo', 'ndabishaka', 'ndabishimira', 'cyane', 'rwose'
        ]
        return any(t in text_lower for t in tokens)

    def _has_strong_french_tokens(self, text_lower: str) -> bool:
        """Check for strong French indicators (substring match, lower-cased input)."""
        tokens = [
            'bonjour', 'bonsoir', 'merci', 'comment', 'allez-vous', 'ça va',
            'je suis', 'je vais', 'je peux', 'très bien', 'très mal',
            'anxieux', 'déprimé', 'stressé', 'santé mentale', 'problème'
        ]
        return any(t in text_lower for t in tokens)

    def _has_strong_kiswahili_tokens(self, text_lower: str) -> bool:
        """Check for strong Kiswahili indicators (substring match, lower-cased input)."""
        tokens = [
            'hujambo', 'hamjambo', 'habari', 'asante', 'karibu', 'pole',
            'sijambo', 'hajambo', 'mimi', 'wewe', 'yeye', 'sisi', 'nyinyi',
            'nina', 'una', 'ana', 'tuna', 'mna', 'wana', 'shida', 'matatizo'
        ]
        return any(t in text_lower for t in tokens)

    def _is_common_greeting(self, text: str) -> bool:
        """Check if text is exactly a common English greeting."""
        greetings = ['hello', 'hi', 'hey', 'good morning', 'good afternoon', 'good evening']
        return text.lower().strip() in greetings

    def translate_text(self, text: str, target_language: str) -> str:
        """
        Translate text to the target language using GoogleTranslator.

        Args:
            text: Text to translate
            target_language: Target language code ('en', 'fr', 'sw', 'rw')
                or a name accepted by _normalize_language_code

        Returns:
            Translated, post-processed text. The input text is returned
            unchanged when it is blank, when the target is 'en', when no
            translator is available, or when translation fails.
        """
        if not text or not text.strip():
            return text

        if target_language == 'en':
            return text

        try:
            # Normalize language code for GoogleTranslator
            target_code = self._normalize_language_code(target_language)

            if not self.translator:
                return text

            translated = GoogleTranslator(source='auto', target=target_code).translate(text)
            if not translated:
                # Never hand an empty/None response back to the user.
                return text

            # Dispatch on the normalized code so that aliases such as
            # 'kinyarwanda' get the same post-processing as 'rw'.
            post_processors = {
                'rw': self.normalize_kinyarwanda,
                'fr': self.normalize_french,
                'sw': self.normalize_kiswahili,
            }
            post_process = post_processors.get(target_code)
            return post_process(translated) if post_process else translated
        except Exception as e:
            print(f"Translation error: {e}")
            return text

    def _normalize_language_code(self, lang: str) -> str:
        """Normalize a language code or name (case-insensitive) to
        GoogleTranslator format; unknown values map to 'en'."""
        mapping = {
            'en': 'en', 'english': 'en',
            'fr': 'fr', 'french': 'fr', 'français': 'fr',
            'sw': 'sw', 'kiswahili': 'sw', 'swahili': 'sw',
            'rw': 'rw', 'kinyarwanda': 'rw', 'kin': 'rw', 'ikinyarwanda': 'rw'
        }
        return mapping.get(lang.lower(), 'en')

    def normalize_kinyarwanda(self, text: str) -> str:
        """
        Post-process Kinyarwanda to remove mixed-language fragments and
        enforce consistent terminology using the domain glossary.

        Args:
            text: Translated Kinyarwanda text (may be empty or None)

        Returns:
            The cleaned text; falsy input is returned unchanged
        """
        if not text:
            return text

        normalized = text

        # Glossary first: it contains an entry for the French hotline phrase,
        # which would never match if the leak removal below ran first and
        # deleted that phrase.
        for pat, repl in self.rw_glossary:
            normalized = re.sub(pat, repl, normalized)

        # Then drop whatever French fragments are still left.
        for leak in _FRENCH_LEAK_PATTERNS:
            normalized = leak.sub("", normalized)

        # Trim repetitive spaces and stray punctuation
        normalized = re.sub(r"\s+", " ", normalized).strip()
        normalized = re.sub(r"\s+,", ",", normalized)
        normalized = re.sub(r"\s+\.", ".", normalized)
        return normalized

    def normalize_french(self, text: str) -> str:
        """
        Post-process French text: collapse doubled words left by translation
        and tidy spacing before punctuation.
        """
        if not text:
            return text

        normalized = text

        # Fix common translation artifacts
        french_fixes = [
            (r'\bje suis\s+je suis\b', 'je suis'),
            (r'\btrès\s+très\b', 'très'),
            (r'\bde\s+de\b', 'de'),
            (r'\bdu\s+du\b', 'du'),
            (r'\bdes\s+des\b', 'des'),
        ]
        for pattern, replacement in french_fixes:
            normalized = re.sub(pattern, replacement, normalized, flags=re.IGNORECASE)

        # Clean up spacing and punctuation
        normalized = re.sub(r"\s+", " ", normalized).strip()
        normalized = re.sub(r"\s+,", ",", normalized)
        normalized = re.sub(r"\s+\.", ".", normalized)
        return normalized

    def normalize_kiswahili(self, text: str) -> str:
        """
        Post-process Kiswahili text: collapse doubled pronouns left by
        translation and tidy spacing before punctuation.
        """
        if not text:
            return text

        normalized = text

        # Fix common translation artifacts
        kiswahili_fixes = [
            (r'\bmimi\s+mimi\b', 'mimi'),
            (r'\bwewe\s+wewe\b', 'wewe'),
            (r'\byeye\s+yeye\b', 'yeye'),
            (r'\bsisi\s+sisi\b', 'sisi'),
            (r'\bnyinyi\s+nyinyi\b', 'nyinyi'),
            (r'\bwao\s+wao\b', 'wao'),
        ]
        for pattern, replacement in kiswahili_fixes:
            normalized = re.sub(pattern, replacement, normalized, flags=re.IGNORECASE)

        # Clean up spacing and punctuation
        normalized = re.sub(r"\s+", " ", normalized).strip()
        normalized = re.sub(r"\s+,", ",", normalized)
        normalized = re.sub(r"\s+\.", ".", normalized)
        return normalized

    def get_appropriate_response(self, english_response: str, user_language: str) -> str:
        """
        Return the response in the user's language.

        Args:
            english_response: The response text in English
            user_language: Detected language code; 'en' or a falsy value
                means no translation

        Returns:
            The translated response, or english_response if translation is
            not needed or fails
        """
        if user_language == 'en' or not user_language:
            return english_response

        try:
            return self.translate_text(english_response, user_language)
        except Exception as e:
            print(f"Translation failed: {e}")
            return english_response

    def process_user_message(self, user_message: str, english_response: str) -> str:
        """
        Detect the user's language from their message and return the
        response in that language. This is the primary interface method.

        Args:
            user_message: The user's input message
            english_response: The AI-generated response in English

        Returns:
            Response translated to the user's detected language;
            english_response unchanged if either argument is empty
        """
        if not user_message or not english_response:
            return english_response

        # Detect language from user's message
        detected_language = self.detect_language(user_message)

        # NOTE(review): this writes user message content to stdout; consider
        # whether that is acceptable for a mental-health chatbot.
        print(f"User message language detected: {detected_language}")
        print(f"User message: {user_message[:100]}...")

        return self.get_appropriate_response(english_response, detected_language)

    def get_multilingual_response(self, english_response: str, user_language: str) -> Dict[str, str]:
        """Return the response keyed by language code.

        'en' is always present. Of 'fr', 'sw' and 'rw', every language
        except user_language is translated and included.
        """
        responses = {'en': english_response}
        for lang in ['fr', 'sw', 'rw']:
            if lang != user_language:
                responses[lang] = self.translate_text(english_response, lang)
        return responses

    def get_language_name(self, lang_code: str) -> str:
        """Return the English display name for a code ('English' if unknown)."""
        names = {'en': 'English', 'fr': 'French', 'sw': 'Kiswahili', 'rw': 'Kinyarwanda'}
        return names.get(lang_code, 'English')

    def is_supported_language(self, lang_code: str) -> bool:
        """Return True if lang_code is one of the supported codes."""
        return lang_code in self.supported_languages

    def get_supported_languages(self) -> List[str]:
        """Return the supported language codes.

        A copy is returned so callers cannot mutate the service's own list.
        """
        return list(self.supported_languages)


# Global translation service instance
translation_service = TranslationService()


# Convenience function for easy integration
def translate_chatbot_response(user_message: str, english_response: str) -> str:
    """
    Convenience function for translating chatbot responses.

    Delegates to the module-level TranslationService instance.

    Args:
        user_message: The user's input message
        english_response: The AI-generated response in English

    Returns:
        Response translated to the user's detected language
    """
    return translation_service.process_user_message(user_message, english_response)