# ishingiro / translation_service.py
# IZERE HIRWA Roger
# ishingiro
# c024705
"""
Professional Multilingual Chatbot Translation Service
Supports English, French, Kiswahili, and Kinyarwanda
Features:
- Automatic language detection from user input
- Exclusively responds in the detected language
- Uses GoogleTranslator from deep_translator for accurate translation
- Maintains natural tone, accuracy, and clarity in all supported languages
"""
from typing import Dict, List, Optional, Tuple
from langdetect import detect, detect_langs, DetectorFactory
from deep_translator import GoogleTranslator
import re
# Optional, higher-quality detectors/translators.
# Both imports are best-effort: if a package is missing or fails to load, its
# name is bound to None instead of aborting the import of this module.
# NOTE(review): neither `langid` nor `pycld3` is referenced anywhere else in
# the visible part of this file - confirm they are still needed.
try:
    import langid
    # Lightweight, fast language id
except Exception:  # pragma: no cover
    langid = None
try:
    import pycld3
    # Google Compact Language Detector v3
except Exception:  # pragma: no cover
    pycld3 = None
# Set seed for consistent language detection
DetectorFactory.seed = 0
class TranslationService:
    """Detect the user's language and translate English replies into it.

    Supported language codes are ``'en'``, ``'fr'``, ``'sw'`` and ``'rw'``.
    Detection first scores keyword patterns and, when those give no clear
    answer, falls back to ``langdetect``. Translation is delegated to
    ``deep_translator.GoogleTranslator`` and followed by a light,
    language-specific clean-up pass.
    """

    # Codes emitted by language detectors, folded onto the supported set.
    _DETECTOR_CODE_MAP = {
        'en': 'en', 'eng': 'en',
        'fr': 'fr', 'fra': 'fr', 'fre': 'fr',
        'sw': 'sw', 'swa': 'sw', 'swc': 'sw',
        'rw': 'rw', 'kin': 'rw',
    }

    # Keyword patterns per language, compiled once for the whole class.
    # Tokens that are also everyday English words ("we", "mental", "car") are
    # deliberately left out of the non-English lists: on their own they made
    # plain English messages score as Kinyarwanda or French.
    _LANGUAGE_PATTERNS = {
        'rw': tuple(re.compile(p) for p in (
            r'\b(muraho|murakaza|murabe|murakoze|mwiriwe|mwaramutse|murakaza neza|muraho rwose|muraho neza)\b',
            r'\b(ndabizi|ntabwo|ndabishaka|ndabishimira|ndabishimye|ndabishimye cyane|ndumva)\b',
            r'\b(umunsi|umunsi mwiza|umunsi mubi|ejo|ejo hazaza|ejo hashize|uyu munsi)\b',
            r'\b(amahoro|amahoro yose|amahoro yanyu|amahoro yanjye)\b',
            r'\b(ubwoba|ubwoba bubabaje|ubwoba bunyuma|ubwoba bwinshi|umutwe|umereye|nabi)\b',
            r'\b(umutima|umutima wanjye|umutima wanyu|umutima wanjye)\b',
            r'\b(ubuzima|ubuzima bwiza|ubuzima bubi|ubuzima bwinshi)\b',
            r'\b(nshaka|ntabwo|ndabizi|ndabishimira|ndabishimye|ndumva|ndabishimye)\b',
            r'\b(jewe|wewe|twebwe|mwebwe|bo)\b',
            r'\b(murakoze|murakoze cyane|murakoze cane|murakoze rwose)\b',
            r"\b(ntabwo|ntabwo bimeze|ntabwo bimeze nk'uko)\b",
            r'\b(umutwe|umereye|nabi|ndumva|cyane|rwose|neza)\b',
        )),
        'fr': tuple(re.compile(p) for p in (
            r'\b(bonjour|bonsoir|salut|bonne journée|bonne soirée)\b',
            r'\b(merci|merci beaucoup|merci bien|de rien)\b',
            r'\b(comment allez-vous|comment ça va|ça va bien|ça va mal)\b',
            r'\b(je suis|je vais|je peux|je veux|je dois|je fais)\b',
            r'\b(très bien|très mal|pas mal|comme ci comme ça|ça va)\b',
            r'\b(anxieux|anxieuse|déprimé|déprimée|stressé|stressée)\b',
            r"\b(depuis|pendant|maintenant|hier|demain|aujourd'hui)\b",
            r'\b(problème|difficulté|souci|inquiétude|santé mentale)\b',
            r'\b(santé|psychologique|émotionnel|psychologue)\b',
            r'\b(avec|sans|pour|dans|sur|sous|entre|parmi)\b',
            r'\b(et|ou|mais|donc|ni|puis)\b',
        )),
        'sw': tuple(re.compile(p) for p in (
            r'\b(hujambo|hamjambo|habari|habari yako|habari za asubuhi|habari za mchana)\b',
            r'\b(asante|asante sana|karibu|pole|pole sana|pole kwa ajili)\b',
            r'\b(sijambo|hajambo|hatujambo|hamjambo|hawajambo)\b',
            r'\b(mimi|wewe|yeye|sisi|nyinyi|wao)\b',
            r'\b(nina|una|ana|tuna|mna|wana|niko|uko|ako|tuko|mko|wako)\b',
            r'\b(shida|matatizo|huzuni|furaha|wasiwasi|msongo wa mawazo)\b',
            r'\b(afya ya akili|moyo|roho|hisia|mawazo)\b',
            r'\b(rafiki|mpenzi|mama|baba|mtoto|mzee|mke|mume)\b',
            r'\b(leo|jana|kesho|sasa|zamani|baadaye)\b',
            r'\b(naomba|tafadhali|samahani|pole|pole sana)\b',
        )),
        'en': tuple(re.compile(p) for p in (
            r'\b(hello|hi|hey|good morning|good afternoon|good evening)\b',
            r'\b(thank you|thanks|please|sorry|excuse me)\b',
            r"\b(i am|i'm|i have|i can|i will|i would)\b",
            r'\b(help|support|assistance|mental health|anxiety|depression)\b',
            r'\b(how are you|how do you|what is|where is|when is)\b',
        )),
    }

    def __init__(self):
        """Set up the translator flag, language tables and glossary."""
        # The instance created here only serves as an "is translation
        # available" flag; translate_text builds a fresh GoogleTranslator
        # with the right target for every call.
        try:
            self.translator = GoogleTranslator()
        except Exception as e:
            print(f"Warning: Failed to initialize GoogleTranslator: {e}")
            self.translator = None
        # Language mappings for supported languages
        self.language_codes = {
            'kinyarwanda': 'rw',
            'french': 'fr',
            'kiswahili': 'sw',
            'english': 'en',
        }
        # Supported language codes for detection
        self.supported_languages = ['en', 'fr', 'sw', 'rw']
        # Domain glossary for consistent Kinyarwanda phrasing: (regex,
        # replacement) pairs applied in order by normalize_kinyarwanda.
        # Longer phrases come before the shorter phrases they contain.
        self.rw_glossary = [
            (r"(?i)mental health hotline\s*:?\s*105", "Umurongo wa telefone w'ubufasha mu by'ubuzima bwo mu mutwe: 105"),
            (r"(?i)ligne d'assistance en santé mentale\s*:?\s*105", "Umurongo wa telefone w'ubufasha mu by'ubuzima bwo mu mutwe: 105"),
            (r"(?i)call\s*112", "Hamagara 112 mu gihe cy'ibyago byihutirwa"),
            (r"(?i)emergency", "ibyago byihutirwa"),
            (r"(?i)caraes\s*ndera\s*hospital", "CARAES Ndera"),
            (r"(?i)hdi\s*rwanda\s*counseling", "HDI Rwanda (Inama n'Ubujyanama)"),
            (r"(?i)arct\s*ruhuka", "ARCT Ruhuka"),
            (r"(?i)mental health", "ubuzima bwo mu mutwe"),
            (r"(?i)anxiety", "impungenge"),
            (r"(?i)depression", "agahinda kenshi"),
            (r"(?i)stress", "umunaniro w'ubwonko"),
            (r"(?i)coping strategies", "uburyo bwo kwifasha"),
            (r"(?i)ku bihano[,\s]*", ""),
            (r"(?i)komeza amajwi make ariko akunze", ""),
        ]

    # ------------------------------------------------------------------
    # Language detection
    # ------------------------------------------------------------------
    def detect_language(self, text: str) -> str:
        """Return the language of ``text`` as 'en', 'fr', 'sw' or 'rw'.

        Keyword patterns are tried first; when they give no unambiguous
        answer the result of ``langdetect.detect`` is mapped onto the
        supported set. Empty input, input with fewer than two word
        characters, unsupported languages and any detection error all
        yield 'en'.

        Args:
            text: The user's message.

        Returns:
            One of the supported language codes.
        """
        if not text or not text.strip():
            return 'en'
        # Punctuation-only or single-character input carries no signal.
        cleaned_text = re.sub(r'[^\w\s]', '', text.strip().lower())
        if len(cleaned_text) < 2:
            return 'en'
        try:
            pattern_lang = self._detect_by_patterns(text)
            if pattern_lang:
                return pattern_lang
            mapped = self._map_code(detect(text))
            return mapped if mapped in self.supported_languages else 'en'
        except Exception as e:
            print(f"Language detection error: {e}")
            return 'en'

    def _detect_by_patterns(self, text: str) -> Optional[str]:
        """Score ``text`` against the keyword patterns of each language.

        A language earns one point per pattern that matches. The language
        with the strictly highest score wins.

        Returns:
            The winning language code, or ``None`` when nothing matched or
            when two or more languages tie for the top score, so that the
            caller can fall back to statistical detection instead of
            silently favouring whichever language is listed first.
        """
        text_lower = text.lower().strip()
        scores = {
            lang: sum(1 for pattern in patterns if pattern.search(text_lower))
            for lang, patterns in self._LANGUAGE_PATTERNS.items()
        }
        best = max(scores.values())
        if best == 0:
            return None
        leaders = [lang for lang, score in scores.items() if score == best]
        return leaders[0] if len(leaders) == 1 else None

    def _map_code(self, code: str) -> str:
        """Map various detector codes into our set {en, fr, sw, rw}.

        The lookup ignores case and any region suffix ('fr-CA', 'sw_KE').
        Unknown or empty codes map to 'en'.
        """
        if not code:
            return 'en'
        base = re.split(r'[-_]', code.strip().lower(), maxsplit=1)[0]
        return self._DETECTOR_CODE_MAP.get(base, 'en')

    @staticmethod
    def _contains_any_word(text_lower: str, tokens: List[str]) -> bool:
        """Return True if any token occurs in the text as a whole word/phrase."""
        # Look-arounds instead of a plain substring test, so that short
        # tokens such as 'una' or 'ana' do not fire inside 'unable'/'manage'.
        return any(
            re.search(r'(?<!\w)' + re.escape(token) + r'(?!\w)', text_lower)
            for token in tokens
        )

    def _has_strong_kinyarwanda_tokens(self, text_lower: str) -> bool:
        """Check for strong Kinyarwanda indicators (whole-word match)."""
        tokens = [
            'muraho', 'mwiriwe', 'mwaramutse', 'murakoze', 'ndumva',
            'ubwoba', 'umutwe', 'umereye', 'nabi', 'amahoro', 'ubuzima',
            'ndabizi', 'ntabwo', 'ndabishaka', 'ndabishimira', 'cyane', 'rwose',
        ]
        return self._contains_any_word(text_lower, tokens)

    def _has_strong_french_tokens(self, text_lower: str) -> bool:
        """Check for strong French indicators (whole-word match)."""
        # Feminine forms are listed explicitly because matching is now on
        # whole words rather than substrings.
        tokens = [
            'bonjour', 'bonsoir', 'merci', 'comment', 'allez-vous', 'ça va',
            'je suis', 'je vais', 'je peux', 'très bien', 'très mal',
            'anxieux', 'anxieuse', 'déprimé', 'déprimée', 'stressé',
            'stressée', 'santé mentale', 'problème',
        ]
        return self._contains_any_word(text_lower, tokens)

    def _has_strong_kiswahili_tokens(self, text_lower: str) -> bool:
        """Check for strong Kiswahili indicators (whole-word match)."""
        tokens = [
            'hujambo', 'hamjambo', 'habari', 'asante', 'karibu', 'pole',
            'sijambo', 'hajambo', 'mimi', 'wewe', 'yeye', 'sisi', 'nyinyi',
            'nina', 'una', 'ana', 'tuna', 'mna', 'wana', 'shida', 'matatizo',
        ]
        return self._contains_any_word(text_lower, tokens)

    def _is_common_greeting(self, text: str) -> bool:
        """Check if text is exactly one of a few common English greetings."""
        greetings = ['hello', 'hi', 'hey', 'good morning', 'good afternoon', 'good evening']
        return text.lower().strip() in greetings

    # ------------------------------------------------------------------
    # Translation
    # ------------------------------------------------------------------
    def translate_text(self, text: str, target_language: str) -> str:
        """Translate ``text`` into ``target_language`` with GoogleTranslator.

        Args:
            text: Text to translate.
            target_language: Target language code or name ('en', 'fr', 'sw',
                'rw', 'french', 'kinyarwanda', ...). Anything that does not
                resolve to French, Kiswahili or Kinyarwanda is treated as
                English.

        Returns:
            The translated and post-processed text. The input is returned
            unchanged when it is blank, when the target resolves to English,
            when no translator is available, when the translator returns
            nothing, or when translation raises.
        """
        if not text or not text.strip():
            return text
        # Resolve aliases first so that the English short-circuit and the
        # post-processing below see the same canonical code.
        target_code = self._normalize_language_code(target_language)
        if target_code == 'en' or not self.translator:
            return text
        try:
            translated = GoogleTranslator(source='auto', target=target_code).translate(text)
            if not translated:
                # An empty/None result is worse than the untranslated text.
                return text
            post_processors = {
                'rw': self.normalize_kinyarwanda,
                'fr': self.normalize_french,
                'sw': self.normalize_kiswahili,
            }
            return post_processors[target_code](translated)
        except Exception as e:
            print(f"Translation error: {e}")
            return text

    def _normalize_language_code(self, lang: str) -> str:
        """Normalize a language code or name to GoogleTranslator format.

        Matching ignores case and surrounding whitespace; unknown or empty
        values map to 'en'.
        """
        mapping = {
            'en': 'en', 'english': 'en',
            'fr': 'fr', 'french': 'fr', 'français': 'fr',
            'sw': 'sw', 'kiswahili': 'sw', 'swahili': 'sw',
            'rw': 'rw', 'kinyarwanda': 'rw', 'kin': 'rw', 'ikinyarwanda': 'rw',
        }
        return mapping.get((lang or '').strip().lower(), 'en')

    # ------------------------------------------------------------------
    # Post-processing
    # ------------------------------------------------------------------
    @staticmethod
    def _tidy_spacing(text: str) -> str:
        """Collapse whitespace runs and remove spaces before ',' and '.'."""
        tidy = re.sub(r"\s+", " ", text).strip()
        tidy = re.sub(r"\s+,", ",", tidy)
        return re.sub(r"\s+\.", ".", tidy)

    @staticmethod
    def _collapse_repeated_words(text: str, words: List[str]) -> str:
        """Collapse immediate repetitions of any of ``words`` to one copy.

        The first occurrence is kept as written, so a sentence-initial
        capital survives ("Très très" -> "Très").
        """
        alternation = "|".join(re.escape(word) for word in words)
        pattern = r"\b(" + alternation + r")(?:\s+\1\b)+"
        return re.sub(pattern, r"\1", text, flags=re.IGNORECASE)

    def normalize_kinyarwanda(self, text: str) -> str:
        """Post-process Kinyarwanda output from the translator.

        Applies the domain glossary, then strips French fragments that
        sometimes leak into the translation, then tidies spacing.
        """
        if not text:
            return text
        normalized = text
        # Glossary first: some entries match English/French phrases that the
        # leak removal below would otherwise delete before they are mapped.
        for pat, repl in self.rw_glossary:
            normalized = re.sub(pat, repl, normalized)
        french_leak_patterns = [
            r"(?i)ligne d'assistance en santé mentale",
            # Whole words only - without the boundaries this removed letters
            # from inside other words (e.g. the "CAR" of "CARAES").
            r"(?i)\b(?:pour|avec|sans|dans|sur|entre|car|donc|mais|ou)\b",
        ]
        for pat in french_leak_patterns:
            normalized = re.sub(pat, "", normalized)
        return self._tidy_spacing(normalized)

    def normalize_french(self, text: str) -> str:
        """Post-process French output: drop doubled words and tidy spacing."""
        if not text:
            return text
        deduplicated = self._collapse_repeated_words(
            text, ['je suis', 'très', 'de', 'du', 'des']
        )
        return self._tidy_spacing(deduplicated)

    def normalize_kiswahili(self, text: str) -> str:
        """Post-process Kiswahili output: drop doubled pronouns and tidy spacing."""
        if not text:
            return text
        deduplicated = self._collapse_repeated_words(
            text, ['mimi', 'wewe', 'yeye', 'sisi', 'nyinyi', 'wao']
        )
        return self._tidy_spacing(deduplicated)

    # ------------------------------------------------------------------
    # Public chatbot interface
    # ------------------------------------------------------------------
    def get_appropriate_response(self, english_response: str, user_language: str) -> str:
        """Return ``english_response`` rendered in ``user_language``.

        The English text is returned as-is when the language is 'en' or
        empty, or when translation fails.
        """
        if user_language == 'en' or not user_language:
            return english_response
        try:
            return self.translate_text(english_response, user_language)
        except Exception as e:
            print(f"Translation failed: {e}")
            return english_response

    def process_user_message(self, user_message: str, english_response: str) -> str:
        """Detect the language of ``user_message`` and answer in it.

        Args:
            user_message: The user's input message.
            english_response: The AI-generated response in English.

        Returns:
            ``english_response`` translated to the detected language, or
            ``english_response`` unchanged when either argument is empty.
        """
        if not user_message or not english_response:
            return english_response
        detected_language = self.detect_language(user_message)
        # NOTE(review): these prints echo user text to stdout; confirm that is
        # acceptable for the deployment before keeping them.
        print(f"User message language detected: {detected_language}")
        print(f"User message: {user_message[:100]}...")
        return self.get_appropriate_response(english_response, detected_language)

    def get_multilingual_response(self, english_response: str, user_language: str) -> Dict[str, str]:
        """Return the response keyed by language code.

        The result always holds 'en' (the original text) plus a translation
        for each of 'fr', 'sw' and 'rw' that differs from ``user_language``.
        """
        # NOTE(review): the user's own language is skipped here - confirm
        # that callers obtain it separately.
        responses = {'en': english_response}
        for lang in ['fr', 'sw', 'rw']:
            if lang != user_language:
                responses[lang] = self.translate_text(english_response, lang)
        return responses

    def get_language_name(self, lang_code: str) -> str:
        """Return the English display name for a code ('English' if unknown)."""
        names = {'en': 'English', 'fr': 'French', 'sw': 'Kiswahili', 'rw': 'Kinyarwanda'}
        return names.get(lang_code, 'English')

    def is_supported_language(self, lang_code: str) -> bool:
        """Return True when ``lang_code`` is one of the supported codes."""
        return lang_code in self.supported_languages

    def get_supported_languages(self) -> List[str]:
        """Return a copy of the supported language codes."""
        # A copy, so callers cannot mutate the service's own list by accident.
        return list(self.supported_languages)
# Global translation service instance.
# Built once when this module is imported and shared by
# translate_chatbot_response() below.
translation_service = TranslationService()
# Module-level shortcut around the shared TranslationService instance
def translate_chatbot_response(user_message: str, english_response: str) -> str:
    """
    Translate an English chatbot reply into the language of the user's message.

    Delegates to ``process_user_message`` on the module-level
    ``translation_service`` instance.

    Args:
        user_message: The user's input message
        english_response: The AI-generated response in English
    Returns:
        Whatever ``process_user_message`` returns for these two arguments
    """
    service = translation_service
    reply = service.process_user_message(user_message, english_response)
    return reply