# File size: 18,210 Bytes (c024705)
"""
Professional Multilingual Chatbot Translation Service
Supports English, French, Kiswahili, and Kinyarwanda

Features:
- Automatic language detection from user input
- Exclusively responds in the detected language
- Uses GoogleTranslator from deep_translator for accurate translation
- Maintains natural tone, accuracy, and clarity in all supported languages
"""
from typing import Dict, List, Optional, Tuple
from langdetect import detect, detect_langs, DetectorFactory
from deep_translator import GoogleTranslator
import re

# Optional, higher-quality detectors/translators.
# Each import is attempted once; when the package is unavailable the name is
# bound to None so later code can feature-test it (e.g. `if langid:`).
# NOTE(review): neither name is referenced in the visible part of this file —
# confirm they are still needed.
try:
    import langid  # Lightweight, fast language id
except Exception:  # pragma: no cover
    langid = None

try:
    import pycld3  # Google Compact Language Detector v3
except Exception:  # pragma: no cover
    pycld3 = None

# Set seed for consistent language detection (same input -> same result
# across runs of langdetect).
DetectorFactory.seed = 0

class TranslationService:
    def __init__(self):
        # Initialize GoogleTranslator for all translations
        try:
            self.translator = GoogleTranslator()
        except Exception as e:
            print(f"Warning: Failed to initialize GoogleTranslator: {e}")
            self.translator = None
        
        # Language mappings for supported languages
        self.language_codes = {
            'kinyarwanda': 'rw',
            'french': 'fr', 
            'kiswahili': 'sw',
            'english': 'en'
        }
        
        # Supported language codes for detection
        self.supported_languages = ['en', 'fr', 'sw', 'rw']

        # Domain glossary for consistent Kinyarwanda phrasing
        # Maps common English/French mental health phrases to preferred Kinyarwanda
        self.rw_glossary = [
            (r"(?i)mental health hotline\s*:?\s*105", "Umurongo wa telefone w'ubufasha mu by'ubuzima bwo mu mutwe: 105"),
            (r"(?i)ligne d'assistance en santé mentale\s*:?\s*105", "Umurongo wa telefone w'ubufasha mu by'ubuzima bwo mu mutwe: 105"),
            (r"(?i)call\s*112", "Hamagara 112 mu gihe cy'ibyago byihutirwa"),
            (r"(?i)emergency", "ibyago byihutirwa"),
            (r"(?i)caraes\s*ndera\s*hospital", "CARAES Ndera"),
            (r"(?i)hdi\s*rwanda\s*counseling", "HDI Rwanda (Inama n'Ubujyanama)"),
            (r"(?i)arct\s*ruhuka", "ARCT Ruhuka"),
            (r"(?i)mental health", "ubuzima bwo mu mutwe"),
            (r"(?i)anxiety", "impungenge"),
            (r"(?i)depression", "agahinda kenshi"),
            (r"(?i)stress", "umunaniro w'ubwonko"),
            (r"(?i)coping strategies", "uburyo bwo kwifasha"),
            (r"(?i)ku bihano[,\s]*", ""),
            (r"(?i)komeza amajwi make ariko akunze", ""),
        ]

    def detect_language(self, text: str) -> str:
        """
        Professional language detection for multilingual chatbot.
        Detects language from user input and returns one of: 'en', 'fr', 'sw', 'rw'
        
        Uses ensemble method combining pattern matching, multiple detectors,
        and domain-specific knowledge for maximum accuracy.
        """
        if not text or not text.strip():
            return 'en'
        
        # Clean the text for better detection
        cleaned_text = re.sub(r'[^\w\s]', '', text.strip().lower())
        
        if len(cleaned_text) < 2:
            return 'en'
        
        try:
            # Primary detection using pattern matching
            pattern_lang = self._detect_by_patterns(text)
            if pattern_lang:
                return pattern_lang
            
            # Secondary detection using langdetect
            detected = detect(text)
            mapped = self._map_code(detected)
            
            # Tertiary validation using domain knowledge
            if mapped in self.supported_languages:
                return mapped
            
            return 'en'

        except Exception as e:
            print(f"Language detection error: {e}")
            return 'en'
    
    def _detect_by_patterns(self, text: str) -> str:
        """
        Detect language using comprehensive pattern matching for better accuracy
        """
        text_lower = text.lower().strip()
        
        # Count matches for each language to determine the strongest signal
        language_scores = {'rw': 0, 'fr': 0, 'sw': 0, 'en': 0}
        
        # Kinyarwanda patterns - more comprehensive
        kinyarwanda_patterns = [
            r'\b(muraho|murakaza|murabe|murakoze|mwiriwe|mwaramutse|murakaza neza|muraho rwose|muraho neza)\b',
            r'\b(ndabizi|ntabwo|ndabishaka|ndabishimira|ndabishimye|ndabishimye cyane|ndumva)\b',
            r'\b(umunsi|umunsi mwiza|umunsi mubi|ejo|ejo hazaza|ejo hashize|uyu munsi)\b',
            r'\b(amahoro|amahoro yose|amahoro yanyu|amahoro yanjye)\b',
            r'\b(ubwoba|ubwoba bubabaje|ubwoba bunyuma|ubwoba bwinshi|umutwe|umereye|nabi)\b',
            r'\b(umutima|umutima wanjye|umutima wanyu|umutima wanjye)\b',
            r'\b(ubuzima|ubuzima bwiza|ubuzima bubi|ubuzima bwinshi)\b',
            r'\b(nshaka|ntabwo|ndabizi|ndabishimira|ndabishimye|ndumva|ndabishimye)\b',
            r'\b(jewe|wewe|we|jewe|twebwe|mwebwe|bo)\b',
            r'\b(murakoze|murakoze cyane|murakoze cane|murakoze rwose)\b',
            r"\b(ntabwo|ntabwo bimeze|ntabwo bimeze nk'uko)\b",
            r'\b(umutwe|umereye|nabi|ndumva|cyane|rwose|neza)\b'
        ]
        
        # French patterns - more comprehensive
        french_patterns = [
            r'\b(bonjour|bonsoir|salut|bonne journée|bonne soirée)\b',
            r'\b(merci|merci beaucoup|merci bien|de rien)\b',
            r'\b(comment allez-vous|comment ça va|ça va bien|ça va mal)\b',
            r'\b(je suis|je vais|je peux|je veux|je dois|je fais)\b',
            r'\b(très bien|très mal|pas mal|comme ci comme ça|ça va)\b',
            r'\b(anxieux|anxieuse|déprimé|déprimée|stressé|stressée)\b',
            r"\b(depuis|pendant|maintenant|hier|demain|aujourd'hui)\b",
            r'\b(problème|difficulté|souci|inquiétude|santé mentale)\b',
            r'\b(santé|mental|psychologique|émotionnel|psychologue)\b',
            r'\b(avec|sans|pour|dans|sur|sous|entre|parmi)\b',
            r'\b(et|ou|mais|donc|car|ni|puis)\b'
        ]
        
        # Kiswahili patterns - more comprehensive
        kiswahili_patterns = [
            r'\b(hujambo|hamjambo|habari|habari yako|habari za asubuhi|habari za mchana)\b',
            r'\b(asante|asante sana|karibu|pole|pole sana|pole kwa ajili)\b',
            r'\b(sijambo|hajambo|hatujambo|hamjambo|hawajambo)\b',
            r'\b(mimi|wewe|yeye|sisi|nyinyi|wao)\b',
            r'\b(nina|una|ana|tuna|mna|wana|niko|uko|ako|tuko|mko|wako)\b',
            r'\b(shida|matatizo|huzuni|furaha|wasiwasi|msongo wa mawazo)\b',
            r'\b(afya ya akili|moyo|roho|hisia|mawazo)\b',
            r'\b(rafiki|mpenzi|mama|baba|mtoto|mzee|mke|mume)\b',
            r'\b(leo|jana|kesho|sasa|zamani|baadaye)\b',
            r'\b(naomba|tafadhali|samahani|pole|pole sana)\b'
        ]
        
        # English patterns - to distinguish from other languages
        english_patterns = [
            r'\b(hello|hi|hey|good morning|good afternoon|good evening)\b',
            r'\b(thank you|thanks|please|sorry|excuse me)\b',
            r"\b(i am|i'm|i have|i can|i will|i would)\b",
            r'\b(help|support|assistance|mental health|anxiety|depression)\b',
            r'\b(how are you|how do you|what is|where is|when is)\b'
        ]
        
        # Count pattern matches
        for pattern in kinyarwanda_patterns:
            if re.search(pattern, text_lower):
                language_scores['rw'] += 1
        
        for pattern in french_patterns:
            if re.search(pattern, text_lower):
                language_scores['fr'] += 1
        
        for pattern in kiswahili_patterns:
            if re.search(pattern, text_lower):
                language_scores['sw'] += 1
                
        for pattern in english_patterns:
            if re.search(pattern, text_lower):
                language_scores['en'] += 1
        
        # Return the language with the highest score
        if max(language_scores.values()) > 0:
            return max(language_scores, key=language_scores.get)
        
        return None

    def _map_code(self, code: str) -> str:
        """Map various detector codes into our set {en, fr, sw, rw}."""
        mapping = {
            'en': 'en', 'eng': 'en',
            'fr': 'fr', 'fra': 'fr', 'fre': 'fr',
            'sw': 'sw', 'swa': 'sw', 'swc': 'sw',
            'rw': 'rw', 'kin': 'rw',
        }
        return mapping.get(code, 'en')

    def _has_strong_kinyarwanda_tokens(self, text_lower: str) -> bool:
        """Check for strong Kinyarwanda indicators"""
        tokens = [
            'muraho', 'mwiriwe', 'mwaramutse', 'murakoze', 'ndumva',
            'ubwoba', 'umutwe', 'umereye', 'nabi', 'amahoro', 'ubuzima',
            'ndabizi', 'ntabwo', 'ndabishaka', 'ndabishimira', 'cyane', 'rwose'
        ]
        return any(t in text_lower for t in tokens)
    
    def _has_strong_french_tokens(self, text_lower: str) -> bool:
        """Check for strong French indicators"""
        tokens = [
            'bonjour', 'bonsoir', 'merci', 'comment', 'allez-vous', 'ça va',
            'je suis', 'je vais', 'je peux', 'très bien', 'très mal',
            'anxieux', 'déprimé', 'stressé', 'santé mentale', 'problème'
        ]
        return any(t in text_lower for t in tokens)
    
    def _has_strong_kiswahili_tokens(self, text_lower: str) -> bool:
        """Check for strong Kiswahili indicators"""
        tokens = [
            'hujambo', 'hamjambo', 'habari', 'asante', 'karibu', 'pole',
            'sijambo', 'hajambo', 'mimi', 'wewe', 'yeye', 'sisi', 'nyinyi',
            'nina', 'una', 'ana', 'tuna', 'mna', 'wana', 'shida', 'matatizo'
        ]
        return any(t in text_lower for t in tokens)
    
    def _is_common_greeting(self, text: str) -> bool:
        """Check if text is a common greeting that should default to English"""
        greetings = ['hello', 'hi', 'hey', 'good morning', 'good afternoon', 'good evening']
        return text.lower().strip() in greetings

    def translate_text(self, text: str, target_language: str) -> str:
        """
        Professional translation using GoogleTranslator exclusively.
        Translates text to target language with high accuracy and natural tone.
        
        Args:
            text: Text to translate
            target_language: Target language code ('en', 'fr', 'sw', 'rw')
            
        Returns:
            Translated text in target language
        """
        if not text or not text.strip():
            return text
            
        if target_language == 'en':
            return text
            
        try:
            # Normalize language code for GoogleTranslator
            target_code = self._normalize_language_code(target_language)
            
            # Translate using GoogleTranslator
            if self.translator:
                translated = GoogleTranslator(source='auto', target=target_code).translate(text)
                
                # Post-process based on target language
                if target_language == 'rw':
                    translated = self.normalize_kinyarwanda(translated)
                elif target_language == 'fr':
                    translated = self.normalize_french(translated)
                elif target_language == 'sw':
                    translated = self.normalize_kiswahili(translated)
                
                return translated
            else:
                return text
                
        except Exception as e:
            print(f"Translation error: {e}")
            return text
    
    def _normalize_language_code(self, lang: str) -> str:
        """Normalize language code to GoogleTranslator format"""
        mapping = {
            'en': 'en', 'english': 'en',
            'fr': 'fr', 'french': 'fr', 'français': 'fr',
            'sw': 'sw', 'kiswahili': 'sw', 'swahili': 'sw',
            'rw': 'rw', 'kinyarwanda': 'rw', 'kin': 'rw', 'ikinyarwanda': 'rw'
        }
        return mapping.get(lang.lower(), 'en')

    def normalize_kinyarwanda(self, text: str) -> str:
        """
        Post-process Kinyarwanda to remove mixed-language fragments and enforce
        consistent, professional terminology using a small domain glossary.
        """
        if not text:
            return text
        
        normalized = text
        # Remove common French connective phrases that sometimes leak in
        french_leak_patterns = [
            r"(?i)ligne d'assistance en santé mentale",
            r"(?i)pour|avec|sans|dans|sur|entre|car|donc|mais|ou",
        ]
        for pat in french_leak_patterns:
            normalized = re.sub(pat, "", normalized)

        # Apply glossary replacements
        for pat, repl in self.rw_glossary:
            normalized = re.sub(pat, repl, normalized)

        # Trim repetitive spaces and stray punctuation
        normalized = re.sub(r"\s+", " ", normalized).strip()
        normalized = re.sub(r"\s+,", ",", normalized)
        normalized = re.sub(r"\s+\.", ".", normalized)
        return normalized
    
    def normalize_french(self, text: str) -> str:
        """
        Clean up machine-translated French.

        Collapses immediate repetitions of a few words that the translator
        tends to duplicate ("je suis", "très", "de", "du", "des"), then tidies
        whitespace and spaces before ',' and '.'.

        The first occurrence is kept exactly as written, so a capitalised
        sentence start ("Je suis je suis ...") stays capitalised, and runs
        longer than two ("de de de") collapse fully.

        Args:
            text: Translated text.

        Returns:
            The normalised text; empty/None input is returned as-is.
        """
        if not text:
            return text

        # \1 under IGNORECASE also matches a differently-cased repeat; the
        # trailing \b stops "des descendants" from being treated as a repeat.
        repeated_word = r"\b(je suis|très|de|du|des)(?:\s+\1\b)+"
        normalized = re.sub(repeated_word, r"\1", text, flags=re.IGNORECASE)

        # Clean up spacing and punctuation
        normalized = re.sub(r"\s+", " ", normalized).strip()
        normalized = re.sub(r"\s+,", ",", normalized)
        normalized = re.sub(r"\s+\.", ".", normalized)

        return normalized
    
    def normalize_kiswahili(self, text: str) -> str:
        """
        Clean up machine-translated Kiswahili.

        Collapses immediate repetitions of the personal pronouns (mimi, wewe,
        yeye, sisi, nyinyi, wao), then tidies whitespace and spaces before ','
        and '.'.

        The first occurrence is kept exactly as written, so a capitalised
        sentence start ("Mimi mimi ...") stays capitalised, and runs longer
        than two collapse fully.

        Args:
            text: Translated text.

        Returns:
            The normalised text; empty/None input is returned as-is.
        """
        if not text:
            return text

        # \1 under IGNORECASE also matches a differently-cased repeat; the
        # trailing \b keeps longer words that merely start with a pronoun
        # from being swallowed.
        repeated_pronoun = r"\b(mimi|wewe|yeye|sisi|nyinyi|wao)(?:\s+\1\b)+"
        normalized = re.sub(repeated_pronoun, r"\1", text, flags=re.IGNORECASE)

        # Clean up spacing and punctuation
        normalized = re.sub(r"\s+", " ", normalized).strip()
        normalized = re.sub(r"\s+,", ",", normalized)
        normalized = re.sub(r"\s+\.", ".", normalized)

        return normalized

    def get_appropriate_response(self, english_response: str, user_language: str) -> str:
        """
        Get response in the user's detected language with improved reliability.
        This is the main method for ensuring single-language responses.
        """
        if user_language == 'en' or not user_language:
            return english_response
        
        try:
            return self.translate_text(english_response, user_language)
        except Exception as e:
            print(f"Translation failed: {e}")
            return english_response
    
    def process_user_message(self, user_message: str, english_response: str) -> str:
        """
        Main method for professional multilingual chatbot.
        
        Automatically detects the user's language from their message and responds
        exclusively in that same language. This is the primary interface method.
        
        Args:
            user_message: The user's input message
            english_response: The AI-generated response in English
            
        Returns:
            Response translated to the user's detected language
        """
        if not user_message or not english_response:
            return english_response
        
        # Detect language from user's message
        detected_language = self.detect_language(user_message)
        
        print(f"User message language detected: {detected_language}")
        print(f"User message: {user_message[:100]}...")

        return self.get_appropriate_response(english_response, detected_language)

    def get_multilingual_response(self, english_response: str, user_language: str) -> Dict[str, str]:
        responses = {'en': english_response}
        for lang in ['fr', 'sw', 'rw']:
            if lang != user_language:
                responses[lang] = self.translate_text(english_response, lang)
        return responses

    def get_language_name(self, lang_code: str) -> str:
        """Return the English display name for a language code ('English' if unknown)."""
        display_names = dict(en='English', fr='French', sw='Kiswahili', rw='Kinyarwanda')
        try:
            return display_names[lang_code]
        except KeyError:
            return 'English'
    
    def is_supported_language(self, lang_code: str) -> bool:
        return lang_code in self.supported_languages
    
    def get_supported_languages(self) -> List[str]:
        return self.supported_languages

# Global translation service instance, shared by translate_chatbot_response().
# It is created at import time, so TranslationService.__init__ runs as soon as
# this module is imported.
translation_service = TranslationService()

# Convenience function for easy integration
def translate_chatbot_response(user_message: str, english_response: str) -> str:
    """
    Translate a chatbot reply into the language of the user's message.

    Thin module-level entry point that forwards to the shared
    ``translation_service`` instance, so callers do not have to manage a
    TranslationService themselves.

    Args:
        user_message: The user's input message.
        english_response: The AI-generated response in English.

    Returns:
        Response translated to the user's detected language.
    """
    localized_reply = translation_service.process_user_message(
        user_message,
        english_response,
    )
    return localized_reply