# Page header captured with this file (not part of the program):
#   Spaces: prodevroger / ishingiro -- status: Sleeping
#   File size: 18,210 Bytes -- c024705

"""
Professional Multilingual Chatbot Translation Service
Supports English, French, Kiswahili, and Kinyarwanda

Features:
- Automatic language detection from user input
- Exclusively responds in the detected language
- Uses GoogleTranslator from deep_translator for accurate translation
- Maintains natural tone, accuracy, and clarity in all supported languages
"""
from typing import Dict, List, Optional, Tuple
from langdetect import detect, detect_langs, DetectorFactory
from deep_translator import GoogleTranslator
import re

# Optional language detectors; each name is set to None when its package
# cannot be imported.
try:
    import langid
    # Lightweight, fast language id
except Exception:  # pragma: no cover
    langid = None

try:
    import pycld3
    # Google Compact Language Detector v3
except Exception:  # pragma: no cover
    pycld3 = None

# Pin langdetect's seed once at import time so that repeated detection of the
# same text yields the same answer (module-level side effect).
DetectorFactory.seed = 0

class TranslationService:
    """Detect a user's language and translate English chatbot replies into it.

    Supported languages are English ('en'), French ('fr'), Kiswahili ('sw')
    and Kinyarwanda ('rw'). Detection tries keyword patterns first and falls
    back to ``langdetect``; translation goes through ``GoogleTranslator`` and
    is followed by a light per-language clean-up pass.
    """

    # Keyword patterns per language, matched against lower-cased input. Every
    # pattern that matches adds one point to its language's score. Key order
    # is the tie-break priority between the non-English languages.
    # NOTE(review): a few tokens are also ordinary English words ("we" in the
    # Kinyarwanda list; "mental", "car", "pour" in the French list), so English
    # input can score points for other languages - confirm whether to prune.
    _LANGUAGE_PATTERNS = {
        'rw': (
            r'\b(muraho|murakaza|murabe|murakoze|mwiriwe|mwaramutse|murakaza neza|muraho rwose|muraho neza)\b',
            r'\b(ndabizi|ntabwo|ndabishaka|ndabishimira|ndabishimye|ndabishimye cyane|ndumva)\b',
            r'\b(umunsi|umunsi mwiza|umunsi mubi|ejo|ejo hazaza|ejo hashize|uyu munsi)\b',
            r'\b(amahoro|amahoro yose|amahoro yanyu|amahoro yanjye)\b',
            r'\b(ubwoba|ubwoba bubabaje|ubwoba bunyuma|ubwoba bwinshi|umutwe|umereye|nabi)\b',
            r'\b(umutima|umutima wanjye|umutima wanyu|umutima wanjye)\b',
            r'\b(ubuzima|ubuzima bwiza|ubuzima bubi|ubuzima bwinshi)\b',
            r'\b(nshaka|ntabwo|ndabizi|ndabishimira|ndabishimye|ndumva|ndabishimye)\b',
            r'\b(jewe|wewe|we|jewe|twebwe|mwebwe|bo)\b',
            r'\b(murakoze|murakoze cyane|murakoze cane|murakoze rwose)\b',
            r"\b(ntabwo|ntabwo bimeze|ntabwo bimeze nk'uko)\b",
            r'\b(umutwe|umereye|nabi|ndumva|cyane|rwose|neza)\b',
        ),
        'fr': (
            r'\b(bonjour|bonsoir|salut|bonne journée|bonne soirée)\b',
            r'\b(merci|merci beaucoup|merci bien|de rien)\b',
            r'\b(comment allez-vous|comment ça va|ça va bien|ça va mal)\b',
            r'\b(je suis|je vais|je peux|je veux|je dois|je fais)\b',
            r'\b(très bien|très mal|pas mal|comme ci comme ça|ça va)\b',
            r'\b(anxieux|anxieuse|déprimé|déprimée|stressé|stressée)\b',
            r"\b(depuis|pendant|maintenant|hier|demain|aujourd'hui)\b",
            r'\b(problème|difficulté|souci|inquiétude|santé mentale)\b',
            r'\b(santé|mental|psychologique|émotionnel|psychologue)\b',
            r'\b(avec|sans|pour|dans|sur|sous|entre|parmi)\b',
            r'\b(et|ou|mais|donc|car|ni|puis)\b',
        ),
        'sw': (
            r'\b(hujambo|hamjambo|habari|habari yako|habari za asubuhi|habari za mchana)\b',
            r'\b(asante|asante sana|karibu|pole|pole sana|pole kwa ajili)\b',
            r'\b(sijambo|hajambo|hatujambo|hamjambo|hawajambo)\b',
            r'\b(mimi|wewe|yeye|sisi|nyinyi|wao)\b',
            r'\b(nina|una|ana|tuna|mna|wana|niko|uko|ako|tuko|mko|wako)\b',
            r'\b(shida|matatizo|huzuni|furaha|wasiwasi|msongo wa mawazo)\b',
            r'\b(afya ya akili|moyo|roho|hisia|mawazo)\b',
            r'\b(rafiki|mpenzi|mama|baba|mtoto|mzee|mke|mume)\b',
            r'\b(leo|jana|kesho|sasa|zamani|baadaye)\b',
            r'\b(naomba|tafadhali|samahani|pole|pole sana)\b',
        ),
        'en': (
            r'\b(hello|hi|hey|good morning|good afternoon|good evening)\b',
            r'\b(thank you|thanks|please|sorry|excuse me)\b',
            r"\b(i am|i'm|i have|i can|i will|i would)\b",
            r'\b(help|support|assistance|mental health|anxiety|depression)\b',
            r'\b(how are you|how do you|what is|where is|when is)\b',
        ),
    }

    # Whole-word/phrase indicators used by the _has_strong_*_tokens helpers.
    _STRONG_KINYARWANDA_TOKENS = (
        'muraho', 'mwiriwe', 'mwaramutse', 'murakoze', 'ndumva',
        'ubwoba', 'umutwe', 'umereye', 'nabi', 'amahoro', 'ubuzima',
        'ndabizi', 'ntabwo', 'ndabishaka', 'ndabishimira', 'cyane', 'rwose',
    )
    _STRONG_FRENCH_TOKENS = (
        'bonjour', 'bonsoir', 'merci', 'comment', 'allez-vous', 'ça va',
        'je suis', 'je vais', 'je peux', 'très bien', 'très mal',
        'anxieux', 'déprimé', 'stressé', 'santé mentale', 'problème',
    )
    _STRONG_KISWAHILI_TOKENS = (
        'hujambo', 'hamjambo', 'habari', 'asante', 'karibu', 'pole',
        'sijambo', 'hajambo', 'mimi', 'wewe', 'yeye', 'sisi', 'nyinyi',
        'nina', 'una', 'ana', 'tuna', 'mna', 'wana', 'shida', 'matatizo',
    )

    # French fragments stripped from Kinyarwanda output. The connective list is
    # anchored on word boundaries so that letter runs such as "car" or "sur"
    # inside other words ("CARAES", "gusura") are left alone.
    _FRENCH_LEAK_PATTERNS = (
        r"(?i)ligne d'assistance en santé mentale",
        r"(?i)\b(?:pour|avec|sans|dans|sur|entre|car|donc|mais|ou)\b",
    )

    # Doubled words/phrases collapsed after translation. The replacement reuses
    # the first occurrence so a sentence-initial capital is preserved.
    _FRENCH_DUPLICATE_FIXES = (
        (r'\b(je suis)\s+je suis\b', r'\1'),
        (r'\b(très)\s+très\b', r'\1'),
        (r'\b(de)\s+de\b', r'\1'),
        (r'\b(du)\s+du\b', r'\1'),
        (r'\b(des)\s+des\b', r'\1'),
    )
    _KISWAHILI_DUPLICATE_FIXES = (
        (r'\b(mimi)\s+mimi\b', r'\1'),
        (r'\b(wewe)\s+wewe\b', r'\1'),
        (r'\b(yeye)\s+yeye\b', r'\1'),
        (r'\b(sisi)\s+sisi\b', r'\1'),
        (r'\b(nyinyi)\s+nyinyi\b', r'\1'),
        (r'\b(wao)\s+wao\b', r'\1'),
    )

    def __init__(self):
        """Set up the translator availability flag, language tables and glossary."""
        # This instance is only used as an "is translation available" flag;
        # translate_text builds a fresh GoogleTranslator for every request.
        try:
            self.translator = GoogleTranslator()
        except Exception as e:
            print(f"Warning: Failed to initialize GoogleTranslator: {e}")
            self.translator = None

        # Language name -> code for the supported languages
        self.language_codes = {
            'kinyarwanda': 'rw',
            'french': 'fr',
            'kiswahili': 'sw',
            'english': 'en'
        }

        # Codes that detect_language may return
        self.supported_languages = ['en', 'fr', 'sw', 'rw']

        # Domain glossary for consistent Kinyarwanda phrasing.
        # (regex, replacement) pairs applied in order by normalize_kinyarwanda;
        # longer phrases come before the shorter phrases they contain. The last
        # two entries delete known unwanted fragments.
        self.rw_glossary = [
            (r"(?i)mental health hotline\s*:?\s*105", "Umurongo wa telefone w'ubufasha mu by'ubuzima bwo mu mutwe: 105"),
            (r"(?i)ligne d'assistance en santé mentale\s*:?\s*105", "Umurongo wa telefone w'ubufasha mu by'ubuzima bwo mu mutwe: 105"),
            (r"(?i)call\s*112", "Hamagara 112 mu gihe cy'ibyago byihutirwa"),
            (r"(?i)emergency", "ibyago byihutirwa"),
            (r"(?i)caraes\s*ndera\s*hospital", "CARAES Ndera"),
            (r"(?i)hdi\s*rwanda\s*counseling", "HDI Rwanda (Inama n'Ubujyanama)"),
            (r"(?i)arct\s*ruhuka", "ARCT Ruhuka"),
            (r"(?i)mental health", "ubuzima bwo mu mutwe"),
            (r"(?i)anxiety", "impungenge"),
            (r"(?i)depression", "agahinda kenshi"),
            (r"(?i)stress", "umunaniro w'ubwonko"),
            (r"(?i)coping strategies", "uburyo bwo kwifasha"),
            (r"(?i)ku bihano[,\s]*", ""),
            (r"(?i)komeza amajwi make ariko akunze", ""),
        ]

    # ------------------------------------------------------------------
    # Language detection
    # ------------------------------------------------------------------

    def detect_language(self, text: str) -> str:
        """Return the language code of *text*: 'en', 'fr', 'sw' or 'rw'.

        Keyword patterns are tried first; when they give no clear answer the
        text is passed to ``langdetect.detect`` and the result is mapped onto
        the supported set.

        Args:
            text: The user's message.

        Returns:
            A supported language code. Falls back to 'en' for empty or
            near-empty input, unsupported results, and any detection error.
        """
        if not text or not text.strip():
            return 'en'

        # Punctuation-only or single-character input carries no usable signal.
        cleaned_text = re.sub(r'[^\w\s]', '', text.strip().lower())
        if len(cleaned_text) < 2:
            return 'en'

        try:
            pattern_lang = self._detect_by_patterns(text)
            if pattern_lang:
                return pattern_lang

            mapped = self._map_code(detect(text))
            return mapped if mapped in self.supported_languages else 'en'
        except Exception as e:
            # Best effort: detection problems must never break the chat flow.
            print(f"Language detection error: {e}")
            return 'en'

    def _detect_by_patterns(self, text: str) -> Optional[str]:
        """Score *text* against the keyword patterns of each language.

        Returns:
            The code of the highest-scoring language, or ``None`` when nothing
            matches or when English ties for the top score. In the tie case
            the caller falls through to the statistical detector instead of
            letting English lose by dictionary order.
        """
        text_lower = text.lower().strip()

        scores = {
            lang: sum(1 for pattern in patterns if re.search(pattern, text_lower))
            for lang, patterns in self._LANGUAGE_PATTERNS.items()
        }

        best = max(scores.values())
        if best == 0:
            return None

        leaders = [lang for lang, score in scores.items() if score == best]
        if len(leaders) > 1 and 'en' in leaders:
            return None

        # Remaining ties follow the priority order of _LANGUAGE_PATTERNS.
        return leaders[0]

    def _map_code(self, code: str) -> str:
        """Map various detector codes into our set {en, fr, sw, rw}.

        Matching ignores case and any region suffix ('fr-CA', 'sw_TZ').
        Unknown or non-string codes map to 'en'.
        """
        mapping = {
            'en': 'en', 'eng': 'en',
            'fr': 'fr', 'fra': 'fr', 'fre': 'fr',
            'sw': 'sw', 'swa': 'sw', 'swc': 'sw',
            'rw': 'rw', 'kin': 'rw',
        }
        if not isinstance(code, str):
            return 'en'
        base = code.strip().lower().replace('_', '-').split('-')[0]
        return mapping.get(base, 'en')

    @staticmethod
    def _contains_any_token(text_lower: str, tokens) -> bool:
        """Return True if any token occurs in *text_lower* as a whole word/phrase."""
        return any(
            re.search(r'(?<!\w)' + re.escape(token) + r'(?!\w)', text_lower)
            for token in tokens
        )

    def _has_strong_kinyarwanda_tokens(self, text_lower: str) -> bool:
        """Check for strong Kinyarwanda indicators (whole-word match)."""
        return self._contains_any_token(text_lower, self._STRONG_KINYARWANDA_TOKENS)

    def _has_strong_french_tokens(self, text_lower: str) -> bool:
        """Check for strong French indicators (whole-word match)."""
        return self._contains_any_token(text_lower, self._STRONG_FRENCH_TOKENS)

    def _has_strong_kiswahili_tokens(self, text_lower: str) -> bool:
        """Check for strong Kiswahili indicators (whole-word match)."""
        return self._contains_any_token(text_lower, self._STRONG_KISWAHILI_TOKENS)

    def _is_common_greeting(self, text: str) -> bool:
        """Check if text is a common greeting that should default to English"""
        greetings = ['hello', 'hi', 'hey', 'good morning', 'good afternoon', 'good evening']
        return text.lower().strip() in greetings

    # ------------------------------------------------------------------
    # Translation
    # ------------------------------------------------------------------

    def translate_text(self, text: str, target_language: str) -> str:
        """Translate *text* into *target_language* with GoogleTranslator.

        Args:
            text: Text to translate.
            target_language: Target language code ('en', 'fr', 'sw', 'rw') or
                one of the names accepted by ``_normalize_language_code``.

        Returns:
            The translated, post-processed text. The input is returned
            unchanged when it is blank, when the target resolves to English
            (including unknown targets), when no translator is available, or
            when translation fails or yields nothing.
        """
        if not text or not text.strip():
            return text

        # Resolve names/aliases first so every later check uses the same code.
        target_code = self._normalize_language_code(target_language)
        if target_code == 'en':
            return text

        if not self.translator:
            return text

        post_processors = {
            'rw': self.normalize_kinyarwanda,
            'fr': self.normalize_french,
            'sw': self.normalize_kiswahili,
        }

        try:
            translated = GoogleTranslator(source='auto', target=target_code).translate(text)

            # Never hand an empty/None result back to the caller.
            if not translated or not translated.strip():
                return text

            post_process = post_processors.get(target_code)
            return post_process(translated) if post_process else translated
        except Exception as e:
            # Best effort: fall back to the untranslated text.
            print(f"Translation error: {e}")
            return text

    def _normalize_language_code(self, lang: str) -> str:
        """Normalize a language code or name to GoogleTranslator format.

        Matching ignores case and surrounding whitespace; ``None``, empty and
        unknown values resolve to 'en'.
        """
        mapping = {
            'en': 'en', 'english': 'en',
            'fr': 'fr', 'french': 'fr', 'français': 'fr',
            'sw': 'sw', 'kiswahili': 'sw', 'swahili': 'sw',
            'rw': 'rw', 'kinyarwanda': 'rw', 'kin': 'rw', 'ikinyarwanda': 'rw'
        }
        if not isinstance(lang, str):
            return 'en'
        return mapping.get(lang.strip().lower(), 'en')

    # ------------------------------------------------------------------
    # Post-processing
    # ------------------------------------------------------------------

    @staticmethod
    def _tidy_spacing(text: str) -> str:
        """Collapse whitespace runs and remove spaces before ',' and '.'."""
        tidied = re.sub(r"\s+", " ", text).strip()
        tidied = re.sub(r"\s+,", ",", tidied)
        tidied = re.sub(r"\s+\.", ".", tidied)
        return tidied

    def normalize_kinyarwanda(self, text: str) -> str:
        """
        Post-process Kinyarwanda to remove mixed-language fragments and enforce
        consistent terminology using the domain glossary.

        The glossary runs first so that its French hotline entry can still
        match; leftover French fragments are stripped afterwards.
        """
        if not text:
            return text

        normalized = text
        for pat, repl in self.rw_glossary:
            normalized = re.sub(pat, repl, normalized)

        for pat in self._FRENCH_LEAK_PATTERNS:
            normalized = re.sub(pat, "", normalized)

        return self._tidy_spacing(normalized)

    def normalize_french(self, text: str) -> str:
        """Post-process French text: collapse doubled words and tidy spacing."""
        if not text:
            return text

        normalized = text
        for pattern, replacement in self._FRENCH_DUPLICATE_FIXES:
            normalized = re.sub(pattern, replacement, normalized, flags=re.IGNORECASE)

        return self._tidy_spacing(normalized)

    def normalize_kiswahili(self, text: str) -> str:
        """Post-process Kiswahili text: collapse doubled pronouns and tidy spacing."""
        if not text:
            return text

        normalized = text
        for pattern, replacement in self._KISWAHILI_DUPLICATE_FIXES:
            normalized = re.sub(pattern, replacement, normalized, flags=re.IGNORECASE)

        return self._tidy_spacing(normalized)

    # ------------------------------------------------------------------
    # Public helpers
    # ------------------------------------------------------------------

    def get_appropriate_response(self, english_response: str, user_language: str) -> str:
        """Return *english_response* in *user_language*.

        The English text is returned as-is when the language is 'en' or empty,
        or when translation raises.
        """
        if user_language == 'en' or not user_language:
            return english_response

        try:
            return self.translate_text(english_response, user_language)
        except Exception as e:
            print(f"Translation failed: {e}")
            return english_response

    def process_user_message(self, user_message: str, english_response: str) -> str:
        """
        Detect the language of *user_message* and answer in that language.

        Args:
            user_message: The user's input message
            english_response: The AI-generated response in English

        Returns:
            The response translated to the detected language;
            ``english_response`` unchanged when either argument is empty.
        """
        if not user_message or not english_response:
            return english_response

        detected_language = self.detect_language(user_message)

        # NOTE(review): this writes the first 100 characters of the user's
        # message to stdout - confirm that is acceptable for this deployment.
        print(f"User message language detected: {detected_language}")
        print(f"User message: {user_message[:100]}...")

        return self.get_appropriate_response(english_response, detected_language)

    def get_multilingual_response(self, english_response: str, user_language: str) -> Dict[str, str]:
        """Return the English response plus translations keyed by language code.

        'en' is always present. Of 'fr', 'sw' and 'rw', every language except
        *user_language* is translated and included.
        NOTE(review): the user's own language is skipped here - presumably the
        caller already holds that translation; confirm against callers.
        """
        responses = {'en': english_response}
        for lang in ['fr', 'sw', 'rw']:
            if lang != user_language:
                responses[lang] = self.translate_text(english_response, lang)
        return responses

    def get_language_name(self, lang_code: str) -> str:
        """Return the English display name for *lang_code* ('English' if unknown)."""
        names = {'en': 'English', 'fr': 'French', 'sw': 'Kiswahili', 'rw': 'Kinyarwanda'}
        return names.get(lang_code, 'English')

    def is_supported_language(self, lang_code: str) -> bool:
        """Return True if *lang_code* is one of the supported language codes."""
        return lang_code in self.supported_languages

    def get_supported_languages(self) -> List[str]:
        """Return a copy of the supported language codes.

        A copy is returned so callers cannot mutate the service's own list.
        """
        return list(self.supported_languages)

# Global translation service instance, created once at import time and used
# by translate_chatbot_response below. Constructing it runs
# TranslationService.__init__, which attempts to build a GoogleTranslator.
translation_service = TranslationService()

# Convenience function for easy integration
def translate_chatbot_response(user_message: str, english_response: str) -> str:
    """
    Answer a user in the language they wrote in.

    Thin module-level entry point: it forwards both arguments to the shared
    ``translation_service`` instance and returns whatever
    ``process_user_message`` produces.

    Args:
        user_message: The user's input message
        english_response: The AI-generated response in English

    Returns:
        Response translated to the user's detected language
    """
    localized_reply = translation_service.process_user_message(
        user_message,
        english_response,
    )
    return localized_reply