""" Language Detection Detects the language of user queries to handle multilingual input correctly. Uses lightweight pattern-based detection with langdetect fallback. """ import logging import re from typing import Optional, Dict, Any from dataclasses import dataclass logger = logging.getLogger(__name__) @dataclass class LanguageDetection: """Language detection result""" language: str # ISO 639-1 code (en, ar, am, so, sw, fr) confidence: float # 0.0 to 1.0 method: str # "script", "langdetect", "default" class LanguageDetector: """ Detect language of user queries. Strategy: 1. Script-based detection (Arabic, Amharic) - Fast, 100% accurate 2. langdetect library - Good for Latin scripts 3. Default to English - Safe fallback """ # Unicode ranges for script detection ARABIC_RANGE = (0x0600, 0x06FF) # Arabic script AMHARIC_RANGE = (0x1200, 0x137F) # Ethiopic script # Common words for pattern matching LANGUAGE_PATTERNS = { "so": ["wararka", "habari", "sheeko", "waa", "iyo"], # Somali "sw": ["habari", "leo", "jana", "wiki", "mwezi"], # Swahili "fr": ["nouvelles", "aujourd'hui", "hier", "semaine", "mois"], # French } def __init__(self, cache=None): """ Initialize language detector. Args: cache: Cache adapter for storing detections """ self.cache = cache self._langdetect_available = False self._try_import_langdetect() def _try_import_langdetect(self): """Try to import langdetect library""" try: import langdetect self._langdetect_available = True logger.info("✅ langdetect library available") except ImportError: logger.warning("langdetect not installed, using pattern-based detection only") def detect(self, text: str) -> LanguageDetection: """ Detect language of text. Args: text: Text to detect language for Returns: LanguageDetection with language code and confidence """ if not text or not text.strip(): return LanguageDetection( language="en", confidence=0.5, method="default" ) # Check cache first if self.cache: cache_key = f"lang_detect:{text[:100].lower()}" cached = self.cache.get(cache_key) if cached: logger.debug(f"Language detection cache hit: {text[:50]}") return LanguageDetection(**cached) # Step 1: Script-based detection (fast and accurate) script_result = self._detect_by_script(text) if script_result: self._cache_result(text, script_result) return script_result # Step 2: Pattern-based detection pattern_result = self._detect_by_patterns(text) if pattern_result: self._cache_result(text, pattern_result) return pattern_result # Step 3: langdetect library if self._langdetect_available: langdetect_result = self._detect_with_langdetect(text) if langdetect_result: self._cache_result(text, langdetect_result) return langdetect_result # Step 4: Default to English default_result = LanguageDetection( language="en", confidence=0.5, method="default" ) self._cache_result(text, default_result) return default_result def _detect_by_script(self, text: str) -> Optional[LanguageDetection]: """ Detect language by Unicode script. Very fast and 100% accurate for Arabic and Amharic. """ # Count characters in each script arabic_count = 0 amharic_count = 0 total_chars = 0 for char in text: code = ord(char) if self.ARABIC_RANGE[0] <= code <= self.ARABIC_RANGE[1]: arabic_count += 1 total_chars += 1 elif self.AMHARIC_RANGE[0] <= code <= self.AMHARIC_RANGE[1]: amharic_count += 1 total_chars += 1 elif char.isalpha(): total_chars += 1 if total_chars == 0: return None # If >50% Arabic script → Arabic if arabic_count / total_chars > 0.5: return LanguageDetection( language="ar", confidence=1.0, method="script" ) # If >50% Amharic script → Amharic if amharic_count / total_chars > 0.5: return LanguageDetection( language="am", confidence=1.0, method="script" ) return None def _detect_by_patterns(self, text: str) -> Optional[LanguageDetection]: """ Detect language by common word patterns. Good for Somali, Swahili, French. """ text_lower = text.lower() for lang, patterns in self.LANGUAGE_PATTERNS.items(): matches = sum(1 for pattern in patterns if pattern in text_lower) if matches >= 2: # At least 2 pattern matches return LanguageDetection( language=lang, confidence=0.8, method="pattern" ) return None def _detect_with_langdetect(self, text: str) -> Optional[LanguageDetection]: """ Detect language using langdetect library. Good for Latin-script languages (English, French, Somali, Swahili). """ try: import langdetect # langdetect can be inconsistent, so we detect multiple times detected = langdetect.detect(text) # Map langdetect codes to our supported languages lang_map = { "en": "en", "ar": "ar", "am": "am", "so": "so", "sw": "sw", "fr": "fr", } if detected in lang_map: return LanguageDetection( language=lang_map[detected], confidence=0.85, method="langdetect" ) # If detected language not in our supported set, default to English return LanguageDetection( language="en", confidence=0.6, method="langdetect_fallback" ) except Exception as e: logger.debug(f"langdetect failed: {e}") return None def _cache_result(self, text: str, result: LanguageDetection): """Cache detection result""" if self.cache: cache_key = f"lang_detect:{text[:100].lower()}" self.cache.set( cache_key, { "language": result.language, "confidence": result.confidence, "method": result.method }, expiration=3600 # 1 hour ) # ═══════════════════════════════════════════════════════════════════════════ # SINGLETON INSTANCE # ═══════════════════════════════════════════════════════════════════════════ language_detector: Optional[LanguageDetector] = None def initialize_language_detector(cache=None): """Initialize global language detector instance""" global language_detector language_detector = LanguageDetector(cache) logger.info("Language detector initialized")