Spaces:
Running
Running
| """ | |
| Language Detection | |
| Detects the language of user queries to handle multilingual input correctly. | |
| Uses lightweight pattern-based detection with langdetect fallback. | |
| """ | |
| import logging | |
| import re | |
| from typing import Optional, Dict, Any | |
| from dataclasses import dataclass | |
| logger = logging.getLogger(__name__) | |
@dataclass
class LanguageDetection:
    """Result of one language-detection call.

    Instances are created with keyword arguments and rebuilt from cached
    dictionaries via ``LanguageDetection(**payload)``, so the generated
    dataclass ``__init__`` is required.
    """

    language: str  # ISO 639-1 code (en, ar, am, so, sw, fr)
    confidence: float  # 0.0 to 1.0
    # Stage that produced the result: "script", "pattern", "langdetect",
    # "langdetect_fallback" or "default"
    method: str
class LanguageDetector:
    """
    Detect the language of user queries.

    Strategy (the first stage that yields a result wins):
    1. Script-based detection (Arabic, Ethiopic) - fast, no dependencies
    2. Keyword patterns (Somali, Swahili, French)
    3. langdetect library, when installed - good for Latin scripts
    4. Default to English - safe fallback

    Results are stored in the optional cache adapter, which must provide
    ``get(key)`` and ``set(key, value, expiration=...)``.
    """

    # Unicode ranges for script detection
    ARABIC_RANGE = (0x0600, 0x06FF)  # Arabic script
    AMHARIC_RANGE = (0x1200, 0x137F)  # Ethiopic script

    # Common words for pattern matching
    # NOTE(review): "habari" is listed for both Somali and Swahili - confirm
    # it is really intended as a Somali keyword.
    LANGUAGE_PATTERNS = {
        "so": ["wararka", "habari", "sheeko", "waa", "iyo"],  # Somali
        "sw": ["habari", "leo", "jana", "wiki", "mwezi"],  # Swahili
        "fr": ["nouvelles", "aujourd'hui", "hier", "semaine", "mois"],  # French
    }

    # Language codes accepted from langdetect; anything else maps to English
    SUPPORTED_LANGUAGES = frozenset({"en", "ar", "am", "so", "sw", "fr"})

    # Lifetime of a cached detection, in seconds (1 hour)
    CACHE_TTL_SECONDS = 3600

    def __init__(self, cache=None):
        """
        Initialize language detector.

        Args:
            cache: Cache adapter for storing detections, or None to disable
                caching
        """
        self.cache = cache
        self._langdetect_available = False
        self._try_import_langdetect()

    def _try_import_langdetect(self):
        """Probe for the optional langdetect library and record the outcome."""
        try:
            import langdetect
        except ImportError:
            logger.warning("langdetect not installed, using pattern-based detection only")
            return
        # langdetect is non-deterministic by default; pinning the seed makes
        # repeated detections of the same text agree. This is a process-wide
        # langdetect setting.
        langdetect.DetectorFactory.seed = 0
        self._langdetect_available = True
        logger.info("langdetect library available")

    def detect(self, text: str) -> "LanguageDetection":
        """
        Detect language of text.

        Args:
            text: Text to detect language for

        Returns:
            LanguageDetection with language code, confidence and the name of
            the stage that produced it. Empty or whitespace-only text yields
            the English default and is not cached.
        """
        if not text or not text.strip():
            return LanguageDetection(language="en", confidence=0.5, method="default")

        # Check cache first. "is not None" so that a cache adapter which
        # happens to be falsy (e.g. empty and defining __len__) is still used.
        if self.cache is not None:
            cached = self.cache.get(self._cache_key(text))
            if cached:
                logger.debug("Language detection cache hit: %s", text[:50])
                return LanguageDetection(**cached)

        # Step 1: Script-based detection
        result = self._detect_by_script(text)
        # Step 2: Pattern-based detection
        if result is None:
            result = self._detect_by_patterns(text)
        # Step 3: langdetect library
        if result is None and self._langdetect_available:
            result = self._detect_with_langdetect(text)
        # Step 4: Default to English
        if result is None:
            result = LanguageDetection(language="en", confidence=0.5, method="default")

        self._cache_result(text, result)
        return result

    def _detect_by_script(self, text: str) -> Optional["LanguageDetection"]:
        """
        Detect language by Unicode script.

        Counts alphabetic characters and reports "ar" or "am" when more than
        half of them fall in ARABIC_RANGE or AMHARIC_RANGE respectively.
        Note that this identifies the script, not the language: other
        languages written in the same script get the same answer.

        Returns:
            LanguageDetection with method "script", or None when neither
            script dominates or the text has no letters.
        """
        arabic_lo, arabic_hi = self.ARABIC_RANGE
        ethiopic_lo, ethiopic_hi = self.AMHARIC_RANGE

        arabic_count = 0
        amharic_count = 0
        other_letters = 0
        for char in text:
            code = ord(char)
            if arabic_lo <= code <= arabic_hi:
                arabic_count += 1
            elif ethiopic_lo <= code <= ethiopic_hi:
                amharic_count += 1
            elif char.isalpha():
                other_letters += 1

        total_chars = arabic_count + amharic_count + other_letters
        if total_chars == 0:
            return None

        # If >50% Arabic script -> Arabic
        if arabic_count / total_chars > 0.5:
            return LanguageDetection(language="ar", confidence=1.0, method="script")

        # If >50% Ethiopic script -> Amharic
        if amharic_count / total_chars > 0.5:
            return LanguageDetection(language="am", confidence=1.0, method="script")

        return None

    def _detect_by_patterns(self, text: str) -> Optional["LanguageDetection"]:
        """
        Detect language by common word patterns (Somali, Swahili, French).

        A keyword only counts when it appears as a whole word, so "wiki" does
        not match "wikipedia" and "hier" does not match "hierarchy". The
        language with the most distinct keyword hits wins (ties go to the
        language declared first) and needs at least two hits.

        Returns:
            LanguageDetection with method "pattern", or None.
        """
        # Treat the typographic apostrophe like the ASCII one so that
        # "aujourd'hui" matches either spelling.
        normalized = text.lower().replace("\u2019", "'")

        best_lang = None
        best_matches = 0
        for lang, patterns in self.LANGUAGE_PATTERNS.items():
            matches = sum(
                1
                for pattern in patterns
                if re.search(rf"(?<!\w){re.escape(pattern)}(?!\w)", normalized)
            )
            if matches > best_matches:
                best_lang, best_matches = lang, matches

        if best_matches >= 2:  # At least 2 pattern matches
            return LanguageDetection(language=best_lang, confidence=0.8, method="pattern")
        return None

    def _detect_with_langdetect(self, text: str) -> Optional["LanguageDetection"]:
        """
        Detect language using langdetect library.

        Returns:
            LanguageDetection with method "langdetect" when the detected code
            is in SUPPORTED_LANGUAGES, an English result with method
            "langdetect_fallback" when it is not, or None when langdetect
            cannot be imported or raises.
        """
        try:
            import langdetect

            detected = langdetect.detect(text)
        except Exception as e:
            # Deliberately best-effort: langdetect raises on text without
            # usable features; the caller then falls back to the default.
            logger.debug("langdetect failed: %s", e)
            return None

        if detected in self.SUPPORTED_LANGUAGES:
            return LanguageDetection(language=detected, confidence=0.85, method="langdetect")

        # If detected language not in our supported set, default to English
        return LanguageDetection(language="en", confidence=0.6, method="langdetect_fallback")

    @staticmethod
    def _cache_key(text: str) -> str:
        """Build the cache key: the lower-cased first 100 characters of text.

        Texts that share their first 100 characters therefore share one
        cache entry.
        """
        return f"lang_detect:{text[:100].lower()}"

    def _cache_result(self, text: str, result: "LanguageDetection"):
        """Store result in the cache as a plain dict; no-op without a cache."""
        if self.cache is None:
            return
        self.cache.set(
            self._cache_key(text),
            {
                "language": result.language,
                "confidence": result.confidence,
                "method": result.method,
            },
            expiration=self.CACHE_TTL_SECONDS,
        )
# ---------------------------------------------------------------------------
# SINGLETON INSTANCE
# ---------------------------------------------------------------------------
# Process-wide detector; stays None until initialize_language_detector() runs.
language_detector: Optional[LanguageDetector] = None


def initialize_language_detector(cache=None) -> LanguageDetector:
    """Create the global language detector instance and return it.

    Rebinds the module-level ``language_detector``. Code that ran
    ``from <module> import language_detector`` before this call keeps the old
    value (None), so prefer the returned instance or access the attribute
    through the module.

    Args:
        cache: Cache adapter passed through to LanguageDetector, or None

    Returns:
        The newly created LanguageDetector.
    """
    global language_detector
    language_detector = LanguageDetector(cache)
    logger.info("Language detector initialized")
    return language_detector