""" models/anomaly-detection/src/utils/language_detector.py Language detection using FastText or lingua-py for Sinhala/Tamil/English """ import os import logging from typing import Tuple, Optional from pathlib import Path import re logger = logging.getLogger("language_detector") # Try FastText first, fallback to lingua try: import fasttext fasttext.FastText.eprint = lambda x: None # Suppress warnings FASTTEXT_AVAILABLE = True except ImportError: FASTTEXT_AVAILABLE = False logger.warning("FastText not available. Install with: pip install fasttext") try: from lingua import Language, LanguageDetectorBuilder LINGUA_AVAILABLE = True except ImportError: LINGUA_AVAILABLE = False logger.warning("Lingua not available. Install with: pip install lingua-language-detector") class LanguageDetector: """ Multilingual language detector supporting Sinhala, Tamil, and English. Uses FastText as primary detector with lingua fallback. """ # Language code mapping LANG_MAP = { "en": "english", "si": "sinhala", "ta": "tamil", "__label__en": "english", "__label__si": "sinhala", "__label__ta": "tamil", "ENGLISH": "english", "SINHALA": "sinhala", "TAMIL": "tamil" } # Unicode ranges for script detection SINHALA_RANGE = (0x0D80, 0x0DFF) TAMIL_RANGE = (0x0B80, 0x0BFF) def __init__(self, models_cache_dir: Optional[str] = None): """ Initialize language detector. Args: models_cache_dir: Directory for cached FastText models """ self.models_cache_dir = models_cache_dir or str( Path(__file__).parent.parent.parent / "models_cache" ) Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True) self.fasttext_model = None self.lingua_detector = None self._init_detectors() def _init_detectors(self): """Initialize detection models""" # Try FastText if FASTTEXT_AVAILABLE: model_path = Path(self.models_cache_dir) / "lid.176.bin" if model_path.exists(): try: self.fasttext_model = fasttext.load_model(str(model_path)) logger.info(f"[LanguageDetector] Loaded FastText model from {model_path}") except Exception as e: logger.warning(f"[LanguageDetector] Failed to load FastText: {e}") else: logger.warning(f"[LanguageDetector] FastText model not found at {model_path}") logger.info("Download from: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin") # Initialize lingua as fallback if LINGUA_AVAILABLE: try: self.lingua_detector = LanguageDetectorBuilder.from_languages( Language.ENGLISH, Language.TAMIL, # Note: Lingua may not have Sinhala, we'll use script detection ).build() logger.info("[LanguageDetector] Initialized Lingua detector") except Exception as e: logger.warning(f"[LanguageDetector] Failed to init Lingua: {e}") def _detect_by_script(self, text: str) -> Optional[str]: """ Detect language by Unicode script analysis. More reliable for Sinhala/Tamil which have distinct scripts. """ sinhala_count = 0 tamil_count = 0 latin_count = 0 for char in text: code = ord(char) if self.SINHALA_RANGE[0] <= code <= self.SINHALA_RANGE[1]: sinhala_count += 1 elif self.TAMIL_RANGE[0] <= code <= self.TAMIL_RANGE[1]: tamil_count += 1 elif char.isalpha() and code < 128: latin_count += 1 total_alpha = sinhala_count + tamil_count + latin_count if total_alpha == 0: return None # Threshold-based detection if sinhala_count / total_alpha > 0.3: return "sinhala" if tamil_count / total_alpha > 0.3: return "tamil" if latin_count / total_alpha > 0.5: return "english" return None def detect(self, text: str) -> Tuple[str, float]: """ Detect language of text. Args: text: Input text Returns: Tuple of (language_code, confidence) language_code: 'english', 'sinhala', 'tamil', or 'unknown' """ if not text or len(text.strip()) < 3: return "unknown", 0.0 # Clean text clean_text = re.sub(r'http\S+|@\w+|#\w+', '', text) clean_text = clean_text.strip() if not clean_text: return "unknown", 0.0 # 1. First try script detection (most reliable for Sinhala/Tamil) script_lang = self._detect_by_script(clean_text) if script_lang in ["sinhala", "tamil"]: return script_lang, 0.95 # 2. Try FastText if self.fasttext_model: try: predictions = self.fasttext_model.predict(clean_text.replace("\n", " ")) label = predictions[0][0] confidence = predictions[1][0] lang = self.LANG_MAP.get(label, "unknown") if lang != "unknown" and confidence > 0.5: return lang, float(confidence) except Exception as e: logger.debug(f"FastText error: {e}") # 3. Try Lingua if self.lingua_detector: try: detected = self.lingua_detector.detect_language_of(clean_text) if detected: lang = self.LANG_MAP.get(detected.name, "unknown") # Lingua doesn't return confidence, estimate based on text confidence = 0.8 if len(clean_text) > 20 else 0.6 return lang, confidence except Exception as e: logger.debug(f"Lingua error: {e}") # 4. Fallback to script detection result or default if script_lang == "english": return "english", 0.7 return "english", 0.5 # Default to English # Singleton instance _detector: Optional[LanguageDetector] = None def get_detector(models_cache_dir: Optional[str] = None) -> LanguageDetector: """Get or create singleton detector instance""" global _detector if _detector is None: _detector = LanguageDetector(models_cache_dir) return _detector def detect_language(text: str) -> Tuple[str, float]: """ Convenience function for language detection. Args: text: Input text Returns: Tuple of (language: str, confidence: float) """ return get_detector().detect(text)