|
|
""" |
|
|
models/anomaly-detection/src/utils/language_detector.py |
|
|
Language detection using FastText or lingua-py for Sinhala/Tamil/English |
|
|
""" |
|
|
import os |
|
|
import logging |
|
|
from typing import Tuple, Optional |
|
|
from pathlib import Path |
|
|
import re |
|
|
|
|
|
# Module-wide logger. The name is a fixed string, not derived from __name__.
logger = logging.getLogger("language_detector")


# --- Optional back-end: FastText -------------------------------------------
# The import is attempted once at module load; FASTTEXT_AVAILABLE records the
# outcome so the rest of the module can branch on it.
try:
    import fasttext
except ImportError:
    FASTTEXT_AVAILABLE = False
    logger.warning("FastText not available. Install with: pip install fasttext")
else:
    # Swap FastText's `eprint` hook for a no-op (presumably to silence the
    # notices it would otherwise print -- confirm against the fasttext source).
    fasttext.FastText.eprint = lambda x: None
    FASTTEXT_AVAILABLE = True


# --- Optional back-end: Lingua ---------------------------------------------
# Same pattern: LINGUA_AVAILABLE tells the detector whether it may use Lingua.
try:
    from lingua import Language, LanguageDetectorBuilder
except ImportError:
    LINGUA_AVAILABLE = False
    logger.warning("Lingua not available. Install with: pip install lingua-language-detector")
else:
    LINGUA_AVAILABLE = True
|
|
|
|
|
|
|
|
class LanguageDetector:
    """
    Multilingual language detector supporting Sinhala, Tamil, and English.

    ``detect`` walks through up to four stages and returns at the first one
    that yields an answer:

    1. Unicode script analysis. Sinhala and Tamil each occupy their own
       Unicode block, so counting code points identifies them without a model.
    2. FastText (``lid.176.bin``), if the package imported and the model file
       was found in the cache directory.
    3. Lingua, if the package imported.
    4. A fixed English fallback.
    """

    # Canonical language names keyed by every label spelling handled here:
    # bare ISO codes, FastText "__label__xx" labels and upper-case enum names
    # (looked up via ``detected.name`` for Lingua results).
    LANG_MAP = {
        "en": "english",
        "si": "sinhala",
        "ta": "tamil",
        "__label__en": "english",
        "__label__si": "sinhala",
        "__label__ta": "tamil",
        "ENGLISH": "english",
        "SINHALA": "sinhala",
        "TAMIL": "tamil",
    }

    # Inclusive code-point bounds used to recognise each script.
    SINHALA_RANGE = (0x0D80, 0x0DFF)
    TAMIL_RANGE = (0x0B80, 0x0BFF)

    # Minimum share of the counted letters a script needs to be reported.
    _SCRIPT_MIN_SHARE = 0.3
    _LATIN_MIN_SHARE = 0.5

    # URLs, @mentions and #hashtags are removed before any detection runs.
    _NOISE_PATTERN = re.compile(r'http\S+|@\w+|#\w+')

    def __init__(self, models_cache_dir: Optional[str] = None):
        """
        Initialize language detector.

        Args:
            models_cache_dir: Directory for cached FastText models. When
                omitted (or empty) it defaults to
                ``<this file's directory>/../../models_cache``. The directory
                is created if it does not exist.
        """
        self.models_cache_dir = models_cache_dir or str(
            Path(__file__).parent.parent.parent / "models_cache"
        )
        Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True)

        # Both stay None when the corresponding back-end is unavailable.
        self.fasttext_model = None
        self.lingua_detector = None

        self._init_detectors()

    def _init_detectors(self):
        """
        Load whichever detection back-ends are available.

        Failures are logged and leave the corresponding attribute at ``None``;
        nothing is raised, so the detector still works in a degraded mode.
        """
        if FASTTEXT_AVAILABLE:
            model_path = Path(self.models_cache_dir) / "lid.176.bin"
            if model_path.exists():
                try:
                    self.fasttext_model = fasttext.load_model(str(model_path))
                    logger.info("[LanguageDetector] Loaded FastText model from %s", model_path)
                except Exception as e:
                    # Best-effort: a corrupt or incompatible model file must
                    # not prevent the detector from being constructed.
                    logger.warning("[LanguageDetector] Failed to load FastText: %s", e)
            else:
                logger.warning("[LanguageDetector] FastText model not found at %s", model_path)
                logger.info("Download from: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin")

        if LINGUA_AVAILABLE:
            try:
                # Only English and Tamil are given to Lingua. Sinhala text is
                # expected to be resolved earlier by script analysis.
                self.lingua_detector = LanguageDetectorBuilder.from_languages(
                    Language.ENGLISH,
                    Language.TAMIL,
                ).build()
                logger.info("[LanguageDetector] Initialized Lingua detector")
            except Exception as e:
                logger.warning("[LanguageDetector] Failed to init Lingua: %s", e)

    def _detect_by_script(self, text: str) -> Optional[str]:
        """
        Detect language by Unicode script analysis.

        More reliable for Sinhala/Tamil which have distinct scripts.

        Only Sinhala-range, Tamil-range and ASCII alphabetic characters are
        counted; everything else (digits, punctuation, other scripts) is
        ignored.

        Args:
            text: Text to inspect.

        Returns:
            ``"sinhala"`` or ``"tamil"`` when the more frequent of the two
            scripts makes up over 30% of the counted letters (Sinhala wins an
            exact tie), ``"english"`` when ASCII letters make up over 50%,
            otherwise ``None`` (including when no letters were counted).
        """
        sinhala_lo, sinhala_hi = self.SINHALA_RANGE
        tamil_lo, tamil_hi = self.TAMIL_RANGE

        sinhala_count = 0
        tamil_count = 0
        latin_count = 0

        for char in text:
            code = ord(char)
            if sinhala_lo <= code <= sinhala_hi:
                sinhala_count += 1
            elif tamil_lo <= code <= tamil_hi:
                tamil_count += 1
            elif char.isalpha() and code < 128:
                latin_count += 1

        total_alpha = sinhala_count + tamil_count + latin_count
        if total_alpha == 0:
            return None

        # Pick the dominant Indic script rather than always testing Sinhala
        # first: with both above the threshold (e.g. 40% Sinhala / 60% Tamil)
        # a fixed test order would mislabel Tamil-majority text as Sinhala.
        if max(sinhala_count, tamil_count) / total_alpha > self._SCRIPT_MIN_SHARE:
            return "sinhala" if sinhala_count >= tamil_count else "tamil"

        if latin_count / total_alpha > self._LATIN_MIN_SHARE:
            return "english"

        return None

    def detect(self, text: str) -> Tuple[str, float]:
        """
        Detect language of text.

        Args:
            text: Input text

        Returns:
            Tuple of (language_code, confidence)
            language_code: 'english', 'sinhala', 'tamil', or 'unknown'

            'unknown' (confidence 0.0) is returned only for empty input,
            input shorter than 3 characters after stripping, or input that is
            empty once URLs/mentions/hashtags are removed. Text that no stage
            can classify falls back to 'english' (0.7 if script analysis saw
            mostly ASCII letters, else 0.5).
        """
        if not text or len(text.strip()) < 3:
            return "unknown", 0.0

        # Strip URLs, mentions and hashtags so they do not skew detection.
        clean_text = self._NOISE_PATTERN.sub('', text).strip()
        if not clean_text:
            return "unknown", 0.0

        # Stage 1: script analysis is decisive for Sinhala and Tamil.
        script_lang = self._detect_by_script(clean_text)
        if script_lang in ("sinhala", "tamil"):
            return script_lang, 0.95

        # Stage 2: FastText. Newlines are replaced with spaces before predict.
        if self.fasttext_model is not None:
            try:
                predictions = self.fasttext_model.predict(clean_text.replace("\n", " "))
                label = predictions[0][0]
                # FastText scores can land marginally above 1.0; cap them so
                # callers always receive a confidence within [0, 1].
                confidence = min(float(predictions[1][0]), 1.0)

                lang = self.LANG_MAP.get(label, "unknown")
                if lang != "unknown" and confidence > 0.5:
                    return lang, confidence
            except Exception as e:
                # Best-effort stage: fall through to the next detector.
                logger.debug("FastText error: %s", e)

        # Stage 3: Lingua. No score is read from it, so a heuristic
        # confidence based on text length is used instead.
        if self.lingua_detector is not None:
            try:
                detected = self.lingua_detector.detect_language_of(clean_text)
                if detected:
                    lang = self.LANG_MAP.get(detected.name, "unknown")
                    # Same guard as the FastText stage: never report an
                    # unmapped language with a high confidence.
                    if lang != "unknown":
                        confidence = 0.8 if len(clean_text) > 20 else 0.6
                        return lang, confidence
            except Exception as e:
                logger.debug("Lingua error: %s", e)

        # Stage 4: fallback. Script analysis saying "english" earns a little
        # more confidence than the blind default.
        if script_lang == "english":
            return "english", 0.7

        return "english", 0.5
|
|
|
|
|
|
|
|
|
|
|
# Lazily created, process-wide detector shared by get_detector() callers.
_detector: Optional[LanguageDetector] = None


def get_detector(models_cache_dir: Optional[str] = None) -> LanguageDetector:
    """
    Return the shared LanguageDetector, building it on first use.

    Args:
        models_cache_dir: Passed to ``LanguageDetector`` only when the shared
            instance is being created. Once an instance exists this argument
            is ignored and the existing instance is returned.

    Returns:
        The module-level detector instance.
    """
    global _detector

    # Fast path: reuse the instance built by an earlier call.
    if _detector is not None:
        return _detector

    _detector = LanguageDetector(models_cache_dir)
    return _detector
|
|
|
|
|
|
|
|
def detect_language(text: str) -> Tuple[str, float]:
    """
    Detect the language of ``text`` using the shared detector.

    Shorthand for ``get_detector().detect(text)``.

    Args:
        text: Input text

    Returns:
        Tuple of (language: str, confidence: float)
    """
    shared_detector = get_detector()
    return shared_detector.detect(text)
|
|
|