modelx / models /anomaly-detection /src /utils /language_detector.py
nivakaran's picture
Upload folder using huggingface_hub
16ec2cf verified
"""
models/anomaly-detection/src/utils/language_detector.py
Language detection for Sinhala/Tamil/English: Unicode script analysis first, then FastText or lingua-py as fallbacks
"""
import os
import logging
from typing import Tuple, Optional
from pathlib import Path
import re
logger = logging.getLogger("language_detector")

# Optional backends: each import is attempted once at module load and the
# outcome is recorded in a module-level flag that the detector checks later.

# FastText is the preferred statistical backend.
try:
    import fasttext
except ImportError:
    FASTTEXT_AVAILABLE = False
    logger.warning("FastText not available. Install with: pip install fasttext")
else:
    # Replace FastText's eprint hook with a no-op to suppress its warnings.
    fasttext.FastText.eprint = lambda x: None
    FASTTEXT_AVAILABLE = True

# Lingua is the secondary backend, used when FastText gives no usable answer.
try:
    from lingua import Language, LanguageDetectorBuilder
except ImportError:
    LINGUA_AVAILABLE = False
    logger.warning("Lingua not available. Install with: pip install lingua-language-detector")
else:
    LINGUA_AVAILABLE = True
class LanguageDetector:
    """
    Multilingual language detector supporting Sinhala, Tamil, and English.

    Detection order (see ``detect``):
        1. Unicode script analysis (decisive for Sinhala/Tamil).
        2. FastText model, when the package and the model file are available.
        3. Lingua detector, when the package is available.
        4. Default to English.
    """

    # Language code mapping: ISO codes, FastText labels and Lingua enum names
    # all normalise to the same lowercase language names.
    LANG_MAP = {
        "en": "english",
        "si": "sinhala",
        "ta": "tamil",
        "__label__en": "english",
        "__label__si": "sinhala",
        "__label__ta": "tamil",
        "ENGLISH": "english",
        "SINHALA": "sinhala",
        "TAMIL": "tamil",
    }

    # Unicode ranges for script detection (inclusive code point bounds)
    SINHALA_RANGE = (0x0D80, 0x0DFF)
    TAMIL_RANGE = (0x0B80, 0x0BFF)

    # Minimum share of counted characters a script needs to be reported.
    INDIC_SCRIPT_THRESHOLD = 0.3
    LATIN_SCRIPT_THRESHOLD = 0.5

    # URLs, @mentions and #hashtags are removed before detection.
    _NOISE_PATTERN = re.compile(r'http\S+|@\w+|#\w+')

    def __init__(self, models_cache_dir: Optional[str] = None):
        """
        Initialize language detector.

        Creates the cache directory if it does not exist, then tries to set
        up the FastText and Lingua backends.

        Args:
            models_cache_dir: Directory for cached FastText models. Defaults
                to ``models_cache`` three levels above this file.
        """
        self.models_cache_dir = models_cache_dir or str(
            Path(__file__).parent.parent.parent / "models_cache"
        )
        Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True)

        self.fasttext_model = None
        self.lingua_detector = None

        self._init_detectors()

    def _init_detectors(self):
        """
        Initialize detection models.

        Failures are logged and leave the corresponding backend as ``None``;
        this method never raises for a missing or broken backend.
        """
        # Try FastText
        if FASTTEXT_AVAILABLE:
            model_path = Path(self.models_cache_dir) / "lid.176.bin"
            if model_path.exists():
                try:
                    self.fasttext_model = fasttext.load_model(str(model_path))
                    logger.info(
                        "[LanguageDetector] Loaded FastText model from %s", model_path
                    )
                except Exception as e:
                    logger.warning("[LanguageDetector] Failed to load FastText: %s", e)
            else:
                logger.warning(
                    "[LanguageDetector] FastText model not found at %s", model_path
                )
                logger.info("Download from: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin")

        # Initialize lingua as fallback
        if LINGUA_AVAILABLE:
            try:
                self.lingua_detector = LanguageDetectorBuilder.from_languages(
                    Language.ENGLISH,
                    Language.TAMIL,
                    # Note: Lingua may not have Sinhala, we'll use script detection
                ).build()
                logger.info("[LanguageDetector] Initialized Lingua detector")
            except Exception as e:
                logger.warning("[LanguageDetector] Failed to init Lingua: %s", e)

    def _detect_by_script(self, text: str) -> Optional[str]:
        """
        Detect language by Unicode script analysis.

        Counts characters in the Sinhala block, the Tamil block, and ASCII
        letters; all other characters are ignored.

        Args:
            text: Text to analyse.

        Returns:
            'sinhala', 'tamil' or 'english' when that script's share of the
            counted characters exceeds its threshold, otherwise None.
        """
        sinhala_lo, sinhala_hi = self.SINHALA_RANGE
        tamil_lo, tamil_hi = self.TAMIL_RANGE

        sinhala_count = 0
        tamil_count = 0
        latin_count = 0

        for char in text:
            code = ord(char)
            if sinhala_lo <= code <= sinhala_hi:
                sinhala_count += 1
            elif tamil_lo <= code <= tamil_hi:
                tamil_count += 1
            elif code < 128 and char.isalpha():
                latin_count += 1

        total_alpha = sinhala_count + tamil_count + latin_count
        if total_alpha == 0:
            return None

        # Test the more frequent Indic script first so that mixed text is
        # labelled by its dominant script; on a tie Sinhala is tested first.
        indic_scripts = [("sinhala", sinhala_count), ("tamil", tamil_count)]
        if tamil_count > sinhala_count:
            indic_scripts.reverse()

        for language, count in indic_scripts:
            if count / total_alpha > self.INDIC_SCRIPT_THRESHOLD:
                return language

        if latin_count / total_alpha > self.LATIN_SCRIPT_THRESHOLD:
            return "english"

        return None

    def detect(self, text: str) -> Tuple[str, float]:
        """
        Detect language of text.

        Args:
            text: Input text. Non-string, empty, or very short (< 3 characters
                after stripping) input yields ('unknown', 0.0).

        Returns:
            Tuple of (language_code, confidence)
            language_code: 'english', 'sinhala', 'tamil', or 'unknown'
        """
        # Non-string values (None, NaN, numbers) cannot be analysed.
        if not isinstance(text, str) or len(text.strip()) < 3:
            return "unknown", 0.0

        # Clean text
        clean_text = self._NOISE_PATTERN.sub('', text).strip()
        if not clean_text:
            return "unknown", 0.0

        # 1. First try script detection (most reliable for Sinhala/Tamil)
        script_lang = self._detect_by_script(clean_text)
        if script_lang in ("sinhala", "tamil"):
            return script_lang, 0.95

        # 2. Try FastText
        if self.fasttext_model is not None:
            try:
                # Newlines are replaced with spaces before prediction.
                predictions = self.fasttext_model.predict(clean_text.replace("\n", " "))
                label = predictions[0][0]
                confidence = predictions[1][0]

                lang = self.LANG_MAP.get(label, "unknown")
                if lang != "unknown" and confidence > 0.5:
                    return lang, float(confidence)
            except Exception as e:
                logger.debug("FastText error: %s", e)

        # 3. Try Lingua
        if self.lingua_detector is not None:
            try:
                detected = self.lingua_detector.detect_language_of(clean_text)
                if detected:
                    lang = self.LANG_MAP.get(detected.name, "unknown")
                    # Only accept languages we can map; an unmapped result
                    # falls through to the default instead of returning
                    # 'unknown' with a high confidence.
                    if lang != "unknown":
                        # Confidence is estimated from text length here
                        confidence = 0.8 if len(clean_text) > 20 else 0.6
                        return lang, confidence
            except Exception as e:
                logger.debug("Lingua error: %s", e)

        # 4. Fallback to script detection result or default
        if script_lang == "english":
            return "english", 0.7

        return "english", 0.5  # Default to English
# Module-wide detector shared by all callers; built on first request.
_detector: Optional[LanguageDetector] = None


def get_detector(models_cache_dir: Optional[str] = None) -> LanguageDetector:
    """
    Return the shared LanguageDetector, building it on the first call.

    Args:
        models_cache_dir: Passed to LanguageDetector only when the shared
            instance is created; once an instance exists it is returned
            as-is and this argument is not consulted.

    Returns:
        The module-level LanguageDetector instance.
    """
    global _detector
    if _detector is not None:
        return _detector

    _detector = LanguageDetector(models_cache_dir)
    return _detector
def detect_language(text: str) -> Tuple[str, float]:
    """
    Detect the language of ``text`` using the shared detector instance.

    Args:
        text: Input text

    Returns:
        Tuple of (language: str, confidence: float), exactly as produced by
        the shared detector's ``detect`` method.
    """
    shared_detector = get_detector()
    return shared_detector.detect(text)