File size: 6,907 Bytes
c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 16ec2cf c7d4394 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
"""
models/anomaly-detection/src/utils/language_detector.py
Language detection for Sinhala/Tamil/English using Unicode script analysis, with FastText and lingua-py as fallbacks
"""
import os
import logging
from typing import Tuple, Optional
from pathlib import Path
import re
logger = logging.getLogger("language_detector")

# Optional backends. Each import is attempted once at module load and the
# outcome is recorded in a module-level flag that the detector consults later.
try:
    import fasttext
except ImportError:
    FASTTEXT_AVAILABLE = False
    logger.warning("FastText not available. Install with: pip install fasttext")
else:
    # Replace FastText's eprint hook with a no-op to suppress its warnings.
    fasttext.FastText.eprint = lambda x: None
    FASTTEXT_AVAILABLE = True

try:
    from lingua import Language, LanguageDetectorBuilder
except ImportError:
    LINGUA_AVAILABLE = False
    logger.warning("Lingua not available. Install with: pip install lingua-language-detector")
else:
    LINGUA_AVAILABLE = True
class LanguageDetector:
    """
    Multilingual language detector supporting Sinhala, Tamil, and English.

    Detection order used by :meth:`detect`:
        1. Unicode script analysis (decisive for Sinhala and Tamil).
        2. FastText ``lid.176.bin`` model, when the package is installed and
           the model file is present in the cache directory.
        3. Lingua (built for English and Tamil only), when installed.
        4. Default to English.
    """

    # Language code mapping (ISO codes, FastText labels, Lingua enum names)
    LANG_MAP = {
        "en": "english",
        "si": "sinhala",
        "ta": "tamil",
        "__label__en": "english",
        "__label__si": "sinhala",
        "__label__ta": "tamil",
        "ENGLISH": "english",
        "SINHALA": "sinhala",
        "TAMIL": "tamil",
    }

    # Unicode ranges for script detection
    SINHALA_RANGE = (0x0D80, 0x0DFF)
    TAMIL_RANGE = (0x0B80, 0x0BFF)

    # Minimum share of counted letters needed to accept a script
    SCRIPT_MIN_SHARE = 0.3  # Sinhala / Tamil
    LATIN_MIN_SHARE = 0.5   # ASCII letters -> English

    # URLs, @mentions and #hashtags are removed before detection
    _NOISE_PATTERN = re.compile(r'http\S+|@\w+|#\w+')

    def __init__(self, models_cache_dir: Optional[str] = None):
        """
        Initialize language detector.

        Args:
            models_cache_dir: Directory for cached FastText models. Defaults
                to ``models_cache`` three levels above this file. The
                directory is created if it does not exist.
        """
        self.models_cache_dir = models_cache_dir or str(
            Path(__file__).parent.parent.parent / "models_cache"
        )
        Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True)

        self.fasttext_model = None
        self.lingua_detector = None

        self._init_detectors()

    def _init_detectors(self):
        """Initialize detection models; failures are logged, never raised."""
        # Try FastText
        if FASTTEXT_AVAILABLE:
            model_path = Path(self.models_cache_dir) / "lid.176.bin"
            if model_path.exists():
                try:
                    self.fasttext_model = fasttext.load_model(str(model_path))
                    logger.info("[LanguageDetector] Loaded FastText model from %s", model_path)
                except Exception as e:
                    logger.warning("[LanguageDetector] Failed to load FastText: %s", e)
            else:
                logger.warning("[LanguageDetector] FastText model not found at %s", model_path)
                logger.info("Download from: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin")

        # Initialize lingua as fallback
        if LINGUA_AVAILABLE:
            try:
                self.lingua_detector = LanguageDetectorBuilder.from_languages(
                    Language.ENGLISH,
                    Language.TAMIL,
                    # Note: Lingua may not have Sinhala, we'll use script detection
                ).build()
                logger.info("[LanguageDetector] Initialized Lingua detector")
            except Exception as e:
                logger.warning("[LanguageDetector] Failed to init Lingua: %s", e)

    def _detect_by_script(self, text: str) -> Optional[str]:
        """
        Detect language by Unicode script analysis.

        Counts characters in the Sinhala block, the Tamil block, and ASCII
        letters; all other characters are ignored.

        Args:
            text: Text to analyse.

        Returns:
            'sinhala', 'tamil' or 'english' when the corresponding share of
            counted letters exceeds its threshold, otherwise None (also when
            no letters were counted at all).
        """
        sinhala_lo, sinhala_hi = self.SINHALA_RANGE
        tamil_lo, tamil_hi = self.TAMIL_RANGE

        sinhala_count = 0
        tamil_count = 0
        latin_count = 0

        for char in text:
            code = ord(char)
            if sinhala_lo <= code <= sinhala_hi:
                sinhala_count += 1
            elif tamil_lo <= code <= tamil_hi:
                tamil_count += 1
            elif code < 128 and char.isalpha():
                latin_count += 1

        total_alpha = sinhala_count + tamil_count + latin_count
        if total_alpha == 0:
            return None

        # Threshold-based detection. When both Sinhala and Tamil clear the
        # threshold, the script with more characters wins (ties go to
        # Sinhala) so Tamil-dominant mixed text is not labelled Sinhala.
        if (sinhala_count / total_alpha > self.SCRIPT_MIN_SHARE
                and sinhala_count >= tamil_count):
            return "sinhala"
        if tamil_count / total_alpha > self.SCRIPT_MIN_SHARE:
            return "tamil"
        if latin_count / total_alpha > self.LATIN_MIN_SHARE:
            return "english"

        return None

    def detect(self, text: str) -> Tuple[str, float]:
        """
        Detect language of text.

        Args:
            text: Input text. Non-string values (e.g. None or NaN) are
                treated as undetectable.

        Returns:
            Tuple of (language_code, confidence)
            language_code: 'english', 'sinhala', 'tamil', or 'unknown'
        """
        if not isinstance(text, str) or len(text.strip()) < 3:
            return "unknown", 0.0

        # Clean text
        clean_text = self._NOISE_PATTERN.sub('', text).strip()
        if not clean_text:
            return "unknown", 0.0

        # 1. First try script detection (most reliable for Sinhala/Tamil)
        script_lang = self._detect_by_script(clean_text)
        if script_lang in ("sinhala", "tamil"):
            return script_lang, 0.95

        # 2. Try FastText
        if self.fasttext_model is not None:
            try:
                # Newlines are replaced because predict() is given one line
                predictions = self.fasttext_model.predict(clean_text.replace("\n", " "))
                label = predictions[0][0]
                confidence = predictions[1][0]

                lang = self.LANG_MAP.get(label, "unknown")
                if lang != "unknown" and confidence > 0.5:
                    return lang, float(confidence)
            except Exception as e:
                # Best effort: fall through to the next detector
                logger.debug("FastText error: %s", e)

        # 3. Try Lingua
        if self.lingua_detector is not None:
            try:
                detected = self.lingua_detector.detect_language_of(clean_text)
                if detected:
                    lang = self.LANG_MAP.get(detected.name, "unknown")
                    # detect_language_of() yields only the language, so the
                    # confidence is estimated from the text length
                    confidence = 0.8 if len(clean_text) > 20 else 0.6
                    return lang, confidence
            except Exception as e:
                # Best effort: fall through to the default
                logger.debug("Lingua error: %s", e)

        # 4. Fallback to script detection result or default
        if script_lang == "english":
            return "english", 0.7

        return "english", 0.5  # Default to English
# Shared detector instance, created lazily by get_detector()
_detector: Optional[LanguageDetector] = None


def get_detector(models_cache_dir: Optional[str] = None) -> LanguageDetector:
    """
    Return the module-wide detector, building it on first use.

    Args:
        models_cache_dir: Passed to LanguageDetector when the instance is
            first created; not consulted once an instance already exists.

    Returns:
        The shared LanguageDetector instance.
    """
    global _detector
    if _detector is not None:
        return _detector
    _detector = LanguageDetector(models_cache_dir)
    return _detector
def detect_language(text: str) -> Tuple[str, float]:
    """
    Detect the language of ``text`` using the shared detector.

    Args:
        text: Input text

    Returns:
        Tuple of (language: str, confidence: float)
    """
    shared_detector = get_detector()
    return shared_detector.detect(text)
|