File size: 6,907 Bytes
c7d4394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16ec2cf
c7d4394
 
 
 
 
 
16ec2cf
c7d4394
 
 
 
 
16ec2cf
c7d4394
 
 
16ec2cf
c7d4394
 
 
 
 
 
 
 
 
 
 
16ec2cf
c7d4394
 
16ec2cf
c7d4394
16ec2cf
c7d4394
 
 
 
 
 
 
 
 
 
 
 
 
 
16ec2cf
c7d4394
 
 
 
 
 
 
 
 
 
 
16ec2cf
c7d4394
 
 
 
 
 
 
 
16ec2cf
c7d4394
 
 
 
 
 
 
 
16ec2cf
c7d4394
 
 
16ec2cf
c7d4394
 
 
 
 
 
 
16ec2cf
c7d4394
16ec2cf
c7d4394
 
 
 
 
 
 
 
 
 
 
 
 
16ec2cf
c7d4394
 
 
16ec2cf
c7d4394
 
16ec2cf
c7d4394
 
 
 
16ec2cf
c7d4394
 
 
 
 
 
16ec2cf
c7d4394
 
 
 
 
16ec2cf
c7d4394
 
 
 
 
 
 
 
 
 
 
16ec2cf
c7d4394
 
 
16ec2cf
c7d4394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
"""
models/anomaly-detection/src/utils/language_detector.py
Language detection using FastText or lingua-py for Sinhala/Tamil/English
"""
import os
import logging
from typing import Tuple, Optional
from pathlib import Path
import re

# Module logger; note the fixed name "language_detector" (not __name__).
logger = logging.getLogger("language_detector")

# Probe the optional detection backends at import time. Each probe sets a
# module-level flag that LanguageDetector._init_detectors() consults, so a
# missing package only produces a warning instead of an import failure.

# Try FastText first, fallback to lingua
try:
    import fasttext
    # Replace FastText's eprint hook with a no-op so it stays quiet.
    fasttext.FastText.eprint = lambda x: None  # Suppress warnings
    FASTTEXT_AVAILABLE = True
except ImportError:
    FASTTEXT_AVAILABLE = False
    logger.warning("FastText not available. Install with: pip install fasttext")

# Lingua is probed independently of FastText; both flags may be True or False.
try:
    from lingua import Language, LanguageDetectorBuilder
    LINGUA_AVAILABLE = True
except ImportError:
    LINGUA_AVAILABLE = False
    logger.warning("Lingua not available. Install with: pip install lingua-language-detector")


class LanguageDetector:
    """
    Multilingual language detector supporting Sinhala, Tamil, and English.

    Detection is staged: Unicode script analysis runs first, then FastText
    (when a model file was loaded), then Lingua (when installed), and
    finally a default of English.
    """

    # Language code mapping (plain ISO codes, FastText labels, Lingua enum names)
    LANG_MAP = {
        "en": "english",
        "si": "sinhala",
        "ta": "tamil",
        "__label__en": "english",
        "__label__si": "sinhala",
        "__label__ta": "tamil",
        "ENGLISH": "english",
        "SINHALA": "sinhala",
        "TAMIL": "tamil"
    }

    # Unicode ranges for script detection (inclusive code point bounds)
    SINHALA_RANGE = (0x0D80, 0x0DFF)
    TAMIL_RANGE = (0x0B80, 0x0BFF)

    # Share of counted characters a script must exceed to be reported
    SCRIPT_THRESHOLD = 0.3
    LATIN_THRESHOLD = 0.5

    # URLs, @mentions and #hashtags are removed before detection
    _NOISE_PATTERN = re.compile(r'http\S+|@\w+|#\w+')

    def __init__(self, models_cache_dir: Optional[str] = None):
        """
        Initialize language detector.

        Creates the cache directory if it does not exist and then tries to
        set up the FastText and Lingua backends.

        Args:
            models_cache_dir: Directory for cached FastText models. Defaults
                to a "models_cache" directory three levels above this file.
        """
        self.models_cache_dir = models_cache_dir or str(
            Path(__file__).parent.parent.parent / "models_cache"
        )
        Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True)

        self.fasttext_model = None
        self.lingua_detector = None

        self._init_detectors()

    def _init_detectors(self):
        """
        Initialize detection models.

        Failures are logged and leave the corresponding attribute as None,
        so detect() simply skips that backend.
        """
        # Try FastText: only loaded when the model file is already on disk
        if FASTTEXT_AVAILABLE:
            model_path = Path(self.models_cache_dir) / "lid.176.bin"
            if model_path.exists():
                try:
                    self.fasttext_model = fasttext.load_model(str(model_path))
                    logger.info(f"[LanguageDetector] Loaded FastText model from {model_path}")
                except Exception as e:
                    logger.warning(f"[LanguageDetector] Failed to load FastText: {e}")
            else:
                logger.warning(f"[LanguageDetector] FastText model not found at {model_path}")
                logger.info("Download from: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin")

        # Initialize lingua as fallback
        if LINGUA_AVAILABLE:
            try:
                self.lingua_detector = LanguageDetectorBuilder.from_languages(
                    Language.ENGLISH,
                    Language.TAMIL,
                    # Note: Lingua may not have Sinhala, we'll use script detection
                ).build()
                logger.info("[LanguageDetector] Initialized Lingua detector")
            except Exception as e:
                logger.warning(f"[LanguageDetector] Failed to init Lingua: {e}")

    def _detect_by_script(self, text: str) -> Optional[str]:
        """
        Detect language by Unicode script analysis.
        More reliable for Sinhala/Tamil which have distinct scripts.

        Only characters in the Sinhala range, the Tamil range, or ASCII
        letters are counted; everything else is ignored.

        Args:
            text: Text to analyze.

        Returns:
            'sinhala', 'tamil', 'english', or None when no counted
            characters are present or no script passes its threshold.
        """
        sinhala_lo, sinhala_hi = self.SINHALA_RANGE
        tamil_lo, tamil_hi = self.TAMIL_RANGE

        sinhala_count = 0
        tamil_count = 0
        latin_count = 0

        for char in text:
            code = ord(char)
            if sinhala_lo <= code <= sinhala_hi:
                sinhala_count += 1
            elif tamil_lo <= code <= tamil_hi:
                tamil_count += 1
            elif char.isalpha() and code < 128:
                latin_count += 1

        total_alpha = sinhala_count + tamil_count + latin_count
        if total_alpha == 0:
            return None

        # Test only the dominant of the two scripts so that Tamil-majority
        # text containing some Sinhala is not reported as Sinhala. Ties go
        # to Sinhala. The non-dominant script can never pass the threshold
        # when the dominant one fails, so one check is sufficient.
        if sinhala_count >= tamil_count:
            dominant, dominant_count = "sinhala", sinhala_count
        else:
            dominant, dominant_count = "tamil", tamil_count

        if dominant_count / total_alpha > self.SCRIPT_THRESHOLD:
            return dominant
        if latin_count / total_alpha > self.LATIN_THRESHOLD:
            return "english"

        return None

    def detect(self, text: str) -> Tuple[str, float]:
        """
        Detect language of text.

        Args:
            text: Input text. Non-string or very short input (fewer than
                3 characters after stripping) yields 'unknown'.

        Returns:
            Tuple of (language_code, confidence)
            language_code: 'english', 'sinhala', 'tamil', or 'unknown'.
            When no stage gives an answer the result defaults to
            ('english', 0.5).
        """
        # Non-string input (None, numbers, NaN, ...) cannot be analyzed
        if not isinstance(text, str) or len(text.strip()) < 3:
            return "unknown", 0.0

        # Clean text: drop URLs, mentions and hashtags
        clean_text = self._NOISE_PATTERN.sub('', text).strip()

        if not clean_text:
            return "unknown", 0.0

        # 1. First try script detection (most reliable for Sinhala/Tamil)
        script_lang = self._detect_by_script(clean_text)
        if script_lang in ("sinhala", "tamil"):
            return script_lang, 0.95

        # 2. Try FastText
        if self.fasttext_model is not None:
            try:
                # Newlines are replaced before calling predict()
                predictions = self.fasttext_model.predict(clean_text.replace("\n", " "))
                label = predictions[0][0]
                confidence = float(predictions[1][0])

                lang = self.LANG_MAP.get(label, "unknown")
                if lang != "unknown" and confidence > 0.5:
                    # Cap at 1.0: reported probabilities can slightly exceed it
                    return lang, min(confidence, 1.0)
            except Exception as e:
                logger.debug(f"FastText error: {e}")

        # 3. Try Lingua
        if self.lingua_detector is not None:
            try:
                detected = self.lingua_detector.detect_language_of(clean_text)
                if detected:
                    lang = self.LANG_MAP.get(detected.name, "unknown")
                    # detect_language_of() yields only a language, so the
                    # confidence is estimated from the text length
                    confidence = 0.8 if len(clean_text) > 20 else 0.6
                    return lang, confidence
            except Exception as e:
                logger.debug(f"Lingua error: {e}")

        # 4. Fallback to script detection result or default
        if script_lang == "english":
            return "english", 0.7

        return "english", 0.5  # Default to English


# Lazily created, process-wide LanguageDetector shared by get_detector()
_detector: Optional[LanguageDetector] = None


def get_detector(models_cache_dir: Optional[str] = None) -> LanguageDetector:
    """
    Return the shared LanguageDetector, building it on the first call.

    Args:
        models_cache_dir: Passed to LanguageDetector only when the shared
            instance is created; ignored once an instance already exists.

    Returns:
        The module-wide LanguageDetector instance.
    """
    global _detector
    if _detector is not None:
        return _detector
    _detector = LanguageDetector(models_cache_dir)
    return _detector


def detect_language(text: str) -> Tuple[str, float]:
    """
    Detect the language of *text* using the shared detector instance.

    Args:
        text: Input text

    Returns:
        Tuple of (language: str, confidence: float), exactly as returned
        by LanguageDetector.detect().
    """
    shared_detector = get_detector()
    return shared_detector.detect(text)