# rag-api-node-1/src/infrastructure/adapters/language_detector.py
# Peterase's picture
# feat: Add query enhancements and flexible prompting (v2.1)
# 6246bba
"""
Language Detection
Detects the language of user queries to handle multilingual input correctly.
Uses lightweight pattern-based detection with langdetect fallback.
"""
import hashlib
import logging
import re
from dataclasses import dataclass
from typing import Optional, Dict, Any
logger = logging.getLogger(__name__)
@dataclass
class LanguageDetection:
    """
    Outcome of one language detection attempt.

    Attributes:
        language: ISO 639-1 code of the detected language
            (en, ar, am, so, sw, fr).
        confidence: Score between 0.0 and 1.0.
        method: Name of the strategy that produced the result. The detector
            in this module emits "script", "pattern", "langdetect",
            "langdetect_fallback" and "default".
    """

    language: str
    confidence: float
    method: str
class LanguageDetector:
    """
    Detect the language of user queries.

    Strategy (the first stage that produces a result wins):
    1. Script-based detection (Arabic, Ethiopic) - fast, no dependencies
    2. Keyword patterns (Somali, Swahili, French) - whole-word matching
    3. langdetect library, when installed - good for Latin scripts
    4. Default to English - safe fallback

    When a cache adapter is supplied, results are stored through its
    ``get(key)`` / ``set(key, value, expiration=...)`` methods for one hour.
    """

    # Unicode ranges for script detection (inclusive code point bounds)
    ARABIC_RANGE = (0x0600, 0x06FF)  # Arabic script
    AMHARIC_RANGE = (0x1200, 0x137F)  # Ethiopic script

    # Common words for pattern matching
    LANGUAGE_PATTERNS = {
        "so": ["wararka", "habari", "sheeko", "waa", "iyo"],  # Somali
        "sw": ["habari", "leo", "jana", "wiki", "mwezi"],  # Swahili
        "fr": ["nouvelles", "aujourd'hui", "hier", "semaine", "mois"],  # French
    }

    # Language codes this service handles; anything else maps to English
    SUPPORTED_LANGUAGES = frozenset({"en", "ar", "am", "so", "sw", "fr"})

    # Distinct keywords that must be present before a pattern match counts
    MIN_PATTERN_MATCHES = 2

    # Number of leading characters kept verbatim in the cache key
    CACHE_KEY_PREFIX_LENGTH = 100

    # Lifetime of a cached detection, in seconds (1 hour)
    CACHE_TTL_SECONDS = 3600

    def __init__(self, cache: Optional[Any] = None):
        """
        Initialize language detector.

        Args:
            cache: Cache adapter for storing detections. It must provide
                ``get(key)`` and ``set(key, value, expiration=...)``.
                Pass None to disable caching.
        """
        self.cache = cache
        self._langdetect_available = False
        self._try_import_langdetect()

    def _try_import_langdetect(self) -> None:
        """
        Probe for the optional langdetect library.

        On success the detector is seeded, because langdetect is
        non-deterministic by default and its results are cached here.
        Note that the seed is a process-wide langdetect setting.
        """
        try:
            from langdetect import DetectorFactory
        except ImportError:
            logger.warning("langdetect not installed, using pattern-based detection only")
            return

        DetectorFactory.seed = 0
        self._langdetect_available = True
        logger.info("✅ langdetect library available")

    def detect(self, text: str) -> "LanguageDetection":
        """
        Detect language of text.

        Args:
            text: Text to detect language for

        Returns:
            LanguageDetection with language code and confidence. Empty or
            whitespace-only input yields the English default and is not
            cached.
        """
        if not text or not text.strip():
            return LanguageDetection(
                language="en",
                confidence=0.5,
                method="default",
            )

        # Check cache first
        cached = self._get_cached(text)
        if cached is not None:
            return cached

        # Step 1: Script-based detection (cheapest, no dependencies)
        result = self._detect_by_script(text)

        # Step 2: Pattern-based detection
        if result is None:
            result = self._detect_by_patterns(text)

        # Step 3: langdetect library
        if result is None and self._langdetect_available:
            result = self._detect_with_langdetect(text)

        # Step 4: Default to English
        if result is None:
            result = LanguageDetection(
                language="en",
                confidence=0.5,
                method="default",
            )

        self._cache_result(text, result)
        return result

    def _detect_by_script(self, text: str) -> Optional["LanguageDetection"]:
        """
        Detect language by Unicode script.

        Returns "ar" when more than half of the counted characters fall in
        ARABIC_RANGE and "am" when more than half fall in AMHARIC_RANGE.
        Every character inside those two ranges is counted; outside them
        only alphabetic characters are counted.

        The script identifies the writing system rather than the language
        (other languages share these scripts), so the reported confidence
        of 1.0 reflects the script match only.

        Returns:
            LanguageDetection with method "script", or None when neither
            script dominates or the text contains no countable characters.
        """
        arabic_lo, arabic_hi = self.ARABIC_RANGE
        ethiopic_lo, ethiopic_hi = self.AMHARIC_RANGE

        arabic = ethiopic = other_letters = 0
        for char in text:
            code = ord(char)
            if arabic_lo <= code <= arabic_hi:
                arabic += 1
            elif ethiopic_lo <= code <= ethiopic_hi:
                ethiopic += 1
            elif char.isalpha():
                other_letters += 1

        total = arabic + ethiopic + other_letters
        if total == 0:
            return None

        # More than 50% Arabic script -> Arabic
        if arabic / total > 0.5:
            return LanguageDetection(language="ar", confidence=1.0, method="script")

        # More than 50% Ethiopic script -> Amharic
        if ethiopic / total > 0.5:
            return LanguageDetection(language="am", confidence=1.0, method="script")

        return None

    @staticmethod
    def _contains_word(text: str, word: str) -> bool:
        """Return True when *word* occurs in *text* as a whole word."""
        # Lookarounds instead of \b so that keywords containing an apostrophe
        # ("aujourd'hui") and elided forms ("d'hier") still match, while
        # "wiki" inside "wikipedia" does not.
        return re.search(rf"(?<!\w){re.escape(word)}(?!\w)", text) is not None

    def _match_pattern_language(self, text: str) -> Optional[str]:
        """
        Return the language whose keywords occur most often in *text*.

        Matching is case-insensitive and whole-word. A language qualifies
        only with at least MIN_PATTERN_MATCHES distinct keywords; on a tie
        the language listed first in LANGUAGE_PATTERNS wins.

        Returns:
            Language code, or None when no language qualifies.
        """
        lowered = text.lower()
        best_language = None
        best_hits = 0
        for language, keywords in self.LANGUAGE_PATTERNS.items():
            hits = sum(1 for keyword in keywords if self._contains_word(lowered, keyword))
            # Strict comparison keeps the earlier language on a tie
            if hits > best_hits:
                best_language, best_hits = language, hits

        if best_hits >= self.MIN_PATTERN_MATCHES:
            return best_language
        return None

    def _detect_by_patterns(self, text: str) -> Optional["LanguageDetection"]:
        """
        Detect language by common word patterns.

        Intended for Somali, Swahili and French queries.

        Returns:
            LanguageDetection with method "pattern" and confidence 0.8, or
            None when no language reaches the keyword threshold.
        """
        language = self._match_pattern_language(text)
        if language is None:
            return None
        return LanguageDetection(language=language, confidence=0.8, method="pattern")

    def _detect_with_langdetect(self, text: str) -> Optional["LanguageDetection"]:
        """
        Detect language using langdetect library.

        Intended for Latin-script languages (English, French, Somali, Swahili).

        Returns:
            LanguageDetection with method "langdetect" when the detected
            code is in SUPPORTED_LANGUAGES, an English result with method
            "langdetect_fallback" for any other code, or None when
            langdetect is unavailable or fails.
        """
        try:
            import langdetect

            detected = langdetect.detect(text)
        except Exception as exc:
            # Deliberately broad: detection is best-effort and any failure
            # (missing library, text langdetect cannot analyse) simply
            # falls through to the next strategy.
            logger.debug("langdetect failed: %s", exc)
            return None

        if detected in self.SUPPORTED_LANGUAGES:
            return LanguageDetection(
                language=detected,
                confidence=0.85,
                method="langdetect",
            )

        # If detected language not in our supported set, default to English
        return LanguageDetection(
            language="en",
            confidence=0.6,
            method="langdetect_fallback",
        )

    def _cache_key(self, text: str) -> str:
        """
        Build the cache key for *text*.

        Texts up to CACHE_KEY_PREFIX_LENGTH characters use the lowercased
        text itself. Longer texts get a digest of the full lowercased text
        appended, so queries that share a long prefix do not collide.
        """
        limit = self.CACHE_KEY_PREFIX_LENGTH
        key = f"lang_detect:{text[:limit].lower()}"
        if len(text) > limit:
            payload = text.lower().encode("utf-8", "surrogatepass")
            key = f"{key}:{hashlib.sha256(payload).hexdigest()[:16]}"
        return key

    def _get_cached(self, text: str) -> Optional["LanguageDetection"]:
        """
        Look up a previous detection for *text*.

        Returns:
            The cached LanguageDetection, or None when caching is disabled,
            nothing is stored, or the stored entry cannot be turned back
            into a LanguageDetection.
        """
        if self.cache is None:
            return None

        cached = self.cache.get(self._cache_key(text))
        if not cached:
            return None

        try:
            result = LanguageDetection(**cached)
        except TypeError:
            # Entry is not a mapping with exactly the expected fields
            # (e.g. written by another version); recompute instead.
            logger.warning("Ignoring malformed language detection cache entry: %r", cached)
            return None

        logger.debug("Language detection cache hit: %s", text[:50])
        return result

    def _cache_result(self, text: str, result: "LanguageDetection") -> None:
        """Cache detection result as a plain dict for CACHE_TTL_SECONDS."""
        if self.cache is None:
            return

        self.cache.set(
            self._cache_key(text),
            {
                "language": result.language,
                "confidence": result.confidence,
                "method": result.method,
            },
            expiration=self.CACHE_TTL_SECONDS,
        )
# ═══════════════════════════════════════════════════════════════════════════
# SINGLETON INSTANCE
# ═══════════════════════════════════════════════════════════════════════════
# Module-wide detector; stays None until initialize_language_detector() runs.
language_detector: Optional[LanguageDetector] = None


def initialize_language_detector(cache=None):
    """
    Build the global language detector and publish it as `language_detector`.

    Args:
        cache: Optional cache adapter passed through to LanguageDetector.
    """
    global language_detector
    detector = LanguageDetector(cache)
    language_detector = detector
    logger.info("Language detector initialized")