| | """
|
| | Language Detection Module.
|
| |
|
| | Provides multi-language detection for:
|
| | - English (en)
|
| | - Hindi (hi)
|
| | - Hinglish (code-mixed Hindi and English)
|
| |
|
| | Uses langdetect library with custom Hinglish detection logic.
|
| | Performance target: <100ms per detection.
|
| | """
|
| |
|
| | import time
|
| | from typing import Tuple, Optional
|
| |
|
| | import langdetect
|
| | from langdetect import detect_langs, DetectorFactory
|
| | from langdetect.lang_detect_exception import LangDetectException
|
| |
|
| | from app.utils.logger import get_logger
|
| |
|
# Module-level logger obtained from the project's get_logger helper.
logger = get_logger(__name__)

# Pin langdetect's seed at import time so repeated detections of the same
# text give the same answer (langdetect is randomized unless seeded).
DetectorFactory.seed = 0

# Language codes that detection in this module can return.
SUPPORTED_LANGUAGES = {"en", "hi", "hinglish"}

# Fallback language and confidences. ERROR_CONFIDENCE is returned for empty
# input and for unexpected detection errors.
# NOTE(review): DEFAULT_CONFIDENCE is not referenced in the visible code --
# confirm whether other modules import it before removing.
DEFAULT_LANGUAGE = "en"
DEFAULT_CONFIDENCE = 0.3
ERROR_CONFIDENCE = 0.3

# Minimum share of alphabetic characters that BOTH Devanagari and Latin must
# reach for mixed-script text to be classified as Hinglish.
HINGLISH_MIN_RATIO = 0.1
|
| |
|
| |
|
class LanguageDetector:
    """
    Object-style facade over this module's detection helpers.

    Classifies text as English ('en'), Hindi ('hi') or Hinglish
    ('hinglish') by delegating to the module-level functions. The
    constructor pins langdetect's seed to 0 so results are repeatable.

    NOTE(review): an earlier docstring described this class as thread-safe;
    nothing in the class itself enforces that -- confirm before relying on it.

    Attributes:
        _initialized: True once the constructor completed without error
    """

    def __init__(self) -> None:
        """
        Prepare the detector.

        Sets ``DetectorFactory.seed`` to 0 and records the outcome in
        ``_initialized``. Any failure is logged and swallowed, never raised.
        """
        self._initialized = False
        try:
            DetectorFactory.seed = 0
            self._initialized = True
            logger.debug("LanguageDetector initialized successfully")
        except Exception as exc:
            logger.error(f"Failed to initialize LanguageDetector: {exc}")
            self._initialized = False

    def detect(self, text: str) -> Tuple[str, float]:
        """
        Detect the language of ``text``.

        Thin wrapper around the module-level ``detect_language``.

        Args:
            text: Input text to analyze

        Returns:
            Tuple of (language_code, confidence) exactly as produced by
            ``detect_language``
        """
        result = detect_language(text)
        return result

    def is_hinglish(self, text: str) -> bool:
        """
        Report whether ``text`` mixes Devanagari and Latin script.

        No ratio threshold is applied here: a single character of each
        script is enough.

        Args:
            text: Input text

        Returns:
            True if text contains at least one Devanagari and at least one
            Latin character
        """
        if not has_devanagari(text):
            return False
        return has_latin(text)

    def get_script_ratios(self, text: str) -> dict:
        """
        Compute the share of each script among ALL characters of ``text``.

        The denominator is ``len(text)``, so whitespace, digits and
        punctuation are counted under "other".

        Args:
            text: Input text

        Returns:
            Dictionary with keys "devanagari", "latin" and "other" mapping
            to fractions of the total length (all 0.0 for empty input)
        """
        if not text:
            return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}

        length = len(text)
        # The predicates return booleans, so summing them counts matches.
        n_devanagari = sum(map(is_devanagari_char, text))
        n_latin = sum(map(is_latin_char, text))
        n_other = length - n_devanagari - n_latin

        counts = (
            ("devanagari", n_devanagari),
            ("latin", n_latin),
            ("other", n_other),
        )
        return {script: count / length for script, count in counts}
|
| |
|
| |
|
def detect_language(text: str) -> Tuple[str, float]:
    """
    Detect language of text.

    Detection priority:
    1. Mixed Devanagari + Latin text where each script makes up at least
       HINGLISH_MIN_RATIO of the alphabetic characters is Hinglish
    2. Otherwise langdetect decides; only 'en' and 'hi' results are accepted
    3. Any other langdetect result, or a LangDetectException, falls back to
       character-based detection
    4. Empty input, non-string input and unexpected errors yield
       (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)

    Args:
        text: Input message

    Returns:
        Tuple of (language_code, confidence)
        language_code: 'en', 'hi', or 'hinglish'
        confidence: 0.0-1.0

    Raises:
        No exceptions are expected - falsy or non-string input returns the
        default result instead of failing on ``text.strip()``.
    """
    start_time = time.time()

    # Truthy non-string input (e.g. an int) would raise AttributeError on
    # .strip() below, before any error handling is active.
    if text and not isinstance(text, str):
        logger.debug(
            f"Non-string input ({type(text).__name__}) provided, returning default"
        )
        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)

    if not text or not text.strip():
        logger.debug("Empty text provided, returning default")
        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)

    text = text.strip()

    # Computed once, outside the try, so every fallback path can reuse them.
    has_dev = has_devanagari(text)
    has_lat = has_latin(text)

    try:
        if has_dev and has_lat:
            ratios = _get_script_ratios(text)
            minority_share = min(ratios["devanagari"], ratios["latin"])

            # Both scripts need a meaningful share; a stray character of the
            # other script should not turn the text into Hinglish.
            if minority_share >= HINGLISH_MIN_RATIO:
                # The more balanced the mix, the higher the confidence
                # (capped at 0.95).
                confidence = min(0.95, 0.7 + (minority_share * 2))
                _log_detection("hinglish", confidence, start_time)
                return ("hinglish", confidence)

        detected_langs = detect_langs(text)

        if detected_langs:
            top_detection = detected_langs[0]
            lang_code = top_detection.lang
            confidence = top_detection.prob

            if lang_code in ("en", "hi"):
                _log_detection(lang_code, confidence, start_time)
                return (lang_code, confidence)

        # langdetect returned nothing or an unsupported language.
        return _character_based_detection(text, has_dev, has_lat, start_time)

    except LangDetectException as e:
        logger.debug(f"LangDetect exception: {e}")
        return _character_based_detection(text, has_dev, has_lat, start_time)

    except Exception as e:
        logger.warning(f"Language detection error: {e}")
        _log_detection(DEFAULT_LANGUAGE, ERROR_CONFIDENCE, start_time)
        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)
|
| |
|
| |
|
def _character_based_detection(
    text: str,
    has_dev: bool,
    has_lat: bool,
    start_time: float
) -> Tuple[str, float]:
    """
    Pick a language purely from which scripts are present.

    Used when langdetect cannot supply an 'en'/'hi' answer. Confidences are
    fixed per case: both scripts -> ('hinglish', 0.7), only Devanagari ->
    ('hi', 0.85), only Latin -> ('en', 0.75), neither ->
    (DEFAULT_LANGUAGE, 0.5).

    Args:
        text: Input text (not inspected here; the flags carry the result)
        has_dev: Whether text contains Devanagari
        has_lat: Whether text contains Latin
        start_time: Detection start time for logging

    Returns:
        Tuple of (language_code, confidence)
    """
    if has_dev:
        verdict = ("hinglish", 0.7) if has_lat else ("hi", 0.85)
    elif has_lat:
        verdict = ("en", 0.75)
    else:
        # No recognizable script at all (digits, symbols, other alphabets).
        verdict = (DEFAULT_LANGUAGE, 0.5)

    language, confidence = verdict
    _log_detection(language, confidence, start_time)
    return verdict
|
| |
|
| |
|
| | def _get_script_ratios(text: str) -> dict:
|
| | """
|
| | Calculate the ratio of different scripts in text.
|
| |
|
| | Args:
|
| | text: Input text
|
| |
|
| | Returns:
|
| | Dictionary with ratios for each script type
|
| | """
|
| | if not text:
|
| | return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}
|
| |
|
| |
|
| | alpha_chars = [char for char in text if char.isalpha()]
|
| |
|
| | if not alpha_chars:
|
| | return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}
|
| |
|
| | total_alpha = len(alpha_chars)
|
| | devanagari_count = sum(1 for char in alpha_chars if is_devanagari_char(char))
|
| | latin_count = sum(1 for char in alpha_chars if is_latin_char(char))
|
| | other_count = total_alpha - devanagari_count - latin_count
|
| |
|
| | return {
|
| | "devanagari": devanagari_count / total_alpha,
|
| | "latin": latin_count / total_alpha,
|
| | "other": other_count / total_alpha,
|
| | }
|
| |
|
| |
|
def _log_detection(lang: str, confidence: float, start_time: float) -> None:
    """
    Emit a debug log line for a finished detection.

    Args:
        lang: Detected language code
        confidence: Confidence reported for the detection
        start_time: ``time.time()`` value captured when detection began;
            used to compute the elapsed time in milliseconds
    """
    now = time.time()
    elapsed_ms = (now - start_time) * 1000
    message = f"Detected language: {lang}, confidence: {confidence:.2f}, time: {elapsed_ms:.2f}ms"
    logger.debug(message)
|
| |
|
| |
|
def has_devanagari(text: str) -> bool:
    """
    Report whether any character of text is Devanagari.

    Membership is decided per character by ``is_devanagari_char``.

    Args:
        text: Input text

    Returns:
        True as soon as one Devanagari character is found; False for empty
        or falsy input and for text without such characters
    """
    if not text:
        return False
    for ch in text:
        if is_devanagari_char(ch):
            return True
    return False
|
| |
|
| |
|
def has_latin(text: str) -> bool:
    """
    Report whether any character of text is a Latin letter.

    Membership is decided per character by ``is_latin_char``.

    Args:
        text: Input text

    Returns:
        True as soon as one Latin character is found; False for empty or
        falsy input and for text without such characters
    """
    if not text:
        return False
    for ch in text:
        if is_latin_char(ch):
            return True
    return False
|
| |
|
| |
|
def is_devanagari_char(char: str) -> bool:
    """
    Tell whether a character lies in the Devanagari Unicode block.

    The block spans U+0900 through U+097F inclusive.

    Args:
        char: Single character

    Returns:
        True if char compares within the inclusive range U+0900..U+097F
    """
    block_start = "\u0900"
    block_end = "\u097F"
    below_block = char < block_start
    above_block = char > block_end
    return not (below_block or above_block)
|
| |
|
| |
|
def is_latin_char(char: str) -> bool:
    """
    Check if a single character is an ASCII Latin letter.

    The character is compared directly against the a-z and A-Z ranges
    rather than via ``char.lower()``: lowercasing maps some non-ASCII
    characters into the a-z range (U+212A KELVIN SIGN becomes "k",
    U+0130 becomes "i" plus a combining dot), which would wrongly count
    them as ASCII letters.

    Args:
        char: Single character

    Returns:
        True if char is exactly one character in a-z or A-Z; False for
        anything else, including the empty string and longer strings
    """
    # String comparison is lexicographic, so a longer string such as "ab"
    # would otherwise pass the range check on its first character alone.
    if len(char) != 1:
        return False
    return "a" <= char <= "z" or "A" <= char <= "Z"
|
| |
|
| |
|
def get_language_name(code: str) -> str:
    """
    Translate a language code into a display name.

    Lookup is exact and case-sensitive.

    Args:
        code: Language code ('en', 'hi', 'hinglish')

    Returns:
        "English", "Hindi" or "Hinglish (Code-Mixed)" for the known codes,
        "Unknown" for anything else
    """
    display_names = dict(
        en="English",
        hi="Hindi",
        hinglish="Hinglish (Code-Mixed)",
    )
    try:
        return display_names[code]
    except KeyError:
        return "Unknown"
|
| |
|