# scam/app/models/language.py
# Repository page residue (not part of the module): "Gankit12's picture",
# "Upload 129 files", "31f0e50 verified".
"""
Language Detection Module.
Provides multi-language detection for:
- English (en)
- Hindi (hi)
- Hinglish (code-mixed Hindi and English)
Uses langdetect library with custom Hinglish detection logic.
Performance target: <100ms per detection.
"""
import time
from typing import Tuple, Optional
import langdetect
from langdetect import detect_langs, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from app.utils.logger import get_logger
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)
# Set seed for reproducible results in langdetect
DetectorFactory.seed = 0
# Supported language codes (the codes detect_language can return)
SUPPORTED_LANGUAGES = {"en", "hi", "hinglish"}
# Default fallback values
# DEFAULT_LANGUAGE is returned for empty input, unexpected errors, and text
# with no recognizable script.
DEFAULT_LANGUAGE = "en"
# NOTE(review): DEFAULT_CONFIDENCE is not referenced anywhere in the visible
# code - confirm whether other modules import it.
DEFAULT_CONFIDENCE = 0.3
# Confidence reported by detect_language for empty input and unexpected errors.
ERROR_CONFIDENCE = 0.3
# Hinglish detection threshold - minimum ratio of each script type
# (Devanagari and Latin) among the alphabetic characters of the text.
HINGLISH_MIN_RATIO = 0.1
class LanguageDetector:
    """
    Detector for English, Hindi, and Hinglish text.

    A thin object-oriented wrapper around the module-level detection helpers,
    which combine the langdetect library with script-based Hinglish checks.

    Attributes:
        _initialized: True once the constructor has applied the langdetect
            seed without raising.
    """

    def __init__(self) -> None:
        """
        Set up the detector.

        Re-applies the langdetect seed so detection results are reproducible.
        Any failure is logged and leaves ``_initialized`` as False instead of
        propagating.
        """
        self._initialized = False
        try:
            # A fixed seed makes langdetect deterministic.
            DetectorFactory.seed = 0
            self._initialized = True
            logger.debug("LanguageDetector initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize LanguageDetector: {e}")
            self._initialized = False

    def detect(self, text: str) -> Tuple[str, float]:
        """
        Detect the language of ``text``.

        Delegates to the module-level ``detect_language`` function.

        Args:
            text: Input text to analyze

        Returns:
            Tuple of (language_code, confidence)
            language_code: 'en', 'hi', or 'hinglish'
            confidence: 0.0-1.0
        """
        result = detect_language(text)
        return result

    def is_hinglish(self, text: str) -> bool:
        """
        Report whether ``text`` mixes Devanagari and Latin characters.

        This is a pure presence check: one character of each script is
        enough, no minimum ratio is applied.

        Args:
            text: Input text

        Returns:
            True if text contains both Devanagari and Latin characters
        """
        if not has_devanagari(text):
            return False
        return has_latin(text)

    def get_script_ratios(self, text: str) -> dict:
        """
        Compute the share of each script among ALL characters of ``text``.

        Whitespace, digits and punctuation are part of the denominator and
        end up in the "other" bucket.

        Args:
            text: Input text

        Returns:
            Dictionary with "devanagari", "latin" and "other" ratios
            (all 0.0 for empty input)
        """
        if not text:
            return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}

        length = len(text)
        dev_hits = 0
        latin_hits = 0
        for ch in text:
            if is_devanagari_char(ch):
                dev_hits += 1
            if is_latin_char(ch):
                latin_hits += 1
        remainder = length - dev_hits - latin_hits

        return {
            "devanagari": dev_hits / length,
            "latin": latin_hits / length,
            "other": remainder / length,
        }
def detect_language(text: str) -> Tuple[str, float]:
    """
    Detect language of text.

    Detection priority:
        1. Check for Hinglish (mixed scripts) first
        2. Use langdetect for primary detection
        3. Fallback to character-based detection if langdetect fails or
           reports a language other than English/Hindi
        4. Default to English with low confidence on error

    Args:
        text: Input message

    Returns:
        Tuple of (language_code, confidence)
        language_code: 'en', 'hi', or 'hinglish'
        confidence: 0.0-1.0

    Raises:
        No exceptions - invalid input and detection errors are mapped to the
        default language with ERROR_CONFIDENCE.
    """
    start_time = time.time()

    # Reject non-string input up front: calling .strip() on it would raise
    # before the try block and break the "never raises" contract.
    if text is not None and not isinstance(text, str):
        logger.warning(
            f"Language detection error: expected str, got {type(text).__name__}"
        )
        _log_detection(DEFAULT_LANGUAGE, ERROR_CONFIDENCE, start_time)
        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)

    # Validate input
    if not text or not text.strip():
        logger.debug("Empty text provided, returning default")
        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)

    text = text.strip()

    # Bound before the try so both the normal path and the
    # LangDetectException handler can hand them to the fallback.
    has_dev = has_lat = False
    try:
        # Step 1: Check for Hinglish (code-mixed) first
        # Hinglish contains both Devanagari and Latin characters
        has_dev = has_devanagari(text)
        has_lat = has_latin(text)
        if has_dev and has_lat:
            # Calculate script ratios for confidence
            ratios = _get_script_ratios(text)
            dev_significant = ratios["devanagari"] >= HINGLISH_MIN_RATIO
            lat_significant = ratios["latin"] >= HINGLISH_MIN_RATIO
            # Both scripts must have significant presence for Hinglish
            if dev_significant and lat_significant:
                confidence = min(0.95, 0.7 + (min(ratios["devanagari"], ratios["latin"]) * 2))
                _log_detection("hinglish", confidence, start_time)
                return ("hinglish", confidence)
            # A script below the threshold is treated as absent from here on,
            # so the character-based fallback cannot report "hinglish" for a
            # text that was just rejected as Hinglish.
            has_dev, has_lat = dev_significant, lat_significant

        # Step 2: Use langdetect for primary detection
        detected_langs = detect_langs(text)
        if detected_langs:
            top_detection = detected_langs[0]
            lang_code = top_detection.lang
            confidence = top_detection.prob
            # Only English and Hindi map directly onto supported categories.
            if lang_code in ("en", "hi"):
                _log_detection(lang_code, confidence, start_time)
                return (lang_code, confidence)

        # Unsupported language or no detection result: use the
        # character-based fallback.
        return _character_based_detection(text, has_dev, has_lat, start_time)
    except LangDetectException as e:
        logger.debug(f"LangDetect exception: {e}")
        # Fallback to character-based detection
        return _character_based_detection(text, has_dev, has_lat, start_time)
    except Exception as e:
        logger.warning(f"Language detection error: {e}")
        _log_detection(DEFAULT_LANGUAGE, ERROR_CONFIDENCE, start_time)
        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)
def _character_based_detection(
    text: str,
    has_dev: bool,
    has_lat: bool,
    start_time: float
) -> Tuple[str, float]:
    """
    Pick a language purely from which scripts are present.

    Used when langdetect cannot provide a supported answer.

    Args:
        text: Input text (not inspected here; the flags carry the analysis)
        has_dev: Whether text contains Devanagari
        has_lat: Whether text contains Latin
        start_time: Detection start time for logging

    Returns:
        Tuple of (language_code, confidence)
    """
    if has_dev:
        # Devanagari plus Latin means code-mixed; Devanagari alone is Hindi.
        outcome = ("hinglish", 0.7) if has_lat else ("hi", 0.85)
    elif has_lat:
        outcome = ("en", 0.75)
    else:
        # No recognizable characters
        outcome = (DEFAULT_LANGUAGE, 0.5)

    _log_detection(outcome[0], outcome[1], start_time)
    return outcome
def _get_script_ratios(text: str) -> dict:
"""
Calculate the ratio of different scripts in text.
Args:
text: Input text
Returns:
Dictionary with ratios for each script type
"""
if not text:
return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}
# Only count alphabetic characters (ignore spaces, numbers, punctuation)
alpha_chars = [char for char in text if char.isalpha()]
if not alpha_chars:
return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}
total_alpha = len(alpha_chars)
devanagari_count = sum(1 for char in alpha_chars if is_devanagari_char(char))
latin_count = sum(1 for char in alpha_chars if is_latin_char(char))
other_count = total_alpha - devanagari_count - latin_count
return {
"devanagari": devanagari_count / total_alpha,
"latin": latin_count / total_alpha,
"other": other_count / total_alpha,
}
def _log_detection(lang: str, confidence: float, start_time: float) -> None:
    """
    Emit a debug log line with the detection result and elapsed time.

    Args:
        lang: Detected language code
        confidence: Confidence reported for the detection
        start_time: ``time.time()`` value captured when detection started
    """
    finished_at = time.time()
    elapsed_ms = 1000 * (finished_at - start_time)
    logger.debug(f"Detected language: {lang}, confidence: {confidence:.2f}, time: {elapsed_ms:.2f}ms")
def has_devanagari(text: str) -> bool:
    """
    Report whether any character of text lies in the Devanagari block.

    Devanagari Unicode range: U+0900 to U+097F

    Args:
        text: Input text

    Returns:
        True if at least one character is Devanagari; False for empty or
        missing text
    """
    # Falsy input (None, "") is scanned as an empty string.
    for ch in text or "":
        if is_devanagari_char(ch):
            return True
    return False
def has_latin(text: str) -> bool:
    """
    Report whether any character of text is classified as Latin.

    Classification of individual characters is delegated to
    ``is_latin_char``.

    Args:
        text: Input text

    Returns:
        True if at least one character is Latin; False for empty or
        missing text
    """
    # Falsy input (None, "") is scanned as an empty string.
    for ch in text or "":
        if is_latin_char(ch):
            return True
    return False
def is_devanagari_char(char: str) -> bool:
    """
    Tell whether a single character falls in the Devanagari Unicode block.

    Args:
        char: Single character

    Returns:
        True if the character lies between U+0900 and U+097F inclusive
    """
    block_start = "\u0900"
    block_end = "\u097F"
    return block_start <= char and char <= block_end
def is_latin_char(char: str) -> bool:
    """
    Check if a single character is an ASCII Latin letter.

    The character is compared against the a-z and A-Z ranges directly.
    Lower-casing first would misclassify non-ASCII characters whose lowercase
    form starts with an ASCII letter (e.g. U+212A KELVIN SIGN lowers to "k",
    U+0130 lowers to "i" followed by a combining dot).

    Args:
        char: Single character

    Returns:
        True if character is an ASCII letter (a-z, A-Z); False for anything
        else, including empty or multi-character strings
    """
    # String comparison is lexicographic, so anything that is not exactly
    # one character must be rejected explicitly.
    if len(char) != 1:
        return False
    return "a" <= char <= "z" or "A" <= char <= "Z"
def get_language_name(code: str) -> str:
    """
    Translate a language code into its display name.

    Args:
        code: Language code ('en', 'hi', 'hinglish')

    Returns:
        Human-readable language name, or "Unknown" for any other code
    """
    display_names = {
        "en": "English",
        "hi": "Hindi",
        "hinglish": "Hinglish (Code-Mixed)",
    }
    try:
        return display_names[code]
    except KeyError:
        return "Unknown"