Spaces:

Gankit12
/

scam

Sleeping

App Files Files Community

scam / app /models /language.py

Gankit12

Upload 129 files

31f0e50 verified about 1 month ago

raw

history blame contribute delete

10.4 kB

	"""
	Language Detection Module.

	Provides multi-language detection for:
	- English (en)
	- Hindi (hi)
	- Hinglish (code-mixed Hindi and English)

	Uses langdetect library with custom Hinglish detection logic.
	Performance target: <100ms per detection.
	"""

	import time
	from typing import Tuple, Optional

	import langdetect
	from langdetect import detect_langs, DetectorFactory
	from langdetect.lang_detect_exception import LangDetectException

	from app.utils.logger import get_logger

	# Module-level logger obtained from the project's logging helper.
	logger = get_logger(__name__)

	# Set seed for reproducible results in langdetect.
	# NOTE(review): this is a class attribute on DetectorFactory, so the
	# assignment is process-wide and runs as an import side effect.
	DetectorFactory.seed = 0

	# Language codes this module can return.
	# NOTE(review): not referenced elsewhere in the visible part of this module.
	SUPPORTED_LANGUAGES = {"en", "hi", "hinglish"}

	# Default fallback values.
	# DEFAULT_LANGUAGE is the code returned when nothing better can be decided.
	DEFAULT_LANGUAGE = "en"
	# NOTE(review): DEFAULT_CONFIDENCE is not referenced in the visible part of
	# this module - confirm whether external callers use it.
	DEFAULT_CONFIDENCE = 0.3
	# Confidence reported by detect_language() for empty input and for
	# unexpected errors.
	ERROR_CONFIDENCE = 0.3

	# Hinglish detection threshold: detect_language() requires BOTH the
	# Devanagari ratio and the Latin ratio to be at least this value before it
	# classifies mixed-script text as Hinglish.
	HINGLISH_MIN_RATIO = 0.1


	class LanguageDetector:
	    """
	    Object-style facade over this module's language detection helpers.

	    Classifies text as English ('en'), Hindi ('hi') or Hinglish
	    ('hinglish') by delegating to the module-level functions.

	    Attributes:
	        _initialized: True once the constructor finished its setup without
	            raising, False otherwise.
	    """

	    def __init__(self) -> None:
	        """
	        Prepare the detector.

	        Re-applies the langdetect seed and records whether that succeeded
	        in ``_initialized``. Failures are logged, never raised.
	        """
	        self._initialized = False
	        try:
	            # Same seed the module sets at import time.
	            DetectorFactory.seed = 0
	            self._initialized = True
	            logger.debug("LanguageDetector initialized successfully")
	        except Exception as e:
	            logger.error(f"Failed to initialize LanguageDetector: {e}")
	            self._initialized = False

	    def detect(self, text: str) -> Tuple[str, float]:
	        """
	        Classify ``text`` by delegating to ``detect_language``.

	        Args:
	            text: Input text to analyze

	        Returns:
	            Tuple of (language_code, confidence), exactly as produced by
	            ``detect_language``.
	        """
	        result = detect_language(text)
	        return result

	    def is_hinglish(self, text: str) -> bool:
	        """
	        Report whether ``text`` mixes Devanagari and Latin characters.

	        This is a pure presence check: one character of each script is
	        enough. No ratio threshold is applied here.

	        Args:
	            text: Input text

	        Returns:
	            True if text contains both Devanagari and Latin characters
	        """
	        if not has_devanagari(text):
	            return False
	        return has_latin(text)

	    def get_script_ratios(self, text: str) -> dict:
	        """
	        Compute the share of each script among ALL characters of ``text``.

	        The denominator is ``len(text)``, so whitespace, digits and
	        punctuation are counted under "other".

	        Args:
	            text: Input text

	        Returns:
	            Dictionary with keys "devanagari", "latin" and "other"; all
	            values are 0.0 for empty input.
	        """
	        if not text:
	            return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}

	        length = len(text)
	        n_devanagari = sum(map(is_devanagari_char, text))
	        n_latin = sum(map(is_latin_char, text))
	        n_other = length - n_devanagari - n_latin

	        ratios = {}
	        ratios["devanagari"] = n_devanagari / length
	        ratios["latin"] = n_latin / length
	        ratios["other"] = n_other / length
	        return ratios


	def detect_language(text: str) -> Tuple[str, float]:
	    """
	    Detect language of text.

	    Detection priority:
	    1. Reject unusable input (None, non-string, empty, whitespace-only)
	    2. Check for Hinglish (mixed scripts) first
	    3. Use langdetect for primary detection
	    4. Fallback to character-based detection if langdetect raises
	       LangDetectException, returns nothing, or reports a language
	       other than 'en' / 'hi'
	    5. Default to English with low confidence on any other error

	    Args:
	        text: Input message

	    Returns:
	        Tuple of (language_code, confidence)
	        language_code: 'en', 'hi', or 'hinglish'
	        confidence: 0.0-1.0

	    Raises:
	        No exceptions - every failure path returns a fallback tuple.
	    """
	    start_time = time.time()

	    # Validate the type before calling any str method. The check lives
	    # outside the try block, so without it a truthy non-string (e.g. an
	    # int) would raise AttributeError on .strip().
	    if text is not None and not isinstance(text, str):
	        logger.warning(
	            f"Language detection error: expected str, got {type(text).__name__}"
	        )
	        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)

	    if not text or not text.strip():
	        logger.debug("Empty text provided, returning default")
	        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)

	    text = text.strip()

	    # Script presence is needed by both the Hinglish check and every
	    # fallback path, so compute it once. These are plain character
	    # comparisons on a str.
	    has_dev = has_devanagari(text)
	    has_lat = has_latin(text)

	    try:
	        # Step 1: Hinglish (code-mixed) - both scripts present AND each
	        # one reaching the minimum ratio.
	        if has_dev and has_lat:
	            ratios = _get_script_ratios(text)
	            minority_ratio = min(ratios["devanagari"], ratios["latin"])

	            if minority_ratio >= HINGLISH_MIN_RATIO:
	                # More balanced mixes score higher, capped at 0.95.
	                confidence = min(0.95, 0.7 + (minority_ratio * 2))
	                _log_detection("hinglish", confidence, start_time)
	                return ("hinglish", confidence)

	        # Step 2: langdetect; only the top-ranked candidate is considered.
	        detected_langs = detect_langs(text)

	        if detected_langs:
	            top_detection = detected_langs[0]
	            lang_code = top_detection.lang
	            confidence = top_detection.prob

	            if lang_code in ("en", "hi"):
	                _log_detection(lang_code, confidence, start_time)
	                return (lang_code, confidence)

	        # Unsupported language or no result: decide from the scripts seen.
	        return _character_based_detection(text, has_dev, has_lat, start_time)

	    except LangDetectException as e:
	        logger.debug(f"LangDetect exception: {e}")
	        # Fallback to character-based detection, reusing the flags
	        # computed above instead of rescanning the text.
	        return _character_based_detection(text, has_dev, has_lat, start_time)

	    except Exception as e:
	        logger.warning(f"Language detection error: {e}")
	        _log_detection(DEFAULT_LANGUAGE, ERROR_CONFIDENCE, start_time)
	        return (DEFAULT_LANGUAGE, ERROR_CONFIDENCE)


	def _character_based_detection(
	    text: str,
	    has_dev: bool,
	    has_lat: bool,
	    start_time: float
	) -> Tuple[str, float]:
	    """
	    Decide the language purely from which scripts are present.

	    Used when langdetect gives no usable answer. Each outcome carries a
	    fixed confidence.

	    Args:
	        text: Input text (not inspected here; the flags carry the information)
	        has_dev: Whether text contains Devanagari
	        has_lat: Whether text contains Latin
	        start_time: Detection start time for logging

	    Returns:
	        Tuple of (language_code, confidence):
	        both scripts -> ("hinglish", 0.7); Devanagari only -> ("hi", 0.85);
	        Latin only -> ("en", 0.75); neither -> (DEFAULT_LANGUAGE, 0.5)
	    """
	    if has_dev:
	        outcome = ("hinglish", 0.7) if has_lat else ("hi", 0.85)
	    elif has_lat:
	        outcome = ("en", 0.75)
	    else:
	        # No recognizable characters
	        outcome = (DEFAULT_LANGUAGE, 0.5)

	    _log_detection(outcome[0], outcome[1], start_time)
	    return outcome


	def _get_script_ratios(text: str) -> dict:
	    """
	    Compute the share of each script among the ALPHABETIC characters.

	    Characters for which ``str.isalpha()`` is false (spaces, digits,
	    punctuation, ...) are dropped before counting, so they affect neither
	    the numerators nor the denominator.

	    Args:
	        text: Input text

	    Returns:
	        Dictionary with keys "devanagari", "latin" and "other"; all values
	        are 0.0 when the text is empty or has no alphabetic characters.
	    """
	    if not text:
	        return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}

	    # Keep letters only
	    letters = []
	    for symbol in text:
	        if symbol.isalpha():
	            letters.append(symbol)

	    if not letters:
	        return {"devanagari": 0.0, "latin": 0.0, "other": 0.0}

	    letter_total = len(letters)
	    n_devanagari = sum(map(is_devanagari_char, letters))
	    n_latin = sum(map(is_latin_char, letters))
	    n_other = letter_total - n_devanagari - n_latin

	    ratios = {}
	    ratios["devanagari"] = n_devanagari / letter_total
	    ratios["latin"] = n_latin / letter_total
	    ratios["other"] = n_other / letter_total
	    return ratios


	def _log_detection(lang: str, confidence: float, start_time: float) -> None:
	    """
	    Emit a debug log line for a finished detection.

	    Args:
	        lang: Language code that was decided on
	        confidence: Confidence attached to that decision
	        start_time: ``time.time()`` value captured when detection began;
	            used to report the elapsed time in milliseconds
	    """
	    finished_at = time.time()
	    elapsed_ms = (finished_at - start_time) * 1000
	    logger.debug(f"Detected language: {lang}, confidence: {confidence:.2f}, time: {elapsed_ms:.2f}ms")


	def has_devanagari(text: str) -> bool:
	    """
	    Report whether at least one character of ``text`` is Devanagari.

	    Membership is decided per character by ``is_devanagari_char``.

	    Args:
	        text: Input text

	    Returns:
	        True if any character is Devanagari; False otherwise, including
	        for empty or None input.
	    """
	    if not text:
	        return False

	    # Stop scanning at the first hit
	    for symbol in text:
	        if is_devanagari_char(symbol):
	            return True
	    return False


	def has_latin(text: str) -> bool:
	    """
	    Report whether at least one character of ``text`` is Latin.

	    Membership is decided per character by ``is_latin_char``.

	    Args:
	        text: Input text

	    Returns:
	        True if any character is Latin; False otherwise, including for
	        empty or None input.
	    """
	    if not text:
	        return False

	    # Stop scanning at the first hit
	    for symbol in text:
	        if is_latin_char(symbol):
	            return True
	    return False


	def is_devanagari_char(char: str) -> bool:
	    """
	    Tell whether ``char`` lies in the range U+0900 to U+097F (inclusive).

	    Args:
	        char: Single character

	    Returns:
	        True if the character falls inside that range, False otherwise
	    """
	    range_start, range_end = "\u0900", "\u097F"

	    if char < range_start:
	        return False
	    return char <= range_end


	def is_latin_char(char: str) -> bool:
	    """
	    Check if a single character is an ASCII letter (a-z or A-Z).

	    The character is compared as-is, without case folding. Lowercasing
	    first would wrongly accept non-ASCII characters whose lowercase form
	    starts with an ASCII letter, such as U+212A KELVIN SIGN (lowercases
	    to "k") and U+0130 (lowercases to "i" plus a combining dot).

	    Args:
	        char: Single character

	    Returns:
	        True if character is an ASCII letter, False otherwise (including
	        for the empty string)
	    """
	    return ("a" <= char <= "z") or ("A" <= char <= "Z")


	def get_language_name(code: str) -> str:
	    """
	    Translate a language code into a display name.

	    Args:
	        code: Language code ('en', 'hi', 'hinglish')

	    Returns:
	        The display name for a known code, or "Unknown" for anything else
	    """
	    display_names = dict(
	        en="English",
	        hi="Hindi",
	        hinglish="Hinglish (Code-Mixed)",
	    )
	    try:
	        return display_names[code]
	    except KeyError:
	        return "Unknown"