Spaces:

nivakaran
/

modelx

Sleeping

App Files Files Community

modelx / models /anomaly-detection /src /utils /language_detector.py

nivakaran

Upload folder using huggingface_hub

16ec2cf verified about 1 month ago

raw

history blame contribute delete

6.91 kB

	"""
	models/anomaly-detection/src/utils/language_detector.py
	Language detection for Sinhala/Tamil/English using Unicode script analysis,
	with FastText and lingua-py as statistical fallbacks.
	"""
	import os
	import logging
	from typing import Tuple, Optional
	from pathlib import Path
	import re

	logger = logging.getLogger("language_detector")

	# Optional backends: FastText is preferred, Lingua serves as the fallback.
	# A missing package is recorded in a module-level flag and logged as a
	# warning instead of raising, so the module can still be imported.
	try:
	    import fasttext
	except ImportError:
	    FASTTEXT_AVAILABLE = False
	    logger.warning("FastText not available. Install with: pip install fasttext")
	else:
	    # Swap FastText's eprint hook for a no-op to suppress its warnings.
	    fasttext.FastText.eprint = lambda x: None
	    FASTTEXT_AVAILABLE = True

	try:
	    from lingua import Language, LanguageDetectorBuilder
	except ImportError:
	    LINGUA_AVAILABLE = False
	    logger.warning("Lingua not available. Install with: pip install lingua-language-detector")
	else:
	    LINGUA_AVAILABLE = True


	class LanguageDetector:
	    """
	    Multilingual language detector supporting Sinhala, Tamil, and English.

	    Detection runs in stages: Unicode script analysis first, then FastText
	    (when a model file was loaded), then Lingua (when available), and
	    finally an English default.
	    """

	    # Maps backend-specific labels (ISO codes, FastText "__label__xx"
	    # labels, Lingua enum member names) to the names returned by detect().
	    LANG_MAP = {
	        "en": "english",
	        "si": "sinhala",
	        "ta": "tamil",
	        "__label__en": "english",
	        "__label__si": "sinhala",
	        "__label__ta": "tamil",
	        "ENGLISH": "english",
	        "SINHALA": "sinhala",
	        "TAMIL": "tamil",
	    }

	    # Unicode ranges (inclusive code points) for script detection
	    SINHALA_RANGE = (0x0D80, 0x0DFF)
	    TAMIL_RANGE = (0x0B80, 0x0BFF)

	    # URLs, @mentions and #hashtags are removed before detection so their
	    # Latin characters do not dilute the script ratios. The alternation
	    # bars must be unescaped: an escaped "\|" matches a literal pipe
	    # character and the pattern would then strip nothing.
	    _NOISE_PATTERN = re.compile(r"http\S+|@\w+|#\w+")

	    def __init__(self, models_cache_dir: Optional[str] = None):
	        """
	        Initialize language detector.

	        Creates the cache directory if it does not exist, then attempts to
	        set up the FastText and Lingua backends.

	        Args:
	            models_cache_dir: Directory for cached FastText models.
	                Defaults to a "models_cache" directory three levels above
	                this file.
	        """
	        self.models_cache_dir = models_cache_dir or str(
	            Path(__file__).parent.parent.parent / "models_cache"
	        )
	        Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True)

	        # Both stay None when the corresponding backend cannot be set up;
	        # detect() skips any backend that is None.
	        self.fasttext_model = None
	        self.lingua_detector = None

	        self._init_detectors()

	    def _init_detectors(self):
	        """
	        Initialize detection models.

	        Failures are logged as warnings and leave the corresponding
	        attribute as None; nothing is raised.
	        """
	        # Try FastText: requires both the package and the model file.
	        if FASTTEXT_AVAILABLE:
	            model_path = Path(self.models_cache_dir) / "lid.176.bin"
	            if model_path.exists():
	                try:
	                    self.fasttext_model = fasttext.load_model(str(model_path))
	                    logger.info(f"[LanguageDetector] Loaded FastText model from {model_path}")
	                except Exception as e:
	                    logger.warning(f"[LanguageDetector] Failed to load FastText: {e}")
	            else:
	                logger.warning(f"[LanguageDetector] FastText model not found at {model_path}")
	                logger.info("Download from: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin")

	        # Initialize lingua as fallback
	        if LINGUA_AVAILABLE:
	            try:
	                self.lingua_detector = LanguageDetectorBuilder.from_languages(
	                    Language.ENGLISH,
	                    Language.TAMIL,
	                    # Note: Lingua may not have Sinhala, we'll use script detection
	                ).build()
	                logger.info("[LanguageDetector] Initialized Lingua detector")
	            except Exception as e:
	                logger.warning(f"[LanguageDetector] Failed to init Lingua: {e}")

	    def _detect_by_script(self, text: str) -> Optional[str]:
	        """
	        Detect language by Unicode script analysis.

	        Counts characters in the Sinhala block, the Tamil block, and ASCII
	        letters; all other characters are ignored.

	        Args:
	            text: Text to analyse.

	        Returns:
	            "sinhala" or "tamil" when that script exceeds 30% of the
	            counted characters (Sinhala is checked first), "english" when
	            ASCII letters exceed 50%, otherwise None. None is also
	            returned when no characters were counted.
	        """
	        sinhala_count = 0
	        tamil_count = 0
	        latin_count = 0

	        for char in text:
	            code = ord(char)
	            if self.SINHALA_RANGE[0] <= code <= self.SINHALA_RANGE[1]:
	                sinhala_count += 1
	            elif self.TAMIL_RANGE[0] <= code <= self.TAMIL_RANGE[1]:
	                tamil_count += 1
	            elif char.isalpha() and code < 128:
	                latin_count += 1

	        total_alpha = sinhala_count + tamil_count + latin_count
	        if total_alpha == 0:
	            return None

	        # Threshold-based detection
	        if sinhala_count / total_alpha > 0.3:
	            return "sinhala"
	        if tamil_count / total_alpha > 0.3:
	            return "tamil"
	        if latin_count / total_alpha > 0.5:
	            return "english"

	        return None

	    def detect(self, text: str) -> Tuple[str, float]:
	        """
	        Detect language of text.

	        Args:
	            text: Input text

	        Returns:
	            Tuple of (language_code, confidence)
	            language_code: 'english', 'sinhala', 'tamil', or 'unknown'.
	            'unknown' with confidence 0.0 is returned for empty input,
	            input shorter than 3 non-whitespace-trimmed characters, or
	            input that is empty once URLs/mentions/hashtags are removed.
	        """
	        if not text or len(text.strip()) < 3:
	            return "unknown", 0.0

	        # Strip URLs, @mentions and #hashtags, then surrounding whitespace.
	        clean_text = self._NOISE_PATTERN.sub("", text).strip()

	        if not clean_text:
	            return "unknown", 0.0

	        # 1. First try script detection (most reliable for Sinhala/Tamil)
	        script_lang = self._detect_by_script(clean_text)
	        if script_lang in ("sinhala", "tamil"):
	            return script_lang, 0.95

	        # 2. Try FastText; newlines are replaced because predict() is
	        #    given a single line of text here.
	        if self.fasttext_model:
	            try:
	                predictions = self.fasttext_model.predict(clean_text.replace("\n", " "))
	                label = predictions[0][0]
	                confidence = predictions[1][0]

	                lang = self.LANG_MAP.get(label, "unknown")
	                # Accept only supported languages with confidence above 0.5;
	                # otherwise fall through to the next stage.
	                if lang != "unknown" and confidence > 0.5:
	                    return lang, float(confidence)
	            except Exception as e:
	                # Best-effort backend: log and fall through.
	                logger.debug(f"FastText error: {e}")

	        # 3. Try Lingua
	        if self.lingua_detector:
	            try:
	                detected = self.lingua_detector.detect_language_of(clean_text)
	                if detected:
	                    lang = self.LANG_MAP.get(detected.name, "unknown")
	                    # No confidence is requested from Lingua here; estimate
	                    # it from the text length instead.
	                    confidence = 0.8 if len(clean_text) > 20 else 0.6
	                    return lang, confidence
	            except Exception as e:
	                # Best-effort backend: log and fall through.
	                logger.debug(f"Lingua error: {e}")

	        # 4. Fallback to script detection result or default
	        if script_lang == "english":
	            return "english", 0.7

	        return "english", 0.5  # Default to English


	# Module-wide detector shared by all callers; built lazily by get_detector().
	_detector: Optional[LanguageDetector] = None


	def get_detector(models_cache_dir: Optional[str] = None) -> LanguageDetector:
	    """
	    Return the shared LanguageDetector, constructing it on first use.

	    Args:
	        models_cache_dir: Passed to LanguageDetector only when the shared
	            instance is created; it has no effect once an instance exists.

	    Returns:
	        The module-level LanguageDetector instance.
	    """
	    global _detector
	    if _detector is not None:
	        return _detector
	    _detector = LanguageDetector(models_cache_dir)
	    return _detector


	def detect_language(text: str) -> Tuple[str, float]:
	    """
	    Detect the language of text using the shared detector instance.

	    Args:
	        text: Input text

	    Returns:
	        Tuple of (language: str, confidence: float), exactly as returned
	        by LanguageDetector.detect.
	    """
	    detector = get_detector()
	    return detector.detect(text)