Hateshield-bn / models /language_detector.py
sgAtdbd's picture
Initial deployment of HateShield backend
8ad9255
raw
history blame
2.66 kB
from langdetect import detect, DetectorFactory, LangDetectException
import re
# Pin langdetect's random seed at import time so that repeated calls to
# detect() on the same text give the same answer (the library's detection
# is otherwise non-deterministic, notably on short or ambiguous input).
DetectorFactory.seed = 0
def detect_language(text: str) -> str:
    """
    Classify text as "english", "bengali", "mixed", or "unknown".

    Script inspection comes first: characters in the Bengali Unicode block
    (U+0980-U+09FF) and ASCII letters are counted. langdetect is consulted
    only when the text contains no Bengali characters.

    Returns:
        "unknown"  - empty input, fewer than 3 characters after stripping,
                     or no usable signal from either strategy.
        "bengali"  - only Bengali script, or Bengali is more than 80% of the
                     Bengali + ASCII letters.
        "english"  - ASCII letters are more than 80% of those letters, or the
                     text has ASCII letters and no Bengali script.
        "mixed"    - both scripts present and neither exceeds 80%.
    """
    # Too little content to say anything meaningful.
    if not text or len(text.strip()) < 3:
        return "unknown"

    bn_count = len(re.findall(r'[\u0980-\u09FF]', text))
    en_count = len(re.findall(r'[a-zA-Z]', text))

    # Both scripts present: decide by which one dominates (strictly > 80%).
    if bn_count and en_count:
        letters = bn_count + en_count
        if bn_count / letters > 0.8:
            return "bengali"
        if en_count / letters > 0.8:
            return "english"
        return "mixed"

    # Bengali script with no ASCII letters at all.
    if bn_count:
        return "bengali"

    # No Bengali script from here on. Ask langdetect; when it fails or
    # reports some other language, Latin-letter text is still treated as
    # English, while text with neither script is reported as unknown.
    default_label = "english" if en_count else "unknown"
    try:
        code = detect(text)
    except LangDetectException:
        return default_label

    return {"en": "english", "bn": "bengali"}.get(code, default_label)
def get_language_script_info(text: str) -> dict:
    """
    Get a per-category character breakdown of the scripts used in text.
    Useful for debugging and fine-tuning.

    The categories are mutually exclusive and always add up to
    "total_length":

    - "bengali_characters": code points in the Bengali block
      (U+0980-U+09FF), including Bengali numerals.
    - "english_characters": ASCII letters a-z / A-Z.
    - "digits": decimal digits outside the Bengali block.
    - "other_characters": everything else (whitespace, punctuation, other
      scripts).
    - "total_length": len(text).
    """
    bengali_chars = len(re.findall(r'[\u0980-\u09FF]', text))
    english_chars = len(re.findall(r'[a-zA-Z]', text))
    # On str patterns \d matches any Unicode decimal digit, which includes
    # Bengali numerals already counted above. Exclude the Bengali block so
    # no character is counted twice and other_chars can never go negative.
    digits = len(re.findall(r'(?![\u0980-\u09FF])\d', text))
    other_chars = len(text) - bengali_chars - english_chars - digits

    return {
        "bengali_characters": bengali_chars,
        "english_characters": english_chars,
        "digits": digits,
        "other_characters": other_chars,
        "total_length": len(text),
    }