Spaces:

sgAtdbd
/

Hateshield-bn

Sleeping

App Files Files Community

Hateshield-bn / models /language_detector.py

sgAtdbd

Initial deployment of HateShield backend

8ad9255 about 1 month ago

raw

history blame

2.66 kB

	from langdetect import detect, DetectorFactory, LangDetectException
	import re

	# Pin langdetect's random seed at import time so that detect() gives the
	# same answer for the same input on every call. The langdetect README
	# describes the detector as non-deterministic unless a seed is set.
	DetectorFactory.seed = 0

	# Characters in the Unicode Bengali block (letters, signs and digits).
	_BENGALI_CHAR_RE = re.compile(r'[\u0980-\u09FF]')
	# Unaccented ASCII Latin letters only.
	_ENGLISH_CHAR_RE = re.compile(r'[a-zA-Z]')
	# Share of (Bengali + Latin) characters one script must strictly exceed
	# before a text containing both scripts is labelled with that script alone.
	_DOMINANCE_THRESHOLD = 0.8


	def _label_from_langdetect(text: str, default: str) -> str:
	    """Map langdetect's verdict for *text* onto this module's labels.

	    Returns "english" for 'en', "bengali" for 'bn', and *default* for any
	    other detected language or when langdetect raises LangDetectException.
	    """
	    try:
	        detected = detect(text)
	    except LangDetectException:
	        return default
	    if detected == 'en':
	        return "english"
	    if detected == 'bn':
	        return "bengali"
	    return default


	def detect_language(text: str) -> str:
	    """Classify *text* as "english", "bengali", "mixed" or "unknown".

	    Strategy, in order:

	    1. Empty input, or input shorter than 3 characters once stripped,
	       is "unknown".
	    2. If both Bengali-block characters and ASCII Latin letters occur,
	       the script whose share of those characters is strictly above 80%
	       wins; otherwise the text is "mixed".
	    3. Bengali characters only -> "bengali".
	    4. Latin letters only -> langdetect is consulted, but every verdict
	       other than 'bn' (including a detection failure) yields "english".
	    5. Neither script present -> langdetect decides; anything other than
	       'en' / 'bn', or a detection failure, yields "unknown".

	    Args:
	        text: The text to classify.

	    Returns:
	        One of "english", "bengali", "mixed", "unknown".
	    """
	    if not text or len(text.strip()) < 3:
	        return "unknown"

	    # Count each script once; a non-zero count doubles as the presence flag.
	    bengali_count = len(_BENGALI_CHAR_RE.findall(text))
	    english_count = len(_ENGLISH_CHAR_RE.findall(text))

	    if bengali_count and english_count:
	        # Both counts are positive here, so the division is always safe.
	        script_total = bengali_count + english_count
	        if bengali_count / script_total > _DOMINANCE_THRESHOLD:
	            return "bengali"
	        if english_count / script_total > _DOMINANCE_THRESHOLD:
	            return "english"
	        return "mixed"

	    if bengali_count:
	        return "bengali"

	    # Latin-only text falls back to "english"; text with neither script
	    # falls back to "unknown". The langdetect handling is otherwise the same.
	    fallback = "english" if english_count else "unknown"
	    return _label_from_langdetect(text, fallback)

	def get_language_script_info(text: str) -> dict:
	    """Count the characters of *text* by script category.

	    Intended for debugging and fine-tuning the language detection.

	    Every character is assigned to exactly one category, checked in this
	    order, so the four counts always add up to ``len(text)`` and none can
	    be negative:

	    * Bengali: any code point in U+0980..U+09FF. This includes Bengali
	      digits, which are therefore NOT also counted under "digits".
	    * English: ASCII letters a-z / A-Z.
	    * Digits: any other Unicode decimal digit.
	    * Other: everything else (whitespace, punctuation, other scripts).

	    Args:
	        text: The text to analyse.

	    Returns:
	        A dict with the keys "bengali_characters", "english_characters",
	        "digits", "other_characters" and "total_length".
	    """
	    bengali_chars = 0
	    english_chars = 0
	    digits = 0
	    other_chars = 0

	    for ch in text:
	        if '\u0980' <= ch <= '\u09FF':
	            bengali_chars += 1
	        elif 'a' <= ch <= 'z' or 'A' <= ch <= 'Z':
	            english_chars += 1
	        elif ch.isdecimal():
	            # str.isdecimal() covers Unicode category Nd, the same set that
	            # the regex class \d matches on str patterns.
	            digits += 1
	        else:
	            other_chars += 1

	    return {
	        "bengali_characters": bengali_chars,
	        "english_characters": english_chars,
	        "digits": digits,
	        "other_characters": other_chars,
	        "total_length": len(text),
	    }