Spaces:

DIrtyCha
/

PainReportHuggingFace

Sleeping

App Files Files Community

PainReportHuggingFace / Backend /utils /language_detector.py

DIrtyCha

Initial commit from PainReport

acaf471 about 1 month ago

raw

history blame contribute delete

3.47 kB

	"""
	Language Detection Utility
	Detects the language of input text for multilingual pain assessment
	Supports: Chinese, Korean, Spanish, Hmong, English
	"""
	import re
	from typing import Literal

	LanguageCode = Literal['zh', 'ko', 'es', 'hmong', 'en']

	# Characters treated as Spanish-specific (matched case-insensitively).
	_SPANISH_CHAR_RE = re.compile(r'[ñáéíóúü¿¡]', re.IGNORECASE)

	# Common Spanish words; two distinct whole-word hits mark text as Spanish.
	_SPANISH_KEYWORDS = frozenset({
	    'tengo', 'dolor', 'muy', 'que', 'para', 'con', 'por',
	    'esta', 'tiene', 'cuando', 'donde', 'como', 'agudo', 'punzante',
	})

	# Common Hmong words; two distinct whole-word hits mark text as Hmong.
	_HMONG_KEYWORDS = frozenset({
	    'mob', 'txoj', 'kev', 'kuv', 'koj', 'nws', 'lawv',
	    'ntawm', 'rau', 'los', 'thiab', 'muaj', 'yog', 'tsis',
	})

	# Share of letters that must belong to a script for it to win.
	_SCRIPT_RATIO_THRESHOLD = 0.3


	def detect_language(text: str) -> LanguageCode:
	    """
	    Detect the primary language of the input text.

	    Uses character- and keyword-based heuristics, checked in this order:
	    - Chinese: more than 30% of the letters are CJK Unified Ideographs
	      (U+4E00–U+9FFF)
	    - Korean: more than 30% of the letters are Hangul Syllables
	      (U+AC00–U+D7A3)
	    - Spanish: any Spanish-specific character (ñ, á, é, í, ó, ú, ü, ¿, ¡)
	      or at least two distinct common Spanish words
	    - Hmong: at least two distinct common Hmong words
	    - English: default fallback (also used for empty/whitespace-only input)

	    Keywords are compared against whole words, so punctuation next to a
	    word does not hide it and a keyword embedded in a longer word (for
	    example 'mob' inside 'mobile') does not count.

	    Args:
	        text: Input text to detect language from

	    Returns:
	        Language code: 'zh', 'ko', 'es', 'hmong', or 'en'
	    """
	    if not text or not text.strip():
	        return 'en'

	    # Count characters by script.
	    chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
	    korean_chars = len(re.findall(r'[\uac00-\ud7a3]', text))

	    # Only letters form the denominator: digits, punctuation and symbols are
	    # language-neutral and must not dilute the ratio (e.g. "痛 8/10").
	    letter_count = sum(1 for ch in text if ch.isalpha())

	    if letter_count:
	        if chinese_chars / letter_count > _SCRIPT_RATIO_THRESHOLD:
	            return 'zh'
	        if korean_chars / letter_count > _SCRIPT_RATIO_THRESHOLD:
	            return 'ko'

	    # Distinct lower-cased words; \w+ strips adjacent punctuation.
	    words = set(re.findall(r'\w+', text.lower()))

	    # Spanish is checked before Hmong, matching the documented order above.
	    has_spanish_chars = _SPANISH_CHAR_RE.search(text) is not None
	    if has_spanish_chars or len(words & _SPANISH_KEYWORDS) >= 2:
	        return 'es'

	    if len(words & _HMONG_KEYWORDS) >= 2:
	        return 'hmong'

	    # Default to English
	    return 'en'

	def get_language_name(code: LanguageCode) -> str:
	    """
	    Map a language code to its English display name.

	    Args:
	        code: Language code ('zh', 'ko', 'es', 'hmong' or 'en')

	    Returns:
	        The matching language name, or 'Unknown' for any other code
	    """
	    display_names = dict(
	        zh='Chinese',
	        ko='Korean',
	        es='Spanish',
	        hmong='Hmong',
	        en='English',
	    )
	    try:
	        return display_names[code]
	    except KeyError:
	        # Unrecognised codes fall back to a placeholder instead of raising.
	        return 'Unknown'

	# Test cases
	if __name__ == '__main__':
	    # Manual smoke check: run the detector over one sample per supported
	    # language and print a pass/fail line for each (sample, expected) pair.
	    samples = [
	        ("我有火辣辣的疼痛", "zh"),
	        ("허리가 따끔거리듯이 아프다", "ko"),
	        ("Tengo un dolor agudo y punzante", "es"),
	        ("Kuv mob mob heev", "hmong"),
	        ("I have a sharp stabbing pain", "en"),
	        ("My back hurts so bad", "en"),
	    ]

	    print("Language Detection Tests:")
	    print("=" * 60)
	    for sample, want in samples:
	        got = detect_language(sample)
	        if got == want:
	            mark = "✅"
	        else:
	            mark = "❌"
	        print(f"{mark} '{sample}'")
	        print(f" Expected: {want}, Detected: {got} ({get_language_name(got)})")
	        print()