Spaces:
Sleeping
Sleeping
| """ | |
| Language Detection Utility | |
| Detects the language of input text for multilingual pain assessment | |
| Supports: Chinese, Korean, Spanish, Hmong, English | |
| """ | |
| import re | |
| from typing import Literal | |
| LanguageCode = Literal['zh', 'ko', 'es', 'hmong', 'en'] | |
def detect_language(text: str) -> LanguageCode:
    """
    Detect the primary language of the input text.

    Heuristics are applied in this order; the first one that matches wins:
    - Chinese: more than 30% of the non-whitespace characters are
      CJK Unified Ideographs (U+4E00–U+9FFF)
    - Korean: more than 30% of the non-whitespace characters are
      Hangul Syllables (U+AC00–U+D7A3)
    - Spanish: any Spanish-specific character (ñ, á, é, í, ó, ú, ü, ¿, ¡)
      or at least two distinct common Spanish words
    - Hmong: at least two distinct common Hmong words
    - English: default fallback

    Keywords are compared against whole words (runs of letters), so
    punctuation next to a word does not hide it and a keyword embedded in a
    longer word (e.g. 'mob' in 'mobile', 'los' in 'lost') does not count.

    Args:
        text: Input text to detect language from

    Returns:
        Language code: 'zh', 'ko', 'es', 'hmong', or 'en'
    """
    if not text or not text.strip():
        return 'en'

    # Total characters (excluding whitespace); the ratios below use this
    # as their denominator, so keep the zero guard.
    total_chars = len(re.findall(r'\S', text))
    if total_chars == 0:
        return 'en'

    # Chinese detection (>30% CJK characters)
    chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
    if chinese_chars / total_chars > 0.3:
        return 'zh'

    # Korean detection (>30% Hangul characters)
    korean_chars = len(re.findall(r'[\uac00-\ud7a3]', text))
    if korean_chars / total_chars > 0.3:
        return 'ko'

    # Tokenize once into distinct lower-cased words made of letters only.
    # '[^\W\d_]' is "a word character that is neither a digit nor '_'".
    words = set(re.findall(r'[^\W\d_]+', text.lower()))

    # Spanish detection (Spanish-specific characters OR common Spanish words)
    spanish_keywords = {
        'tengo', 'dolor', 'muy', 'que', 'para', 'con', 'por',
        'esta', 'tiene', 'cuando', 'donde', 'como', 'agudo', 'punzante',
    }
    spanish_chars = len(re.findall(r'[ñáéíóúü¿¡]', text, re.IGNORECASE))
    spanish_word_matches = len(words & spanish_keywords)
    if spanish_chars > 0 or spanish_word_matches >= 2:
        return 'es'

    # Hmong detection (heuristic: common Hmong words, whole-word matches only)
    hmong_keywords = {
        'mob', 'txoj', 'kev', 'kuv', 'koj', 'nws', 'lawv',
        'ntawm', 'rau', 'los', 'thiab', 'muaj', 'yog', 'tsis',
    }
    hmong_matches = len(words & hmong_keywords)
    if hmong_matches >= 2:  # At least 2 distinct Hmong keywords
        return 'hmong'

    # Default to English
    return 'en'
def get_language_name(code: LanguageCode) -> str:
    """
    Translate a language code into its English display name.

    Args:
        code: Language code ('zh', 'ko', 'es', 'hmong' or 'en')

    Returns:
        Full language name, or 'Unknown' when the code is not in the table
    """
    display_names = dict(
        zh='Chinese',
        ko='Korean',
        es='Spanish',
        hmong='Hmong',
        en='English',
    )
    try:
        return display_names[code]
    except KeyError:
        # Codes outside the table fall back to a generic label.
        return 'Unknown'
# Manual smoke test: run this module directly to print one pass/fail line
# per sample sentence.
if __name__ == '__main__':
    # (sample text, expected language code)
    samples = (
        ("我有火辣辣的疼痛", "zh"),
        ("허리가 따끔거리듯이 아프다", "ko"),
        ("Tengo un dolor agudo y punzante", "es"),
        ("Kuv mob mob heev", "hmong"),
        ("I have a sharp stabbing pain", "en"),
        ("My back hurts so bad", "en"),
    )

    print("Language Detection Tests:")
    print("=" * 60)

    for text, expected in samples:
        detected = detect_language(text)
        if detected == expected:
            status = "✅"
        else:
            status = "❌"
        print(f"{status} '{text}'")
        print(f" Expected: {expected}, Detected: {detected} ({get_language_name(detected)})")
        print()