File size: 3,468 Bytes
acaf471
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Language Detection Utility
Detects the language of input text for multilingual pain assessment
Supports: Chinese, Korean, Spanish, Hmong, English
"""
import re
from typing import Literal

LanguageCode = Literal['zh', 'ko', 'es', 'hmong', 'en']

# A script must account for more than this share of the non-whitespace
# characters before the text is attributed to that script's language.
_SCRIPT_RATIO_THRESHOLD = 0.3

# Keyword-based detection needs at least this many distinct keyword hits.
_MIN_KEYWORD_MATCHES = 2

_SPANISH_KEYWORDS = frozenset({
    'tengo', 'dolor', 'muy', 'que', 'para', 'con', 'por',
    'esta', 'tiene', 'cuando', 'donde', 'como', 'agudo', 'punzante',
})

_HMONG_KEYWORDS = frozenset({
    'mob', 'txoj', 'kev', 'kuv', 'koj', 'nws', 'lawv',
    'ntawm', 'rau', 'los', 'thiab', 'muaj', 'yog', 'tsis',
})

# Runs of Unicode letters (no digits, underscores or punctuation).
_WORD_RE = re.compile(r'[^\W\d_]+')


def detect_language(text: str) -> LanguageCode:
    """
    Detect the primary language of the input text.

    Heuristics are applied in this order; the first one that matches wins:
    - Chinese: more than 30% of non-whitespace characters are CJK Unified
      Ideographs (U+4E00–U+9FFF)
    - Korean: more than 30% of non-whitespace characters are Hangul
      Syllables (U+AC00–U+D7A3)
    - Spanish: any Spanish-specific character (ñ, á, é, í, ó, ú, ü, ¿, ¡,
      either case) OR at least 2 distinct common Spanish words
    - Hmong: at least 2 distinct common Hmong words
    - English: default fallback

    Keywords are compared against whole words only, so punctuation next to a
    word does not hide it ("Tengo dolor." is Spanish) and a keyword embedded
    in a longer word does not count ("lost" is not the Hmong word "los").

    Note: a single accented character is enough to classify the text as
    Spanish, so loanwords such as "café" in English text are reported as 'es'.

    Args:
        text: Input text to detect language from

    Returns:
        Language code: 'zh', 'ko', 'es', 'hmong', or 'en'
    """
    if not text or not text.strip():
        return 'en'

    # Total characters (excluding whitespace); guards the ratio divisions below.
    total_chars = len(re.findall(r'\S', text))
    if total_chars == 0:
        return 'en'

    # Script-based detection: Chinese is checked before Korean.
    chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
    if chinese_chars / total_chars > _SCRIPT_RATIO_THRESHOLD:
        return 'zh'

    korean_chars = len(re.findall(r'[\uac00-\ud7a3]', text))
    if korean_chars / total_chars > _SCRIPT_RATIO_THRESHOLD:
        return 'ko'

    # Tokenise once; a set gives "distinct keyword" counting for free.
    words = set(_WORD_RE.findall(text.lower()))

    # Spanish detection (Spanish-specific characters OR common Spanish words)
    spanish_chars = len(re.findall(r'[ñáéíóúü¿¡]', text, re.IGNORECASE))
    spanish_word_matches = len(words & _SPANISH_KEYWORDS)
    if spanish_chars > 0 or spanish_word_matches >= _MIN_KEYWORD_MATCHES:
        return 'es'

    # Hmong detection (heuristic: common Hmong words, whole-word matches only)
    hmong_matches = len(words & _HMONG_KEYWORDS)
    if hmong_matches >= _MIN_KEYWORD_MATCHES:
        return 'hmong'

    # Default to English
    return 'en'

def get_language_name(code: LanguageCode) -> str:
    """
    Translate a language code into its English display name.

    Args:
        code: Language code ('zh', 'ko', 'es', 'hmong' or 'en')

    Returns:
        The matching language name, or 'Unknown' if the code is not one of
        the supported values
    """
    display_names = dict(
        zh='Chinese',
        ko='Korean',
        es='Spanish',
        hmong='Hmong',
        en='English',
    )
    try:
        return display_names[code]
    except KeyError:
        # Unsupported code: fall back to a placeholder instead of raising.
        return 'Unknown'

# Manual smoke test: print expected vs. detected language for sample phrases
if __name__ == '__main__':
    samples = (
        ("我有火辣辣的疼痛", "zh"),
        ("허리가 따끔거리듯이 아프다", "ko"),
        ("Tengo un dolor agudo y punzante", "es"),
        ("Kuv mob mob heev", "hmong"),
        ("I have a sharp stabbing pain", "en"),
        ("My back hurts so bad", "en"),
    )

    print("Language Detection Tests:")
    print("=" * 60)
    for phrase, wanted in samples:
        got = detect_language(phrase)
        label = get_language_name(got)
        # Check mark when the detector agrees with the expectation, cross otherwise.
        mark = "❌" if got != wanted else "✅"
        print(f"{mark} '{phrase}'")
        print(f"   Expected: {wanted}, Detected: {got} ({label})")
        print()