File size: 915 Bytes
1e90554
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import unicodedata
from typing import Literal

Lang = Literal["vi", "en", "mix", "unknown"]

# Lowercase Vietnamese letters outside plain ASCII (diacritic vowels and "đ").
VI_CHARS = set("ăâđêôơưáàảãạấầẩẫậắằẳẵặéèẻẽẹếềểễệíìỉĩịóòỏõọốồổỗộớờởỡợúùủũụứừửữựýỳỷỹỵ")
# Plain lowercase ASCII letters.
EN_CHARS = set("abcdefghijklmnopqrstuvwxyz")


def detect_language(text: str) -> Lang:
    """Classify *text* as Vietnamese, English, mixed, or unknown.

    The decision is made per word rather than per character. Vietnamese is
    written with Latin letters, so nearly every Vietnamese word also contains
    plain ASCII letters; a character-level test would label almost all
    Vietnamese text as "mix".

    A word is a maximal run of alphabetic characters. It counts as Vietnamese
    if it contains any character from ``VI_CHARS``; otherwise it counts as
    English if it contains any character from ``EN_CHARS``; any other word is
    ignored.

    Args:
        text: The string to classify. Letter case and Unicode normalization
            form (composed or decomposed) do not affect the result.

    Returns:
        "mix" if both Vietnamese and English words occur, "vi" or "en" if
        only one kind occurs, and "unknown" if neither occurs (including
        for the empty string).

    Note:
        Vietnamese words written without diacritics (e.g. "ra") cannot be
        told apart from English by this heuristic and count as English.
    """
    # Compose decomposed input (base letter + combining marks) so accented
    # letters match the precomposed entries in VI_CHARS.
    normalized = unicodedata.normalize("NFC", text.lower())

    # Turn every non-letter into a separator so digits and punctuation
    # never glue two words together.
    words = "".join(ch if ch.isalpha() else " " for ch in normalized).split()

    has_vi = has_en = False
    for word in words:
        if not VI_CHARS.isdisjoint(word):
            has_vi = True
        elif not EN_CHARS.isdisjoint(word):
            has_en = True
        if has_vi and has_en:
            # Both kinds seen; the remaining words cannot change the answer.
            return "mix"

    if has_vi:
        return "vi"
    if has_en:
        return "en"
    return "unknown"

if __name__ == "__main__":
    # Print the detected label for a few sample phrases.
    samples = (
        "Đi tới phòng khách",
        "Turn on the lights",
        "Đi tới living room",
        "12345 !!!",
    )
    for sample in samples:
        print(sample, "->", detect_language(sample))