Piyazon committed on
Commit
6fea37c
·
1 Parent(s): fc3f1b1
Files changed (5) hide show
  1. .gitignore +5 -0
  2. app.py +156 -0
  3. detect_language.py +139 -0
  4. lid.176.bin +3 -0
  5. requirements.txt +8 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ .DS_Store
3
+ .env
4
+ .vscode/
5
+ *.pyc
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import logging
from detect_language import detect_language

# --- 1. SETUP ---
# Silence tokenizer warnings (e.g. sequence-length notices) so they do not
# clutter the app logs.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
MODEL_ID = "piyazon/uyghur_translate_v1"

# Pick the best available device: CUDA GPU, then Apple-Silicon MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Loading model: {MODEL_ID} on device: {device}...")

# Seq2seq translation model loaded once at import time. The FLORES-200
# language codes used below suggest an NLLB-style checkpoint — TODO confirm.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID).to(device)
print("Model loaded successfully.")

# UI dropdown label -> FLORES-200 target-language code.
LANG_CODES = {
    "Uyghur": "uig_Arab",
    "English": "eng_Latn",
    "Chinese (Simplified)": "zho_Hans",
}
# --- 2. TRANSLATION LOGIC ---
def predict(text, tgt_choice):
    """Translate *text* into the language selected by *tgt_choice*.

    Args:
        text: Raw user input; the source language is auto-detected.
        tgt_choice: One of the LANG_CODES display names ("Uyghur", ...).

    Returns:
        A pair (translation_update, status_message): a gr.update(...) for the
        output textbox carrying the translation plus its RTL/LTR direction,
        and a Markdown string describing the detected source language.
    """
    # Guard against empty AND whitespace-only input. The original only
    # checked falsiness, so "   " fell through to detection/generation.
    if not text or not text.strip():
        # Return: translation_update, status_markdown
        return gr.update(value="", rtl=False, text_align="left"), ""

    # A. AUTO DETECT SOURCE
    try:
        src_lang, detected_code, conf = detect_language(text)
        status_message = f"Detected Language: {detected_code}, Confidence: {conf:.2f}"
        print(f"Detected: {detected_code} -> Using Source: {src_lang}")
    except Exception:
        # Best-effort fallback: keep the request alive rather than erroring.
        src_lang = "eng_Latn"
        status_message = "Detected Language: Unknown (Defaulting to English)"

    # B. GET TARGET
    tgt_lang = LANG_CODES.get(tgt_choice, "uig_Arab")
    tokenizer.src_lang = src_lang

    # C. GENERATE
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)

    # Force the decoder to emit the target-language token first (NLLB-style).
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            forced_bos_token_id=forced_bos_token_id,
            max_new_tokens=256,
            num_beams=4,
            no_repeat_ngram_size=3
        )

    translation = tokenizer.batch_decode(out, skip_special_tokens=True)[0]

    # D. HANDLE UI DIRECTION — Uyghur is written right-to-left.
    if tgt_choice == "Uyghur":
        translation_update = gr.update(value=translation, rtl=True, text_align="right")
    else:
        translation_update = gr.update(value=translation, rtl=False, text_align="left")

    return translation_update, status_message
title = "Uyghur Translate"
description = "An AI-powered translator that auto-detects source language and translates to your chosen target language."

# Custom CSS belongs to the gr.Blocks() constructor; Blocks.launch() has no
# `css` parameter, so the original `demo.launch(css=custom_css)` was broken
# (TypeError on current Gradio). Defined here so it can be passed to Blocks.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic&display=swap');
textarea {
direction: auto !important; /* matches LTR/RTL automatically */
text-align: start !important; /* matches LTR/RTL automatically */
font-family: "Noto Sans Arabic" !important;
}
.table-wrap{font-family: "Noto Sans Arabic" !important;}
"""


with gr.Blocks(
    theme=gr.themes.Glass(),
    css=custom_css,
    title=title) as demo:
    gr.Markdown(
        f"""
        <div style="text-align:center">
        <h1 style="margin-bottom:0.25rem">{title}</h1>
        <p style="margin-top:0">{description}</p>
        </div>
        """
    )

    with gr.Row():
        # LEFT: input + status
        with gr.Column(scale=1):
            src_text = gr.Textbox(
                label="Input Text (Auto Detect)",
                placeholder="...",
                lines=5,
                elem_id="src_text",
            )
            src_status = gr.Markdown(value="")  # status under input (left)

        # RIGHT: target language + translation
        with gr.Column(scale=1):
            tgt_lang_dropdown = gr.Dropdown(
                choices=list(LANG_CODES.keys()),
                value="Uyghur",
                label="Target Language"
            )
            tgt_text = gr.Textbox(
                label="Translation",
                lines=5,
                interactive=False
            )

    translate_btn = gr.Button("Translate", variant="primary")

    # Examples (clicking an example will also run predict)
    gr.Examples(
        examples=[
            ["Hello, how are you today?", "Uyghur"],
            ["The radius of the Earth is 6371 km.", "Uyghur"],
            ["今天天气很好。", "English"],
            ["ياخشىمۇسىز؟", "Chinese (Simplified)"],
        ],
        inputs=[src_text, tgt_lang_dropdown],
        outputs=[tgt_text, src_status],
        fn=predict,
        cache_examples=False
    )

    translate_btn.click(
        fn=predict,
        inputs=[src_text, tgt_lang_dropdown],
        outputs=[tgt_text, src_status]
    )

demo.launch()
detect_language.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import fasttext
import re

# fastText language-identification model (lid.176.bin, covers 176 languages),
# loaded once at import time; the file is expected next to this module.
lid_model = fasttext.load_model("lid.176.bin")

# fastText ISO-639 label -> FLORES-200 code ("lang_Script") used by the
# translation model. Labels missing from this table fall back to English
# in detect_language().
FT_TO_NORMAL = {
    "af": "afr_Latn",
    "als": "gsw_Latn",
    "am": "amh_Ethi",
    "ar": "arb_Arab",
    "arz": "arz_Arab",
    "as": "asm_Beng",
    "ast": "ast_Latn",
    "az": "azj_Latn",
    "azb": "azb_Arab",
    "ba": "bak_Cyrl",
    "bcl": "bcl_Latn",
    "be": "bel_Cyrl",
    "bg": "bul_Cyrl",
    "bh": "bho_Deva",
    "bn": "ben_Beng",
    "bo": "bod_Tibt",
    "bs": "bos_Latn",
    "ca": "cat_Latn",
    "ceb": "ceb_Latn",
    "ckb": "ckb_Arab",
    "cs": "ces_Latn",
    "cy": "cym_Latn",
    "da": "dan_Latn",
    "de": "deu_Latn",
    "el": "ell_Grek",
    "en": "eng_Latn",
    "eo": "epo_Latn",
    "es": "spa_Latn",
    "et": "est_Latn",
    "eu": "eus_Latn",
    "fa": "pes_Arab",
    "fi": "fin_Latn",
    "fr": "fra_Latn",
    "ga": "gle_Latn",
    "gd": "gla_Latn",
    "gl": "glg_Latn",
    "gn": "grn_Latn",
    "gu": "guj_Gujr",
    "he": "heb_Hebr",
    "hi": "hin_Deva",
    "hr": "hrv_Latn",
    "ht": "hat_Latn",
    "hu": "hun_Latn",
    "hy": "hye_Armn",
    "id": "ind_Latn",
    "ilo": "ilo_Latn",
    "is": "isl_Latn",
    "it": "ita_Latn",
    "ja": "jpn_Jpan",
    "jv": "jav_Latn",
    "ka": "kat_Geor",
    "kk": "kaz_Cyrl",
    "km": "khm_Khmr",
    "kn": "kan_Knda",
    "ko": "kor_Hang",
    "ku": "kmr_Latn",
    "ky": "kir_Cyrl",
    "lb": "ltz_Latn",
    "li": "lim_Latn",
    "lmo": "lmo_Latn",
    "lo": "lao_Laoo",
    "lt": "lit_Latn",
    "lv": "lvs_Latn",
    "mai": "mai_Deva",
    "mg": "plt_Latn",
    "min": "min_Latn",
    "mk": "mkd_Cyrl",
    "ml": "mal_Mlym",
    "mn": "khk_Cyrl",
    "mr": "mar_Deva",
    "ms": "zsm_Latn",
    "mt": "mlt_Latn",
    "my": "mya_Mymr",
    "ne": "npi_Deva",
    "nl": "nld_Latn",
    "nn": "nno_Latn",
    "no": "nob_Latn",
    "oc": "oci_Latn",
    "or": "ory_Orya",
    "pa": "pan_Guru",
    "pl": "pol_Latn",
    "ps": "pbt_Arab",
    "pt": "por_Latn",
    "qu": "quy_Latn",
    "ro": "ron_Latn",
    "ru": "rus_Cyrl",
    "sa": "san_Deva",
    "sc": "srd_Latn",
    "scn": "scn_Latn",
    "sd": "snd_Arab",
    "sh": "hrv_Latn",
    "si": "sin_Sinh",
    "sk": "slk_Latn",
    "sl": "slv_Latn",
    "so": "som_Latn",
    "sq": "sqi_Latn",
    "sr": "srp_Cyrl",
    "su": "sun_Latn",
    "sv": "swe_Latn",
    "sw": "swh_Latn",
    "ta": "tam_Taml",
    "te": "tel_Telu",
    "tg": "tgk_Cyrl",
    "th": "tha_Thai",
    "tk": "tuk_Latn",
    "tl": "tgl_Latn",
    "tr": "tur_Latn",
    "tt": "tat_Cyrl",
    "ug": "uig_Arab",
    "uk": "ukr_Cyrl",
    "ur": "urd_Arab",
    "uz": "uzn_Latn",
    "vec": "vec_Latn",
    "vi": "vie_Latn",
    "war": "war_Latn",
    "yi": "ydd_Hebr",
    "yo": "yor_Latn",
    "yue": "yue_Hant",
    "zh": "zho_Hans"
}
def detect_language(text: str) -> tuple[str, str, float]:
    """Identify the language of *text* with the fastText LID model.

    Returns:
        (flores_code, raw_label, confidence) — e.g. ("eng_Latn", "en", 0.98).
        Falls back to ("eng_Latn", "en", 0.0) for empty/whitespace-only
        input, and to "eng_Latn" for labels absent from FT_TO_NORMAL.
    """
    # Collapse whitespace: fastText's predict() rejects embedded newlines,
    # and stray whitespace adds noise to identification.
    t = re.sub(r"\s+", " ", text.strip())
    if not t:
        # BUG FIX: the original returned a bare string here while the normal
        # path returns a 3-tuple, so callers unpacking three values crashed
        # on whitespace-only input. Return a consistent tuple instead.
        return "eng_Latn", "en", 0.0

    labels, probs = lid_model.predict(t, k=1)
    lang = labels[0].replace("__label__", "")  # e.g., "en", "ug", "zh"
    return FT_TO_NORMAL.get(lang, "eng_Latn"), lang, float(probs[0])
lid.176.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e
3
+ size 131266198
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ numpy
4
+ huggingface_hub
5
+ gradio
6
+ fasttext
7
+ sentencepiece
8
+ protobuf