Piyazon committed on
Commit
6fea37c
·
1 Parent(s): fc3f1b1
Files changed (5) hide show
  1. .gitignore +5 -0
  2. app.py +156 -0
  3. detect_language.py +139 -0
  4. lid.176.bin +3 -0
  5. requirements.txt +8 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ .DS_Store
3
+ .env
4
+ .vscode/
5
+ *.pyc
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import logging
from detect_language import detect_language

# --- 1. SETUP ---
# Silence tokenizer warnings (e.g. sequence-length notices) so they do not
# clutter the app logs.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
MODEL_ID = "piyazon/uyghur_translate_v1"

# Pick the best available device: CUDA GPU, then Apple-Silicon MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Loading model: {MODEL_ID} on device: {device}...")

# Seq2seq translation model loaded once at import time. The FLORES-200
# language codes used below suggest an NLLB-style checkpoint — TODO confirm.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID).to(device)
print("Model loaded successfully.")

# UI dropdown label -> FLORES-200 target-language code.
LANG_CODES = {
    "Uyghur": "uig_Arab",
    "English": "eng_Latn",
    "Chinese (Simplified)": "zho_Hans",
}
# --- 2. TRANSLATION LOGIC ---
def predict(text, tgt_choice):
    """Translate *text* into the language selected by *tgt_choice*.

    Args:
        text: Raw user input; the source language is auto-detected.
        tgt_choice: One of the LANG_CODES display names ("Uyghur", ...).

    Returns:
        A pair (translation_update, status_message): a gr.update(...) for the
        output textbox carrying the translation plus its RTL/LTR direction,
        and a Markdown string describing the detected source language.
    """
    # Guard against empty AND whitespace-only input. The original only
    # checked falsiness, so "   " fell through to detection/generation.
    if not text or not text.strip():
        # Return: translation_update, status_markdown
        return gr.update(value="", rtl=False, text_align="left"), ""

    # A. AUTO DETECT SOURCE
    try:
        src_lang, detected_code, conf = detect_language(text)
        status_message = f"Detected Language: {detected_code}, Confidence: {conf:.2f}"
        print(f"Detected: {detected_code} -> Using Source: {src_lang}")
    except Exception:
        # Best-effort fallback: keep the request alive rather than erroring.
        src_lang = "eng_Latn"
        status_message = "Detected Language: Unknown (Defaulting to English)"

    # B. GET TARGET
    tgt_lang = LANG_CODES.get(tgt_choice, "uig_Arab")
    tokenizer.src_lang = src_lang

    # C. GENERATE
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)

    # Force the decoder to emit the target-language token first (NLLB-style).
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            forced_bos_token_id=forced_bos_token_id,
            max_new_tokens=256,
            num_beams=4,
            no_repeat_ngram_size=3
        )

    translation = tokenizer.batch_decode(out, skip_special_tokens=True)[0]

    # D. HANDLE UI DIRECTION — Uyghur is written right-to-left.
    if tgt_choice == "Uyghur":
        translation_update = gr.update(value=translation, rtl=True, text_align="right")
    else:
        translation_update = gr.update(value=translation, rtl=False, text_align="left")

    return translation_update, status_message
title = "Uyghur Translate"
description = "An AI-powered translator that auto-detects source language and translates to your chosen target language."

# Custom CSS belongs to the gr.Blocks() constructor; Blocks.launch() has no
# `css` parameter, so the original `demo.launch(css=custom_css)` was broken
# (TypeError on current Gradio). Defined here so it can be passed to Blocks.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic&display=swap');
textarea {
direction: auto !important; /* matches LTR/RTL automatically */
text-align: start !important; /* matches LTR/RTL automatically */
font-family: "Noto Sans Arabic" !important;
}
.table-wrap{font-family: "Noto Sans Arabic" !important;}
"""


with gr.Blocks(
    theme=gr.themes.Glass(),
    css=custom_css,
    title=title) as demo:
    gr.Markdown(
        f"""
        <div style="text-align:center">
        <h1 style="margin-bottom:0.25rem">{title}</h1>
        <p style="margin-top:0">{description}</p>
        </div>
        """
    )

    with gr.Row():
        # LEFT: input + status
        with gr.Column(scale=1):
            src_text = gr.Textbox(
                label="Input Text (Auto Detect)",
                placeholder="...",
                lines=5,
                elem_id="src_text",
            )
            src_status = gr.Markdown(value="")  # status under input (left)

        # RIGHT: target language + translation
        with gr.Column(scale=1):
            tgt_lang_dropdown = gr.Dropdown(
                choices=list(LANG_CODES.keys()),
                value="Uyghur",
                label="Target Language"
            )
            tgt_text = gr.Textbox(
                label="Translation",
                lines=5,
                interactive=False
            )

    translate_btn = gr.Button("Translate", variant="primary")

    # Examples (clicking an example will also run predict)
    gr.Examples(
        examples=[
            ["Hello, how are you today?", "Uyghur"],
            ["The radius of the Earth is 6371 km.", "Uyghur"],
            ["今天天气很好。", "English"],
            ["ياخشىمۇسىز؟", "Chinese (Simplified)"],
        ],
        inputs=[src_text, tgt_lang_dropdown],
        outputs=[tgt_text, src_status],
        fn=predict,
        cache_examples=False
    )

    translate_btn.click(
        fn=predict,
        inputs=[src_text, tgt_lang_dropdown],
        outputs=[tgt_text, src_status]
    )

demo.launch()
detect_language.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import fasttext
import re

# fastText language-identification model (lid.176.bin, covers 176 languages),
# loaded once at import time; the file is expected next to this module.
lid_model = fasttext.load_model("lid.176.bin")

# fastText ISO-639 label -> FLORES-200 code ("lang_Script") used by the
# translation model. Labels missing from this table fall back to English
# in detect_language().
FT_TO_NORMAL = {
    "af": "afr_Latn",
    "als": "gsw_Latn",
    "am": "amh_Ethi",
    "ar": "arb_Arab",
    "arz": "arz_Arab",
    "as": "asm_Beng",
    "ast": "ast_Latn",
    "az": "azj_Latn",
    "azb": "azb_Arab",
    "ba": "bak_Cyrl",
    "bcl": "bcl_Latn",
    "be": "bel_Cyrl",
    "bg": "bul_Cyrl",
    "bh": "bho_Deva",
    "bn": "ben_Beng",
    "bo": "bod_Tibt",
    "bs": "bos_Latn",
    "ca": "cat_Latn",
    "ceb": "ceb_Latn",
    "ckb": "ckb_Arab",
    "cs": "ces_Latn",
    "cy": "cym_Latn",
    "da": "dan_Latn",
    "de": "deu_Latn",
    "el": "ell_Grek",
    "en": "eng_Latn",
    "eo": "epo_Latn",
    "es": "spa_Latn",
    "et": "est_Latn",
    "eu": "eus_Latn",
    "fa": "pes_Arab",
    "fi": "fin_Latn",
    "fr": "fra_Latn",
    "ga": "gle_Latn",
    "gd": "gla_Latn",
    "gl": "glg_Latn",
    "gn": "grn_Latn",
    "gu": "guj_Gujr",
    "he": "heb_Hebr",
    "hi": "hin_Deva",
    "hr": "hrv_Latn",
    "ht": "hat_Latn",
    "hu": "hun_Latn",
    "hy": "hye_Armn",
    "id": "ind_Latn",
    "ilo": "ilo_Latn",
    "is": "isl_Latn",
    "it": "ita_Latn",
    "ja": "jpn_Jpan",
    "jv": "jav_Latn",
    "ka": "kat_Geor",
    "kk": "kaz_Cyrl",
    "km": "khm_Khmr",
    "kn": "kan_Knda",
    "ko": "kor_Hang",
    "ku": "kmr_Latn",
    "ky": "kir_Cyrl",
    "lb": "ltz_Latn",
    "li": "lim_Latn",
    "lmo": "lmo_Latn",
    "lo": "lao_Laoo",
    "lt": "lit_Latn",
    "lv": "lvs_Latn",
    "mai": "mai_Deva",
    "mg": "plt_Latn",
    "min": "min_Latn",
    "mk": "mkd_Cyrl",
    "ml": "mal_Mlym",
    "mn": "khk_Cyrl",
    "mr": "mar_Deva",
    "ms": "zsm_Latn",
    "mt": "mlt_Latn",
    "my": "mya_Mymr",
    "ne": "npi_Deva",
    "nl": "nld_Latn",
    "nn": "nno_Latn",
    "no": "nob_Latn",
    "oc": "oci_Latn",
    "or": "ory_Orya",
    "pa": "pan_Guru",
    "pl": "pol_Latn",
    "ps": "pbt_Arab",
    "pt": "por_Latn",
    "qu": "quy_Latn",
    "ro": "ron_Latn",
    "ru": "rus_Cyrl",
    "sa": "san_Deva",
    "sc": "srd_Latn",
    "scn": "scn_Latn",
    "sd": "snd_Arab",
    "sh": "hrv_Latn",
    "si": "sin_Sinh",
    "sk": "slk_Latn",
    "sl": "slv_Latn",
    "so": "som_Latn",
    "sq": "sqi_Latn",
    "sr": "srp_Cyrl",
    "su": "sun_Latn",
    "sv": "swe_Latn",
    "sw": "swh_Latn",
    "ta": "tam_Taml",
    "te": "tel_Telu",
    "tg": "tgk_Cyrl",
    "th": "tha_Thai",
    "tk": "tuk_Latn",
    "tl": "tgl_Latn",
    "tr": "tur_Latn",
    "tt": "tat_Cyrl",
    "ug": "uig_Arab",
    "uk": "ukr_Cyrl",
    "ur": "urd_Arab",
    "uz": "uzn_Latn",
    "vec": "vec_Latn",
    "vi": "vie_Latn",
    "war": "war_Latn",
    "yi": "ydd_Hebr",
    "yo": "yor_Latn",
    "yue": "yue_Hant",
    "zh": "zho_Hans"
}
def detect_language(text: str) -> tuple[str, str, float]:
    """Identify the language of *text* with the fastText LID model.

    Returns:
        (flores_code, raw_label, confidence) — e.g. ("eng_Latn", "en", 0.98).
        Falls back to ("eng_Latn", "en", 0.0) for empty/whitespace-only
        input, and to "eng_Latn" for labels absent from FT_TO_NORMAL.
    """
    # Collapse whitespace: fastText's predict() rejects embedded newlines,
    # and stray whitespace adds noise to identification.
    t = re.sub(r"\s+", " ", text.strip())
    if not t:
        # BUG FIX: the original returned a bare string here while the normal
        # path returns a 3-tuple, so callers unpacking three values crashed
        # on whitespace-only input. Return a consistent tuple instead.
        return "eng_Latn", "en", 0.0

    labels, probs = lid_model.predict(t, k=1)
    lang = labels[0].replace("__label__", "")  # e.g., "en", "ug", "zh"
    return FT_TO_NORMAL.get(lang, "eng_Latn"), lang, float(probs[0])
lid.176.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e
3
+ size 131266198
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ numpy
4
+ huggingface_hub
5
+ gradio
6
+ fasttext
7
+ sentencepiece
8
+ protobuf