# Hugging Face Spaces app (page header residue removed; Space state at capture time: "Sleeping")
# ============================================================
# FINAL 16GB SAFE TRANSLATION SYSTEM
# NLLB + RAG + LLM (POST-EDITOR)
# Javanese -> Indonesian -> English
# ============================================================
import torch
import faiss
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
)

# Use the GPU when available; all models below fit in 16 GB either way.
device = "cuda" if torch.cuda.is_available() else "cpu"
# ============================================================
# NLLB TRANSLATION MODEL
# ============================================================
NLLB = "facebook/nllb-200-distilled-600M"

nllb_tok = AutoTokenizer.from_pretrained(NLLB)
nllb = AutoModelForSeq2SeqLM.from_pretrained(NLLB).to(device)
nllb.eval()  # inference only: disables dropout, etc.

# FLORES-200 language codes used throughout the pipeline.
JV = "jav_Latn"  # Javanese
ID = "ind_Latn"  # Indonesian
EN = "eng_Latn"  # English
# (A duplicate `from transformers import AutoTokenizer, AutoModelForCausalLM`
# was removed here — both names are already imported at the top of the file.)
# ============================================================
# LIGHTWEIGHT LLM (POST-EDITOR)
# ============================================================
LLM = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 1. Load the tokenizer.
llm_tok = AutoTokenizer.from_pretrained(LLM)

# 2. Fix the pad token (required): TinyLlama ships without one, and
#    padding/generation code needs it — reusing EOS is the standard fix.
llm_tok.pad_token = llm_tok.eos_token
llm_tok.pad_token_id = llm_tok.eos_token_id

# 3. Load the model. fp16 on GPU halves memory; fp32 on CPU, where fp16
#    inference is slow or unsupported.
llm = AutoModelForCausalLM.from_pretrained(
    LLM,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    low_cpu_mem_usage=True,
).to(device)
llm.eval()
# ============================================================
# KNOWLEDGE BASE (Javanese -> Indonesian parallel corpus)
# ============================================================
kb = pd.read_csv("kb_jawa_ngoko_krama_indonesia_100k.csv")
# Force str dtype: NaN / numeric cells would break embedding and formatting.
kb["jv"] = kb["jv"].astype(str)
kb["id"] = kb["id"].astype(str)
# Aligned (javanese, indonesian) pairs, indexed the same way as the FAISS index.
pairs = list(zip(kb["jv"], kb["id"]))
# ============================================================
# RAG INDEX
# ============================================================
# Multilingual sentence encoder: embeds the Javanese side of the corpus.
embedder = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
emb = embedder.encode(
    kb["jv"].tolist(),
    batch_size=128,
    convert_to_numpy=True,  # FAISS consumes numpy arrays directly
    show_progress_bar=True,
)
# Exact (flat) L2 index; row i corresponds to pairs[i].
index = faiss.IndexFlatL2(emb.shape[1])
index.add(emb)
def retrieve(text, k=5):
    """Return the k knowledge-base pairs nearest to `text`, one per line.

    Embeds the query with the module-level encoder, searches the FAISS L2
    index over the Javanese side, and formats each hit as "jv → id".
    """
    query_vec = embedder.encode([text])
    _, hit_ids = index.search(query_vec, k)
    # NOTE(review): the separator below was mojibake in the captured source;
    # restored to "→" as the most plausible original.
    return "\n".join(f"{pairs[i][0]} → {pairs[i][1]}" for i in hit_ids[0])
# ============================================================
# TRANSLATION CORE
# ============================================================
def nllb_translate(text, src, tgt):
    """Translate `text` from `src` to `tgt` (FLORES-200 codes) with NLLB-200.

    Input is truncated to 512 tokens; output is capped at 256 tokens.
    """
    nllb_tok.src_lang = src
    inputs = nllb_tok(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)
    # inference_mode: skip autograd bookkeeping -> less memory, faster generate.
    with torch.inference_mode():
        out = nllb.generate(
            **inputs,
            # Force the first generated token to the target-language code,
            # which is how NLLB selects its output language.
            forced_bos_token_id=nllb_tok.convert_tokens_to_ids(tgt),
            max_length=256,
        )
    return nllb_tok.decode(out[0], skip_special_tokens=True)
# ============================================================
# POST-EDITING (NO REFERENCE LEAK)
# ============================================================
def refine_id(text, context):
    """Ask the LLM to polish an Indonesian sentence without re-translating it.

    NOTE(review): `context` (the retrieved RAG pairs) is accepted but never
    injected into the prompt — confirm whether grounding was meant to be used.
    """
    prompt = f"""
Perbaiki hasil terjemahan Bahasa Indonesia agar alami dan baku.
Jangan menerjemahkan ulang.
Jangan menambahkan informasi baru.
Jangan menampilkan referensi.
Kalimat:
{text}
Versi yang lebih alami:
"""
    inp = llm_tok(prompt, return_tensors="pt").to(device)
    with torch.inference_mode():
        out = llm.generate(
            **inp,
            max_new_tokens=80,
            # Greedy decoding. (The original also passed temperature=0.15,
            # which is ignored — and warned about — when do_sample=False.)
            do_sample=False,
        )
    decoded = llm_tok.decode(out[0], skip_special_tokens=True)
    # The decoded text echoes the prompt; keep only what follows the final
    # "alami:" marker, i.e. the model's rewrite.
    return decoded.split("alami:")[-1].strip()
# ============================================================
# FULL PIPELINE
# ============================================================
def pipeline(text):
    """Javanese -> Indonesian -> English translation pipeline.

    Returns (indonesian, english). RAG context stays hidden from the user;
    it is only forwarded to the post-editor (see refine_id).
    """
    id_raw = nllb_translate(text, JV, ID)        # Javanese -> Indonesian
    ctx = retrieve(text)                         # hidden RAG grounding
    id_final = refine_id(id_raw, ctx)            # LLM post-edit
    en_final = nllb_translate(id_final, ID, EN)  # Indonesian -> English
    return id_final, en_final
# ============================================================
# GRADIO UI
# ============================================================
with gr.Blocks(title="Mesin Translasi Bahasa Jawa") as demo:
    # NOTE(review): arrows, bullets, and emoji below were mojibake in the
    # captured source; restored to plausible glyphs — confirm against the
    # original Space.
    gr.Markdown(
        """
## 🌾 Mesin Translasi Bahasa Jawa
**Jawa → Indonesia → English**
✅ NLLB-200 (Meta)
✅ RAG Parallel Corpus (hidden)
✅ LLM post-editor (linguistic refinement)
✅ Output bersih tanpa referensi
✅ Aman 16GB
---
"""
    )
    inp = gr.Textbox(
        label="Input Bahasa Jawa",
        lines=4,
        placeholder="Contoh: Aku arep lunga menyang pasar sesuk.",
    )
    with gr.Row():
        out_id = gr.Textbox(label="Bahasa Indonesia", lines=4)
        out_en = gr.Textbox(label="English Translation", lines=4)
    # Wire the button: one Javanese input -> (Indonesian, English) outputs.
    gr.Button("🔁 Terjemahkan").click(
        pipeline,
        inp,
        [out_id, out_en],
    )

demo.launch()