# ============================================================
# FINAL 16GB SAFE TRANSLATION SYSTEM
# NLLB + RAG + LLM (POST-EDITOR)
# Jawa → Indonesia → English
# ============================================================

import torch
import faiss
import pandas as pd
import gradio as gr

from sentence_transformers import SentenceTransformer
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM
)

device = "cuda" if torch.cuda.is_available() else "cpu"

# ============================================================
# NLLB MODEL
# ============================================================
NLLB = "facebook/nllb-200-distilled-600M"

nllb_tok = AutoTokenizer.from_pretrained(NLLB)
nllb = AutoModelForSeq2SeqLM.from_pretrained(NLLB).to(device)
nllb.eval()

# NLLB language codes used by this pipeline.
JV = "jav_Latn"
ID = "ind_Latn"
EN = "eng_Latn"

# ============================================================
# LIGHTWEIGHT LLM (POST-EDITOR)
# ============================================================
LLM = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# =========================
# 1. LOAD TOKENIZER
# =========================
llm_tok = AutoTokenizer.from_pretrained(LLM)

# =========================
# 2. FIX PAD TOKEN (REQUIRED)
# =========================
# Reuse EOS as the padding token so generation always has a pad id.
llm_tok.pad_token = llm_tok.eos_token
llm_tok.pad_token_id = llm_tok.eos_token_id

# =========================
# 3. LOAD MODEL
# =========================
# Half precision on GPU, full precision on CPU.
llm = AutoModelForCausalLM.from_pretrained(
    LLM,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    low_cpu_mem_usage=True
).to(device)
llm.eval()

# ============================================================
# KNOWLEDGE BASE
# ============================================================
kb = pd.read_csv("kb_jawa_ngoko_krama_indonesia_100k.csv")

# Drop incomplete rows before casting: astype(str) would otherwise turn
# missing cells into the literal string "nan", which would then be
# embedded and indexed like a real sentence.
kb = kb.dropna(subset=["jv", "id"]).reset_index(drop=True)
kb["jv"] = kb["jv"].astype(str)
kb["id"] = kb["id"].astype(str)

# (Javanese, Indonesian) pairs, in the same row order as the FAISS index.
pairs = list(zip(kb["jv"], kb["id"]))

# ============================================================
# RAG INDEX
# ============================================================
embedder = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

emb = embedder.encode(
    kb["jv"].tolist(),
    batch_size=128,
    convert_to_numpy=True,
    show_progress_bar=True
)

index = faiss.IndexFlatL2(emb.shape[1])
index.add(emb)


def retrieve(text, k=5):
    """Return the nearest knowledge-base pairs for *text*.

    Args:
        text: Query sentence; it is embedded with ``embedder``.
        k: Maximum number of neighbours to return.

    Returns:
        A newline-joined string of ``"<jv> → <id>"`` lines, or an empty
        string when the index holds no vectors.
    """
    # Never ask for more neighbours than the index holds.
    k = min(k, index.ntotal)
    if k <= 0:
        return ""

    query = embedder.encode([text], convert_to_numpy=True)
    _, neighbours = index.search(query, k)

    # FAISS reports missing neighbours as -1; as a Python index that would
    # silently select the last pair, so skip negative ids.
    return "\n".join(
        f"{pairs[i][0]} → {pairs[i][1]}"
        for i in neighbours[0]
        if i >= 0
    )


# ============================================================
# TRANSLATION CORE
# ============================================================
def nllb_translate(text, src, tgt):
    """Translate *text* from language *src* to language *tgt* with NLLB.

    Args:
        text: Source sentence; tokenized input is truncated to 512 tokens.
        src: Source language code (e.g. ``JV``).
        tgt: Target language code (e.g. ``ID``), forced as the first
            generated token.

    Returns:
        The decoded translation, or ``""`` for empty/whitespace-only input.
    """
    # Nothing to translate: skip the model call entirely.
    if not text or not text.strip():
        return ""

    nllb_tok.src_lang = src

    inputs = nllb_tok(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(device)

    out = nllb.generate(
        **inputs,
        forced_bos_token_id=nllb_tok.convert_tokens_to_ids(tgt),
        max_length=256
    )

    return nllb_tok.decode(out[0], skip_special_tokens=True)


# ============================================================
# POST-EDITING (NO LEAK)
# ============================================================
def refine_id(text, context):
    """Post-edit an Indonesian translation with the lightweight LLM.

    Args:
        text: Raw Indonesian sentence to polish.
        context: Retrieved reference pairs. NOTE(review): this argument is
            accepted for interface compatibility but is not inserted into
            the prompt, so it currently has no effect on the output.

    Returns:
        The generated rewrite with surrounding whitespace stripped. Falls
        back to *text* when the model produces nothing, and returns ``""``
        for empty/whitespace-only input.
    """
    if not text or not text.strip():
        return ""

    prompt = f"""
Perbaiki hasil terjemahan Bahasa Indonesia agar alami dan baku.
Jangan menerjemahkan ulang.
Jangan menambahkan informasi baru.
Jangan menampilkan referensi.

Kalimat:
{text}

Versi yang lebih alami:
"""

    inp = llm_tok(prompt, return_tensors="pt").to(device)

    # Greedy decoding: no temperature is passed because it is ignored when
    # do_sample=False. pad_token_id is given explicitly.
    out = llm.generate(
        **inp,
        max_new_tokens=80,
        do_sample=False,
        pad_token_id=llm_tok.pad_token_id
    )

    # For a causal LM the returned sequence starts with the prompt tokens,
    # so decode only the continuation instead of splitting the full text
    # on a marker string that the model could also emit itself.
    prompt_len = inp["input_ids"].shape[-1]
    refined = llm_tok.decode(
        out[0][prompt_len:],
        skip_special_tokens=True
    ).strip()

    # An empty generation must not wipe out the translation.
    return refined or text.strip()


# ============================================================
# FULL PIPELINE
# ============================================================
def pipeline(text):
    """Run the full Javanese → Indonesian → English translation.

    Args:
        text: Javanese input from the UI.

    Returns:
        A tuple ``(indonesian, english)``; both are ``""`` when the input
        is empty or whitespace-only.
    """
    if not text or not text.strip():
        return "", ""

    # Jawa → Indonesia
    id_raw = nllb_translate(text, JV, ID)

    # RAG grounding (hidden); refine_id currently ignores this context.
    ctx = retrieve(text)

    # Post-edit
    id_final = refine_id(id_raw, ctx)

    # Indonesia → English
    en_final = nllb_translate(id_final, ID, EN)

    return id_final, en_final


# ============================================================
# GRADIO UI
# ============================================================
with gr.Blocks(title="Mesin Translasi Bahasa Jawa") as demo:
    gr.Markdown(
        """
        ## 🌾 Mesin Translasi Bahasa Jawa
        **Jawa → Indonesia → English**

        ✔ NLLB-200 (Meta)
        ✔ RAG Parallel Corpus (hidden)
        ✔ LLM post-editor (linguistic refinement)
        ✔ Output bersih tanpa referensi
        ✔ Aman 16GB

        ---
        """
    )

    inp = gr.Textbox(
        label="Input Bahasa Jawa",
        lines=4,
        placeholder="Contoh: Aku arep lunga menyang pasar sesuk."
    )

    with gr.Row():
        out_id = gr.Textbox(label="Bahasa Indonesia", lines=4)
        out_en = gr.Textbox(label="English Translation", lines=4)

    gr.Button("🔄 Terjemahkan").click(
        pipeline,
        inp,
        [out_id, out_en]
    )

demo.launch()