# LLMRAG / app.py
# handaru2002's picture
# Update app.py
# a590ca6 verified
# ============================================================
# FINAL 16GB SAFE TRANSLATION SYSTEM
# NLLB + RAG + LLM (POST-EDITOR)
# Jawa → Indonesia → English
# ============================================================
import torch
import faiss
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import (
AutoTokenizer,
AutoModelForSeq2SeqLM,
AutoModelForCausalLM
)
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ============================================================
# NLLB MODEL
# ============================================================
NLLB = "facebook/nllb-200-distilled-600M"

# Tokenizer and seq2seq model come from the same checkpoint.
nllb_tok = AutoTokenizer.from_pretrained(NLLB)
nllb = AutoModelForSeq2SeqLM.from_pretrained(NLLB)
nllb = nllb.to(device)
nllb.eval()  # switch to evaluation mode

# Language tags handed to the NLLB tokenizer / generator.
JV, ID, EN = "jav_Latn", "ind_Latn", "eng_Latn"
from transformers import AutoTokenizer, AutoModelForCausalLM
# ============================================================
# LIGHTWEIGHT LLM (POST-EDITOR)
# ============================================================
LLM = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# --- 1. Tokenizer ---------------------------------------------------------
llm_tok = AutoTokenizer.from_pretrained(LLM)

# --- 2. Padding token fix (mandatory) -------------------------------------
# Reuse the end-of-sequence token as the padding token.
llm_tok.pad_token = llm_tok.eos_token
llm_tok.pad_token_id = llm_tok.eos_token_id

# --- 3. Model -------------------------------------------------------------
# float16 on CUDA, float32 everywhere else.
llm = AutoModelForCausalLM.from_pretrained(
    LLM,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float32 if device != "cuda" else torch.float16,
)
llm = llm.to(device)
llm.eval()  # switch to evaluation mode
# ============================================================
# KNOWLEDGE BASE
# ============================================================
# Parallel corpus; only the "jv" and "id" columns are read below.
kb = pd.read_csv("kb_jawa_ngoko_krama_indonesia_100k.csv")

# Drop incomplete rows before casting: astype(str) would otherwise turn a
# missing cell into the literal string "nan", which would then be embedded
# and retrieved as if it were a real sentence.
kb = kb.dropna(subset=["jv", "id"]).reset_index(drop=True)

kb["jv"] = kb["jv"].astype(str)
kb["id"] = kb["id"].astype(str)

# (jv, id) tuples in row order, so a row position identifies one pair.
pairs = list(zip(kb["jv"], kb["id"]))
# ============================================================
# RAG INDEX
# ============================================================
embedder = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# Embed every "jv" entry of the knowledge base once, at import time.
emb = embedder.encode(
    kb["jv"].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True,
    batch_size=128,
)

# Flat L2 index sized to the embedding width, filled with all embeddings.
index = faiss.IndexFlatL2(emb.shape[1])
index.add(emb)
def retrieve(text, k=5):
    """Return the knowledge-base pairs nearest to *text*, one per line.

    The query is embedded with the module-level ``embedder`` and searched
    against the module-level FAISS ``index``. Each hit is rendered as
    ``"<jv> → <id>"`` using the matching entry of ``pairs``.

    Args:
        text: Query sentence.
        k: Number of neighbours requested from the index.

    Returns:
        A newline-joined string with at most ``k`` lines; empty when the
        index yields no valid neighbour.
    """
    query = embedder.encode([text])
    _, neighbours = index.search(query, k)
    # FAISS fills missing neighbours with -1 when the index holds fewer than
    # k vectors. A negative position would silently wrap around to the end
    # of `pairs`, so those slots are skipped.
    return "\n".join(
        f"{pairs[i][0]} → {pairs[i][1]}" for i in neighbours[0] if i >= 0
    )
# ============================================================
# TRANSLATION CORE
# ============================================================
def nllb_translate(text, src, tgt):
    """Translate *text* from language *src* to language *tgt* with NLLB.

    Args:
        text: Sentence(s) to translate.
        src: NLLB language tag of the input, set as the tokenizer's
            ``src_lang``.
        tgt: NLLB language tag of the output; its token id is passed to
            ``generate`` as ``forced_bos_token_id``.

    Returns:
        The decoded translation with special tokens removed, or ``""`` when
        *text* is empty or whitespace-only.
    """
    # Nothing to translate: skip the model instead of decoding whatever it
    # generates for an empty source.
    if not text or not text.strip():
        return ""

    nllb_tok.src_lang = src
    inputs = nllb_tok(
        text,
        return_tensors="pt",
        truncation=True,  # input is cut at 512 tokens
        max_length=512,
    ).to(device)
    out = nllb.generate(
        **inputs,
        forced_bos_token_id=nllb_tok.convert_tokens_to_ids(tgt),
        max_length=256,  # cap on the generated sequence length
    )
    return nllb_tok.decode(out[0], skip_special_tokens=True)
# ============================================================
# POST-EDITING (NO LEAK)
# ============================================================
def refine_id(text, context):
    """Post-edit an Indonesian draft translation with the LLM.

    The prompt asks the model to make the sentence natural and standard
    without re-translating, adding information, or showing references.

    Args:
        text: Draft Indonesian sentence to polish.
        context: Retrieved reference pairs. Accepted for interface
            compatibility but not inserted into the prompt.

    Returns:
        The text following the last ``"alami:"`` marker in the decoded
        output, stripped. Falls back to *text* when the model produces
        nothing, and returns ``""`` for empty or whitespace-only input.
    """
    # Nothing to polish: do not ask the model to invent a sentence.
    if not text or not text.strip():
        return ""

    prompt = f"""
Perbaiki hasil terjemahan Bahasa Indonesia agar alami dan baku.
Jangan menerjemahkan ulang.
Jangan menambahkan informasi baru.
Jangan menampilkan referensi.
Kalimat:
{text}
Versi yang lebih alami:
"""
    inp = llm_tok(prompt, return_tensors="pt").to(device)
    # Greedy decoding. `temperature` is not passed: it has no effect when
    # do_sample=False and only triggers a generation-config warning.
    out = llm.generate(
        **inp,
        max_new_tokens=80,
        do_sample=False,
    )
    decoded = llm_tok.decode(out[0], skip_special_tokens=True)
    # The decoded sequence still contains the prompt; keep only what comes
    # after the final "alami:" marker.
    refined = decoded.split("alami:")[-1].strip()
    # An empty answer would wipe out the translation downstream, so keep the
    # unedited draft in that case.
    return refined or text
# ============================================================
# FULL PIPELINE
# ============================================================
def pipeline(text):
    """Run the full Javanese → Indonesian → English chain for the UI.

    Args:
        text: Javanese input.

    Returns:
        A ``(indonesian, english)`` tuple of strings; ``("", "")`` when
        *text* is empty or whitespace-only.
    """
    # Empty submit: skip all three models and clear both output boxes.
    if not text or not text.strip():
        return "", ""

    # Javanese → Indonesian
    id_raw = nllb_translate(text, JV, ID)
    # RAG grounding (hidden from the user)
    ctx = retrieve(text)
    # Post-edit
    id_final = refine_id(id_raw, ctx)
    # Indonesian → English
    en_final = nllb_translate(id_final, ID, EN)
    return id_final, en_final
# ============================================================
# GRADIO UI
# ============================================================
# Gradio front end: one input box, two output boxes, one button that runs
# `pipeline`. The user-facing strings below had mis-decoded UTF-8 (arrows,
# check marks, emoji); they are restored to the intended characters.
with gr.Blocks(title="Mesin Translasi Bahasa Jawa") as demo:
    gr.Markdown(
        """
## 🌾 Mesin Translasi Bahasa Jawa
**Jawa → Indonesia → English**
✔ NLLB-200 (Meta)
✔ RAG Parallel Corpus (hidden)
✔ LLM post-editor (linguistic refinement)
✔ Output bersih tanpa referensi
✔ Aman 16GB
---
"""
    )
    inp = gr.Textbox(
        label="Input Bahasa Jawa",
        lines=4,
        placeholder="Contoh: Aku arep lunga menyang pasar sesuk."
    )
    with gr.Row():
        out_id = gr.Textbox(label="Bahasa Indonesia", lines=4)
        out_en = gr.Textbox(label="English Translation", lines=4)
    # pipeline returns (indonesian, english), matching the output order.
    gr.Button("🔄 Terjemahkan").click(
        pipeline,
        inp,
        [out_id, out_en]
    )

demo.launch()