Spaces:
Sleeping
Sleeping
File size: 5,228 Bytes
fe5222a 6e6d001 a590ca6 fe5222a 2dadf91 92bcc3d 2dadf91 4a20f56 a590ca6 19ce21c 92bcc3d fe5222a a590ca6 fe5222a 6e6d001 a590ca6 6e6d001 92bcc3d a590ca6 92bcc3d 7698adc a590ca6 92bcc3d a590ca6 92bcc3d f5f26d0 14cf7fa 7698adc 6e6d001 69aa234 7698adc 69aa234 7698adc a590ca6 6e6d001 69aa234 14cf7fa 69aa234 6e6d001 92bcc3d 6e6d001 92bcc3d 4a20f56 92bcc3d 4a20f56 92bcc3d a590ca6 92bcc3d a590ca6 92bcc3d 6e6d001 a590ca6 6e6d001 92bcc3d a590ca6 4a20f56 fe5222a 4a20f56 6e6d001 4a20f56 fe5222a a590ca6 2dadf91 a590ca6 2dadf91 92bcc3d a590ca6 6e6d001 2dadf91 92bcc3d 6e6d001 92bcc3d a590ca6 92bcc3d fe5222a 6e6d001 a590ca6 92bcc3d 2dadf91 a590ca6 2dadf91 a590ca6 92bcc3d 2dadf91 a590ca6 92bcc3d a590ca6 2dadf91 a590ca6 2dadf91 a590ca6 2dadf91 6e6d001 2dadf91 6e6d001 a590ca6 2dadf91 fe5222a a590ca6 fe5222a a590ca6 fe5222a 6e6d001 a590ca6 6e6d001 a590ca6 2dadf91 a590ca6 2dadf91 a590ca6 92bcc3d a590ca6 92bcc3d a590ca6 fe5222a 4a20f56 040925c 2dadf91 6e6d001 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 | # ============================================================
# FINAL 16GB SAFE TRANSLATION SYSTEM
# NLLB + RAG + LLM (POST-EDITOR)
# Jawa → Indonesia → English
# ============================================================
import torch
import faiss
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import (
AutoTokenizer,
AutoModelForSeq2SeqLM,
AutoModelForCausalLM
)
# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ============================================================
# NLLB MODEL
# ============================================================
# Checkpoint shared by the tokenizer and the seq2seq translation model.
NLLB = "facebook/nllb-200-distilled-600M"
nllb_tok = AutoTokenizer.from_pretrained(NLLB)
nllb = AutoModelForSeq2SeqLM.from_pretrained(NLLB)
nllb = nllb.to(device)
nllb.eval()  # inference only

# Language codes handed to the NLLB tokenizer (source side) and to
# generate() as the forced first token (target side).
JV = "jav_Latn"
ID = "ind_Latn"
EN = "eng_Latn"
from transformers import AutoTokenizer, AutoModelForCausalLM
# ============================================================
# LIGHTWEIGHT LLM (POST-EDITOR)
# ============================================================
# Checkpoint of the small causal LM used for post-editing.
LLM = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# -------------------------
# 1. Tokenizer
# -------------------------
llm_tok = AutoTokenizer.from_pretrained(LLM)

# -------------------------
# 2. Pad-token fix (required): reuse the end-of-sequence token as padding.
# -------------------------
llm_tok.pad_token = llm_tok.eos_token
llm_tok.pad_token_id = llm_tok.eos_token_id

# -------------------------
# 3. Model: float16 on CUDA, float32 everywhere else.
# -------------------------
llm = AutoModelForCausalLM.from_pretrained(
    LLM,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float32 if device != "cuda" else torch.float16,
)
llm = llm.to(device)
llm.eval()  # inference only
# ============================================================
# KNOWLEDGE BASE
# ============================================================
# Load the parallel corpus that backs retrieval. Only the "jv" and "id"
# columns are used.
kb = pd.read_csv("kb_jawa_ngoko_krama_indonesia_100k.csv")

# Drop rows where either side is missing *before* casting to str:
# astype(str) would otherwise turn NaN into the literal text "nan", which
# would then be embedded and returned as if it were a real sentence.
kb = kb.dropna(subset=["jv", "id"]).reset_index(drop=True)
kb["jv"] = kb["jv"].astype(str)
kb["id"] = kb["id"].astype(str)

# (jv, id) tuples in row order, so a row position in kb is also the
# position of its pair in this list.
pairs = list(zip(kb["jv"], kb["id"]))
# ============================================================
# RAG INDEX
# ============================================================
# Multilingual sentence encoder used for both the corpus and the queries.
embedder = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# Embed every "jv" entry of the knowledge base once, at start-up.
emb = embedder.encode(
    kb["jv"].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True,
    batch_size=128,
)

# Flat L2 index over the embeddings; vectors are added in kb row order.
index = faiss.IndexFlatL2(emb.shape[1])
index.add(emb)
def retrieve(text, k=5):
    """Return the knowledge-base pairs nearest to ``text`` as one string.

    The query is embedded with the module-level ``embedder`` and looked up
    in the module-level FAISS ``index``. Each hit is rendered as one line
    containing the source and target side of the matching entry in
    ``pairs``.

    Args:
        text: Query sentence.
        k: Maximum number of neighbours to return.

    Returns:
        Newline-joined lines, one per hit, or ``""`` when ``text`` is
        blank, ``k`` is not positive, or the index is empty.
    """
    # A blank query or a non-positive k has no meaningful neighbours.
    if k <= 0 or not text or not text.strip():
        return ""

    # Never request more neighbours than the index actually holds.
    limit = min(k, index.ntotal)
    if limit <= 0:
        return ""

    query = embedder.encode([text], convert_to_numpy=True)
    _, hits = index.search(query, limit)

    # FAISS fills unfilled result slots with -1; skip them, because
    # pairs[-1] would silently return the last corpus entry.
    # NOTE(review): the separator character below looks mis-encoded
    # (probably an arrow) - confirm against the original file.
    return "\n".join(
        f"{pairs[i][0]} β {pairs[i][1]}" for i in hits[0] if i >= 0
    )
# ============================================================
# TRANSLATION CORE
# ============================================================
def nllb_translate(text, src, tgt):
    """Translate ``text`` from language ``src`` to language ``tgt`` with NLLB.

    Args:
        text: Sentence(s) to translate.
        src: Source language code, assigned to the tokenizer's ``src_lang``.
        tgt: Target language code; its token id is forced as the first
            generated token.

    Returns:
        The decoded translation with special tokens removed, or ``""``
        when ``text`` is empty or whitespace-only.
    """
    # Do not run the model on blank input: there is nothing to translate.
    if not text or not text.strip():
        return ""

    # The tokenizer is shared module state; set the source language
    # before encoding.
    nllb_tok.src_lang = src
    inputs = nllb_tok(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)

    out = nllb.generate(
        **inputs,
        forced_bos_token_id=nllb_tok.convert_tokens_to_ids(tgt),
        max_length=256,
    )
    return nllb_tok.decode(out[0], skip_special_tokens=True)
# ============================================================
# POST-EDITING (NO LEAK)
# ============================================================
def refine_id(text, context):
    """Ask the post-editing LLM for a more natural version of ``text``.

    Args:
        text: Indonesian sentence to polish.
        context: Retrieved reference pairs. Accepted for interface
            compatibility but not inserted into the prompt.

    Returns:
        The text generated after the prompt, stripped of surrounding
        whitespace. Returns ``""`` for blank input, and the original
        ``text`` when the model generates nothing.
    """
    # Nothing to polish.
    if not text or not text.strip():
        return ""

    # Prompt text is kept exactly as before (Indonesian instructions).
    prompt = (
        "\n"
        "Perbaiki hasil terjemahan Bahasa Indonesia agar alami dan baku.\n"
        "Jangan menerjemahkan ulang.\n"
        "Jangan menambahkan informasi baru.\n"
        "Jangan menampilkan referensi.\n"
        "Kalimat:\n"
        f"{text}\n"
        "Versi yang lebih alami:\n"
    )
    inp = llm_tok(prompt, return_tensors="pt").to(device)

    out = llm.generate(
        **inp,
        max_new_tokens=80,
        # Greedy decoding. No temperature is passed: it is ignored when
        # do_sample=False and only triggers a configuration warning.
        do_sample=False,
        # Pass the pad id explicitly so generate() does not have to guess.
        pad_token_id=llm_tok.pad_token_id,
    )

    # generate() returns the prompt followed by the new tokens. Decode only
    # the new part, so the result does not depend on searching the text for
    # a marker that the sentence or the model output could also contain.
    prompt_len = inp["input_ids"].shape[1]
    refined = llm_tok.decode(
        out[0][prompt_len:],
        skip_special_tokens=True,
    ).strip()

    # Fall back to the unedited sentence if the model produced nothing.
    return refined or text
# ============================================================
# FULL PIPELINE
# ============================================================
def pipeline(text):
    """Translate Javanese ``text`` to Indonesian and then to English.

    Steps: NLLB Javanese-to-Indonesian, retrieval of similar corpus pairs,
    LLM post-editing of the Indonesian draft, then NLLB
    Indonesian-to-English on the post-edited text.

    Args:
        text: Javanese input from the UI.

    Returns:
        A ``(indonesian, english)`` tuple of strings; ``("", "")`` when
        ``text`` is empty or whitespace-only.
    """
    # Skip all three models for blank input instead of showing text the
    # models would invent from nothing.
    if not text or not text.strip():
        return "", ""

    # Javanese -> Indonesian draft
    id_raw = nllb_translate(text, JV, ID)
    # Retrieved reference pairs (not shown to the user)
    ctx = retrieve(text)
    # Post-edit the draft
    id_final = refine_id(id_raw, ctx)
    # Indonesian -> English, based on the post-edited text
    en_final = nllb_translate(id_final, ID, EN)
    return id_final, en_final
# ============================================================
# GRADIO UI
# ============================================================
# Gradio front end: one Javanese input box, two read-only result boxes and
# a button that runs pipeline().
# NOTE(review): several non-ASCII characters in the UI strings below look
# mis-encoded (likely arrows, check marks and emoji) - confirm against the
# original file. They are reproduced unchanged here.
# NOTE(review): nesting of the Row and the button is reconstructed; the
# original indentation was not available - confirm the intended layout.
with gr.Blocks(title="Mesin Translasi Bahasa Jawa") as demo:
    # Header / feature list
    gr.Markdown(
        """
## πΎ Mesin Translasi Bahasa Jawa
**Jawa β Indonesia β English**
β NLLB-200 (Meta)
β RAG Parallel Corpus (hidden)
β LLM post-editor (linguistic refinement)
β Output bersih tanpa referensi
β Aman 16GB
---
"""
    )

    # Source text
    inp = gr.Textbox(
        lines=4,
        label="Input Bahasa Jawa",
        placeholder="Contoh: Aku arep lunga menyang pasar sesuk.",
    )

    # Both translations side by side
    with gr.Row():
        out_id = gr.Textbox(lines=4, label="Bahasa Indonesia")
        out_en = gr.Textbox(lines=4, label="English Translation")

    # Clicking the button feeds the input box to pipeline() and writes its
    # two return values into the two output boxes.
    gr.Button("π Terjemahkan").click(
        fn=pipeline,
        inputs=inp,
        outputs=[out_id, out_en],
    )

demo.launch()
|