Spaces:
Sleeping
Sleeping
| # ============================================================ | |
| # Aplikasi Translasi Jawa -> Indonesia & Inggris + Evaluasi | |
| # MODEL: facebook/m2m100_418M | |
| # METRIK: BLEU + ROUGE-L + METEOR | |
| # RAG: FAISS + SentenceTransformer | |
| # ============================================================ | |
| import torch | |
| import gradio as gr | |
| import sacrebleu | |
| import nltk | |
| import pandas as pd | |
| import faiss | |
| from sentence_transformers import SentenceTransformer | |
| from rouge import Rouge | |
| from nltk.translate.meteor_score import meteor_score | |
| from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer | |
print("=== APP STARTED ===")

# ============================================================
# NLTK resources
# ============================================================
# Both corpora are fetched up front; the METEOR metric imported above
# is the only NLTK consumer in this file.
for nltk_resource in ("wordnet", "omw-1.4"):
    nltk.download(nltk_resource)

# ============================================================
# Device
# ============================================================
# Use the GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)
| # ============================================================ | |
| # Load Translation Model | |
| # ============================================================ | |
# Checkpoint shared by the tokenizer and the translation model.
MODEL_NAME = "facebook/m2m100_418M"

tokenizer = M2M100Tokenizer.from_pretrained(MODEL_NAME)

# Load the weights, move them to the selected device and switch the
# module to evaluation mode (the app only runs inference).
model = M2M100ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
model.eval()

print("M2M100 loaded")
| # ============================================================ | |
| # Load RAG Corpus | |
| # ============================================================ | |
# Parallel corpus used for retrieval. The file is required: startup
# fails in ``pd.read_csv`` when it cannot be found.
RAG_FILE = "kb_jawa_translation_rag_300k.csv"

# Drop rows that lack either side of the pair. Without this,
# ``astype(str)`` would turn missing cells into the literal text "nan",
# which would then be embedded and could be retrieved as an "example".
rag_df = (
    pd.read_csv(RAG_FILE)
    .dropna(subset=["jv", "id"])
    .reset_index(drop=True)
)

# Column "jv" holds the sentences that get embedded and retrieved;
# column "id" is kept as a row-aligned list of the paired text.
rag_texts = rag_df["jv"].astype(str).tolist()
rag_refs = rag_df["id"].astype(str).tolist()
| # ============================================================ | |
| # Load Embedder (SAFE) | |
| # ============================================================ | |
# Sentence encoder used for both the corpus and incoming queries.
embedder = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2", device=device
)

print("Embedding RAG corpus...")
encode_options = {
    "convert_to_numpy": True,
    "batch_size": 64,
    "show_progress_bar": True,
}
rag_embeddings = embedder.encode(rag_texts, **encode_options)

# Flat L2 index sized to the embedding width; row i of the index
# corresponds to rag_texts[i] because vectors are added in corpus order.
embedding_dim = rag_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(rag_embeddings)

print("RAG corpus loaded:", len(rag_texts))
| # ============================================================ | |
| # RAG RETRIEVAL FUNCTION (FIXED) | |
| # ============================================================ | |
def retrieve_examples(query, k=2):
    """Return up to ``k`` corpus sentences closest to ``query``.

    The query is embedded with the module-level ``embedder`` and looked
    up in the module-level FAISS ``index``; hits are mapped back to
    ``rag_texts`` by row number.

    Args:
        query: Text to search for.
        k: Maximum number of neighbours to return.

    Returns:
        A list of at most ``k`` strings from ``rag_texts``. The list is
        empty when ``k`` is not positive or the index holds no vectors.
    """
    # Never request more neighbours than the index contains.
    k = min(int(k), index.ntotal)
    if k <= 0:
        return []

    query_emb = embedder.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_emb, k)

    # FAISS fills unavailable result slots with -1; indexing a Python
    # list with -1 would silently return the *last* corpus sentence.
    return [rag_texts[idx] for idx in indices[0] if idx >= 0]
| # ============================================================ | |
| # Translation Function (RAG-SAFE) | |
| # ============================================================ | |
def translate_jawa(text, use_rag=True):
    """Translate Javanese text into Indonesian and English.

    Args:
        text: Source text. ``None``, empty and whitespace-only input
            short-circuit to empty translations without touching the
            model.
        use_rag: When true, the two nearest corpus sentences returned by
            ``retrieve_examples`` are prepended to the source text.

    Returns:
        A ``(indonesian, english)`` tuple of decoded strings.
    """
    # UI callbacks may deliver None as well as "", so guard both.
    if not text or not text.strip():
        return "", ""

    tokenizer.src_lang = "jv"

    if use_rag:
        examples = retrieve_examples(text, k=2)
        # NOTE(review): the retrieved sentences become part of the source
        # sequence, so the model may translate them as well, and with
        # truncation at 512 tokens the user's text (placed last) is what
        # gets cut first -- confirm this is the intended RAG behaviour.
        final_input = " ".join([*examples, text])
    else:
        final_input = text

    inputs = tokenizer(
        final_input,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)

    # Same encoded input, one generation pass per target language; the
    # forced BOS token selects the output language.
    translations = []
    with torch.no_grad():
        for target_lang in ("id", "en"):
            generated = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.get_lang_id(target_lang),
                max_length=256,
            )
            decoded = tokenizer.batch_decode(
                generated, skip_special_tokens=True
            )
            translations.append(decoded[0])

    id_text, en_text = translations
    return id_text, en_text
| # ============================================================ | |
| # Evaluation Function | |
| # ============================================================ | |
def evaluate_translation(jawa, ref_id, ref_en):
    """Translate ``jawa`` and score the Indonesian output.

    Args:
        jawa: Javanese source text.
        ref_id: Indonesian reference translation used for all metrics.
        ref_en: English reference. Accepted for the UI wiring but not
            used by any metric here.

    Returns:
        ``(pred_id, pred_en, bleu, rouge_l, meteor)`` where the three
        scores are formatted strings (BLEU with two decimals, ROUGE-L F
        and METEOR with four). Scores are reported as zero when the
        source, the prediction or the reference is empty.
    """
    # Nothing to translate: report empty predictions and zero scores
    # instead of letting the metric libraries fail on empty strings.
    if not jawa or not jawa.strip():
        return "", "", f"{0.0:.2f}", f"{0.0:.4f}", f"{0.0:.4f}"

    pred_id, pred_en = translate_jawa(jawa)
    reference = ref_id or ""

    bleu = rouge_l = meteor = 0.0
    if pred_id.strip() and reference.strip():
        bleu = sacrebleu.corpus_bleu([pred_id], [[reference]]).score

        try:
            scores = Rouge().get_scores(pred_id, reference)
            rouge_l = scores[0]["rouge-l"]["f"]
        except ValueError:
            # The rouge package rejects texts it considers empty (for
            # example punctuation only); treat that as no overlap.
            rouge_l = 0.0

        meteor = meteor_score([reference.split()], pred_id.split())

    return (
        pred_id,
        pred_en,
        f"{bleu:.2f}",
        f"{rouge_l:.4f}",
        f"{meteor:.4f}",
    )
| # ============================================================ | |
| # Gradio UI | |
| # ============================================================ | |
# Two-tab interface: plain translation, and translation plus metrics.
# Components are created in the same order as before, since creation
# order determines the on-screen layout.
with gr.Blocks(title="Jawa → Indonesia & English Translator") as demo:
    gr.Markdown("## 🈶 Translasi Bahasa Jawa + RAG")
    gr.Markdown(
        "Model: **facebook/m2m100_418M** \nEvaluasi: **BLEU · ROUGE-L · METEOR**"
    )

    # --- Tab 1: translate free text, optionally with retrieval ---
    with gr.Tab("🔤 Translasi"):
        inp = gr.Textbox(label="Teks Bahasa Jawa", lines=5)
        use_rag = gr.Checkbox(value=True, label="Gunakan RAG")
        out_id = gr.Textbox(label="Terjemahan Indonesia", lines=3)
        out_en = gr.Textbox(label="Terjemahan English", lines=3)
        btn = gr.Button("Terjemahkan")

        # Button press -> translate_jawa(text, use_rag) -> two outputs.
        btn.click(translate_jawa, [inp, use_rag], [out_id, out_en])

    # --- Tab 2: translate and score against user-supplied references ---
    with gr.Tab("📊 Evaluasi"):
        eval_jawa = gr.Textbox(label="Teks Jawa", lines=4)
        ref_id = gr.Textbox(label="Referensi Indonesia", lines=2)
        ref_en = gr.Textbox(label="Referensi English", lines=2)
        pred_id = gr.Textbox(label="Prediksi Indonesia")
        pred_en = gr.Textbox(label="Prediksi English")
        bleu = gr.Textbox(label="BLEU")
        rouge_l = gr.Textbox(label="ROUGE-L")
        meteor = gr.Textbox(label="METEOR")
        eval_btn = gr.Button("Evaluasi")

        # Output order matches the tuple returned by evaluate_translation.
        eval_btn.click(
            evaluate_translation,
            [eval_jawa, ref_id, ref_en],
            [pred_id, pred_en, bleu, rouge_l, meteor],
        )
| # ============================================================ | |
| # Launch | |
| # ============================================================ | |
# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    print("=== LAUNCHING GRADIO ===")
    # Listen on all interfaces at port 7860, without a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)