# NOTE(review): the three lines that stood here ("Spaces:" / "Running" /
# "Running") were Hugging Face Spaces page chrome captured during extraction,
# not part of the app source; kept here as a comment so the file parses.
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
# --- MODEL DATA ---
# Registry of the two fine-tuned checkpoints this demo compares.
# Each entry holds: the Hugging Face repo id to load, a human-readable
# training summary rendered in the UI, and held-out evaluation scores
# (ChrF++ and COMET) displayed under each output box. The dict keys double
# as the CheckboxGroup choice labels and the lookup keys in translate_logic.
MODELS_CONFIG = {
    "Phase 2: Stable (Formal)": {
        "id": "st192011/Maltese-EuroLLM-1.7B-Phase2-Stable",
        "description": (
            "The 'Bureaucrat Bot'. Built upon a foundational adaptation phase that mixed "
            "monolingual Maltese and Italian to bridge morphological roots. This version "
            "was fine-tuned on high-fidelity EU and governmental parallel corpora, "
            "optimizing it for extreme formal precision and administrative accuracy."
        ),
        "chrf": "60.18",   # ChrF++ score, kept as a string and shown verbatim
        "comet": "0.6431"  # COMET score, kept as a string and shown verbatim
    },
    "Phase 4: Anchored (Native)": {
        "id": "st192011/Maltese-EuroLLM-1.7B-Phase4-Anchored",
        "description": (
            "The 'Native Speaker'. An evolution of Phase 2 utilizing a curriculum-based "
            "'Full Circle' approach. It integrates synthesized reasoning chains (CoT) "
            "that allow the model to process linguistic logic before translating. By mixing "
            "all previous data types, it anchors factual accuracy to native-level phrasing "
            "and cultural awareness."
        ),
        "chrf": "52.68",
        "comet": "0.6567"
    }
}
# --- MODEL LOADING (Local CPU) ---
# Both checkpoints are loaded eagerly at import time so the first request is
# not blocked by a multi-gigabyte download; expect a slow cold start.
print("Loading models... this might take a minute.")


def _load_checkpoint(config_key):
    """Load the tokenizer and model for one MODELS_CONFIG entry.

    Args:
        config_key: A key of MODELS_CONFIG naming the checkpoint to load.

    Returns:
        A ``(tokenizer, model)`` pair. The model is pinned to CPU in
        float32 — the Space has no GPU, and float32 avoids slow or
        unsupported half-precision kernels on CPU.
    """
    repo_id = MODELS_CONFIG[config_key]["id"]
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        device_map="cpu",
        torch_dtype=torch.float32,
    )
    return tokenizer, model


# Module-level names below are relied on by translate_logic — keep them.
tokenizer_p2, model_p2 = _load_checkpoint("Phase 2: Stable (Formal)")
tokenizer_p4, model_p4 = _load_checkpoint("Phase 4: Anchored (Native)")
def local_translate(model, tokenizer, text, temp, max_new_tokens=150):
    """Translate English ``text`` to Maltese with one local model.

    Args:
        model: A causal LM exposing ``generate`` (transformers API).
        tokenizer: Its paired tokenizer (callable, with ``decode``).
        text: English source text; blank input short-circuits to ``""``.
        temp: Sampling temperature. At or below 0.1 decoding is greedy;
            above 0.1 sampling is enabled with this temperature.
        max_new_tokens: Generation budget (default 150, matching the
            original hard-coded limit).

    Returns:
        The cleaned Maltese translation as a plain string.
    """
    if not text.strip():
        return ""
    # Prompt format the checkpoints were fine-tuned on (English -> Maltese).
    prompt = f"### INGLIŻ: {text}\n### MALTI:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    # Fix: only pass `temperature` when actually sampling — supplying it
    # alongside do_sample=False triggers transformers' unused-generation-flag
    # warning and has no effect on greedy decoding.
    do_sample = temp > 0.1
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "repetition_penalty": 1.2,
        # Silences the missing-pad-token warning for GPT-style tokenizers.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        gen_kwargs["temperature"] = temp
    with torch.no_grad():
        output_tokens = model.generate(**inputs, **gen_kwargs)
    # 1. Decode while skipping standard special tokens.
    decoded_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    # 2. Keep only the Maltese continuation after the prompt marker.
    if "### MALTI:" in decoded_text:
        maltese_text = decoded_text.split("### MALTI:")[-1]
    else:
        maltese_text = decoded_text
    # 3. Belt-and-braces: strip end-of-text markers some tokenizers leave in
    #    even with skip_special_tokens=True, plus surrounding whitespace.
    clean_text = maltese_text.replace("<|endoftext|>", "").replace("</s>", "").strip()
    return clean_text
def translate_logic(text, selected_models, temp):
    """Dispatch a translation request to the selected model(s).

    Args:
        text: English source text from the input textbox.
        selected_models: Labels ticked in the CheckboxGroup (keys of
            MODELS_CONFIG).
        temp: Temperature forwarded to local_translate.

    Returns:
        A ``(phase2_output, phase4_output)`` pair for the two output
        textboxes. Unselected models yield a placeholder string; a
        per-model failure is reported inline rather than crashing the UI.
    """
    def _run(mdl, tok):
        # Boundary-level catch: surface the error text in the output box.
        try:
            return local_translate(mdl, tok, text, temp)
        except Exception as e:
            return f"Error: {str(e)}"

    out_p2 = "Model not selected."
    out_p4 = "Model not selected."
    if "Phase 2: Stable (Formal)" in selected_models:
        out_p2 = _run(model_p2, tokenizer_p2)
    if "Phase 4: Anchored (Native)" in selected_models:
        out_p4 = _run(model_p4, tokenizer_p4)
    return out_p2, out_p4
# --- GRADIO UI ---
# NOTE: inside gr.Blocks, the ORDER of component construction defines the
# page layout — do not reorder these statements.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🇲🇹 Maltese-MT Lab")
    gr.Markdown("Compare English-to-Maltese EuroLLM models running locally on CPU.")
    # Top row: source text, model selection, temperature, and the run button.
    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(label="English Source Text", placeholder="Enter English text...", lines=4)
            # Choices mirror MODELS_CONFIG keys; both models are pre-selected.
            model_selector = gr.CheckboxGroup(
                choices=list(MODELS_CONFIG.keys()),
                value=list(MODELS_CONFIG.keys()),
                label="Select Models"
            )
            # Minimum 0.1 corresponds to greedy decoding in local_translate.
            temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature")
            btn = gr.Button("🚀 Run Translation", variant="primary")
    # Second row: side-by-side outputs with each model's blurb and metrics.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Phase 2: Stable (Formal)")
            p2_out = gr.Textbox(label="Output", interactive=False, lines=5)
            gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 2: Stable (Formal)']['description']}")
            gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['comet']}`")
        with gr.Column():
            gr.Markdown("### Phase 4: Anchored (Native)")
            p4_out = gr.Textbox(label="Output", interactive=False, lines=5)
            gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 4: Anchored (Native)']['description']}")
            gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['comet']}`")
    # One-click example sentences spanning casual, scientific, and legal text.
    gr.Examples(
        examples=[
            ["The ferry to Gozo leaves every 45 minutes."],
            ["We now have 4-month-old mice that are non-diabetic that used to be diabetic."],
            ["This regulation shall be binding in its entirety and directly applicable in all Member States."]
        ],
        inputs=input_text
    )
    # Wire the button to the dispatcher: two outputs, one per model column.
    btn.click(
        fn=translate_logic,
        inputs=[input_text, model_selector, temp_slider],
        outputs=[p2_out, p4_out]
    )

# Standard script entry point: launch the Gradio server when run directly.
if __name__ == "__main__":
    demo.launch()