"""Gradio demo comparing two English->Maltese EuroLLM fine-tunes, run locally on CPU."""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- MODEL DATA ---
# Keys double as UI labels; "id" is the Hugging Face repo to load.
MODELS_CONFIG = {
    "Phase 2: Stable (Formal)": {
        "id": "st192011/Maltese-EuroLLM-1.7B-Phase2-Stable",
        "description": (
            "The 'Bureaucrat Bot'. Built upon a foundational adaptation phase that mixed "
            "monolingual Maltese and Italian to bridge morphological roots. This version "
            "was fine-tuned on high-fidelity EU and governmental parallel corpora, "
            "optimizing it for extreme formal precision and administrative accuracy."
        ),
        "chrf": "60.18",
        "comet": "0.6431",
    },
    "Phase 4: Anchored (Native)": {
        "id": "st192011/Maltese-EuroLLM-1.7B-Phase4-Anchored",
        "description": (
            "The 'Native Speaker'. An evolution of Phase 2 utilizing a curriculum-based "
            "'Full Circle' approach. It integrates synthesized reasoning chains (CoT) "
            "that allow the model to process linguistic logic before translating. By mixing "
            "all previous data types, it anchors factual accuracy to native-level phrasing "
            "and cultural awareness."
        ),
        "chrf": "52.68",
        "comet": "0.6567",
    },
}

# --- MODEL LOADING (Local CPU) ---
print("Loading models... this might take a minute.")


def _load_cpu_model(model_key):
    """Return (tokenizer, model) for a MODELS_CONFIG key, loaded in float32 on CPU."""
    repo_id = MODELS_CONFIG[model_key]["id"]
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        device_map="cpu",
        torch_dtype=torch.float32,  # float32 for CPU stability (no half-precision kernels)
    )
    return tokenizer, model


tokenizer_p2, model_p2 = _load_cpu_model("Phase 2: Stable (Formal)")
tokenizer_p4, model_p4 = _load_cpu_model("Phase 4: Anchored (Native)")


def local_translate(model, tokenizer, text, temp):
    """Translate English *text* to Maltese with the given model/tokenizer pair.

    Builds the "### INGLIŻ: ... ### MALTI:" prompt the models were fine-tuned on,
    generates on CPU, and returns only the Maltese continuation with the
    end-of-text marker stripped. Returns "" for blank input.
    """
    if not text.strip():
        return ""

    prompt = f"### INGLIŻ: {text}\n### MALTI:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    # Greedy decode at the slider minimum (0.1); sample above it.
    # temperature is only meaningful when sampling, so omit it otherwise
    # (transformers ignores it for greedy decoding and emits a warning).
    sampling = temp > 0.1
    gen_kwargs = {
        "max_new_tokens": 150,
        "do_sample": sampling,
        "repetition_penalty": 1.2,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if sampling:
        gen_kwargs["temperature"] = temp

    with torch.no_grad():
        output_tokens = model.generate(**inputs, **gen_kwargs)

    # 1. Decode while skipping standard special tokens.
    decoded_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    # 2. Extract only the Maltese part (the model echoes the prompt).
    if "### MALTI:" in decoded_text:
        maltese_text = decoded_text.split("### MALTI:")[-1]
    else:
        maltese_text = decoded_text

    # 3. Manual cleanup: the end-of-text string can survive skip_special_tokens
    #    when it is not registered as a special token on the tokenizer.
    return maltese_text.replace("<|endoftext|>", "").strip()


def translate_logic(text, selected_models, temp):
    """Run the selected models and return (phase2_output, phase4_output).

    Unselected models yield the placeholder "Model not selected."; a failure in
    one model is reported in its own output box without affecting the other.
    """
    out_p2 = "Model not selected."
    out_p4 = "Model not selected."

    if "Phase 2: Stable (Formal)" in selected_models:
        try:
            out_p2 = local_translate(model_p2, tokenizer_p2, text, temp)
        except Exception as e:
            out_p2 = f"Error: {str(e)}"

    if "Phase 4: Anchored (Native)" in selected_models:
        try:
            out_p4 = local_translate(model_p4, tokenizer_p4, text, temp)
        except Exception as e:
            out_p4 = f"Error: {str(e)}"

    return out_p2, out_p4


# --- GRADIO UI ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🇲🇹 Maltese-MT Lab")
    gr.Markdown("Compare English-to-Maltese EuroLLM models running locally on CPU.")

    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(
                label="English Source Text",
                placeholder="Enter English text...",
                lines=4,
            )
            model_selector = gr.CheckboxGroup(
                choices=list(MODELS_CONFIG.keys()),
                value=list(MODELS_CONFIG.keys()),
                label="Select Models",
            )
            temp_slider = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature"
            )
            btn = gr.Button("🚀 Run Translation", variant="primary")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Phase 2: Stable (Formal)")
            p2_out = gr.Textbox(label="Output", interactive=False, lines=5)
            gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 2: Stable (Formal)']['description']}")
            gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['comet']}`")
        with gr.Column():
            gr.Markdown("### Phase 4: Anchored (Native)")
            p4_out = gr.Textbox(label="Output", interactive=False, lines=5)
            gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 4: Anchored (Native)']['description']}")
            gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['comet']}`")

    gr.Examples(
        examples=[
            ["The ferry to Gozo leaves every 45 minutes."],
            ["We now have 4-month-old mice that are non-diabetic that used to be diabetic."],
            ["This regulation shall be binding in its entirety and directly applicable in all Member States."],
        ],
        inputs=input_text,
    )

    btn.click(
        fn=translate_logic,
        inputs=[input_text, model_selector, temp_slider],
        outputs=[p2_out, p4_out],
    )

if __name__ == "__main__":
    demo.launch()