import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# --- MODEL DATA ---
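# Each entry maps a UI display name to its Hugging Face model id, the blurb
# shown beside the output box, and the ChrF++/COMET scores reported for that
# training phase.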
MODELS_CONFIG = {
"Phase 2: Stable (Formal)": {
"id": "st192011/Maltese-EuroLLM-1.7B-Phase2-Stable",
"description": (
"The 'Bureaucrat Bot'. Built upon a foundational adaptation phase that mixed "
"monolingual Maltese and Italian to bridge morphological roots. This version "
"was fine-tuned on high-fidelity EU and governmental parallel corpora, "
"optimizing it for extreme formal precision and administrative accuracy."
),
"chrf": "60.18",
"comet": "0.6431"
},
"Phase 4: Anchored (Native)": {
"id": "st192011/Maltese-EuroLLM-1.7B-Phase4-Anchored",
"description": (
"The 'Native Speaker'. An evolution of Phase 2 utilizing a curriculum-based "
"'Full Circle' approach. It integrates synthesized reasoning chains (CoT) "
"that allow the model to process linguistic logic before translating. By mixing "
"all previous data types, it anchors factual accuracy to native-level phrasing "
"and cultural awareness."
),
"chrf": "52.68",
"comet": "0.6567"
}
}
# --- MODEL LOADING (Local CPU) ---
print("Loading models... this might take a minute.")
# Load Phase 2
tokenizer_p2 = AutoTokenizer.from_pretrained(MODELS_CONFIG["Phase 2: Stable (Formal)"]["id"])
model_p2 = AutoModelForCausalLM.from_pretrained(
MODELS_CONFIG["Phase 2: Stable (Formal)"]["id"],
device_map="cpu",
torch_dtype=torch.float32
)
# Load Phase 4
tokenizer_p4 = AutoTokenizer.from_pretrained(MODELS_CONFIG["Phase 4: Anchored (Native)"]["id"])
model_p4 = AutoModelForCausalLM.from_pretrained(
MODELS_CONFIG["Phase 4: Anchored (Native)"]["id"],
device_map="cpu",
torch_dtype=torch.float32
)
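# Both checkpoints load eagerly at startup. In float32, a 1.7B-parameter model
# takes roughly 7 GB of RAM (1.7e9 params x 4 bytes), so ~14 GB for the pair.
# If memory is tight, bfloat16 roughly halves that footprint, at a possible
# cost in CPU speed and output quality (untested assumption for these
# checkpoints):
#   AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", torch_dtype=torch.bfloat16)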
def local_translate(model, tokenizer, text, temp):
    if not text.strip():
        return ""
    # Instruction-style prompt: the English source follows "### INGLIŻ:" and
    # the model completes the Maltese translation after "### MALTI:".
    prompt = f"### INGLIŻ: {text}\n### MALTI:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=temp,
            do_sample=temp > 0.1,  # greedy decoding at the slider minimum
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id
        )
    # 1. Decode while skipping standard special tokens
    decoded_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    # 2. Extract only the Maltese part of the completion
    if "### MALTI:" in decoded_text:
        maltese_text = decoded_text.split("### MALTI:")[-1]
    else:
        maltese_text = decoded_text
    # 3. Manually strip any end-of-text markers the tokenizer left behind
    #    (<|endoftext|>, </s>) and trim surrounding whitespace
    clean_text = maltese_text.replace("<|endoftext|>", "").replace("</s>", "").strip()
    return clean_text
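# Example call (hypothetical output; the actual text depends on the checkpoint
# and, when sampling, on the random seed):
#   local_translate(model_p2, tokenizer_p2, "Good morning.", 0.1)
#   -> a Maltese rendering along the lines of "Bonġu."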
def translate_logic(text, selected_models, temp):
    out_p2 = "Model not selected."
    out_p4 = "Model not selected."
    if "Phase 2: Stable (Formal)" in selected_models:
        try:
            out_p2 = local_translate(model_p2, tokenizer_p2, text, temp)
        except Exception as e:
            out_p2 = f"Error: {str(e)}"
    if "Phase 4: Anchored (Native)" in selected_models:
        try:
            out_p4 = local_translate(model_p4, tokenizer_p4, text, temp)
        except Exception as e:
            out_p4 = f"Error: {str(e)}"
    return out_p2, out_p4
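# translate_logic always returns a (p2, p4) pair so both Gradio output boxes
# stay wired up; a deselected model simply shows the placeholder string.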
# --- GRADIO UI ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🇲🇹 Maltese-MT Lab")
    gr.Markdown("Compare English-to-Maltese EuroLLM models running locally on CPU.")
    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(label="English Source Text", placeholder="Enter English text...", lines=4)
            model_selector = gr.CheckboxGroup(
                choices=list(MODELS_CONFIG.keys()),
                value=list(MODELS_CONFIG.keys()),
                label="Select Models"
            )
            temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature")
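            # The slider minimum (0.1) maps to greedy decoding in
            # local_translate; any higher value switches to sampling.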
            btn = gr.Button("🚀 Run Translation", variant="primary")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Phase 2: Stable (Formal)")
            p2_out = gr.Textbox(label="Output", interactive=False, lines=5)
            gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 2: Stable (Formal)']['description']}")
            gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['comet']}`")
        with gr.Column():
            gr.Markdown("### Phase 4: Anchored (Native)")
            p4_out = gr.Textbox(label="Output", interactive=False, lines=5)
            gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 4: Anchored (Native)']['description']}")
            gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['comet']}`")
    gr.Examples(
        examples=[
            ["The ferry to Gozo leaves every 45 minutes."],
            ["We now have 4-month-old mice that are non-diabetic that used to be diabetic."],
            ["This regulation shall be binding in its entirety and directly applicable in all Member States."]
        ],
        inputs=input_text
    )
    btn.click(
        fn=translate_logic,
        inputs=[input_text, model_selector, temp_slider],
        outputs=[p2_out, p4_out]
    )
if __name__ == "__main__":
    demo.launch()