File size: 5,776 Bytes
bec47f7
a5ae8c5
 
bec47f7
f0a5f11
a5ae8c5
bec47f7
 
b8da437
 
 
 
 
 
bec47f7
 
 
 
 
b8da437
 
 
 
 
 
 
bec47f7
 
 
 
 
b8da437
f0a5f11
a5ae8c5
f0a5f11
a5ae8c5
 
 
 
 
 
 
f0a5f11
a5ae8c5
 
 
 
 
 
 
 
b8da437
 
 
a5ae8c5
 
 
 
 
 
 
 
 
 
 
 
 
f0a5f11
 
b8da437
f0a5f11
 
 
b8da437
f0a5f11
b8da437
f0a5f11
 
 
 
 
a5ae8c5
bec47f7
a5ae8c5
 
bec47f7
a5ae8c5
bec47f7
a5ae8c5
 
 
 
 
 
 
bec47f7
a5ae8c5
bec47f7
 
 
 
 
b8da437
 
bec47f7
 
 
f0a5f11
bec47f7
a5ae8c5
 
f0a5f11
b8da437
f0a5f11
bec47f7
 
 
 
b8da437
bec47f7
f0a5f11
b8da437
bec47f7
 
b8da437
bec47f7
f0a5f11
b8da437
bec47f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8da437
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- MODEL DATA ---
# Per-checkpoint metadata shown in the UI: Hugging Face model id, a prose
# description, and evaluation scores (kept as display strings).
_PHASE2_STABLE = {
    "id": "st192011/Maltese-EuroLLM-1.7B-Phase2-Stable",
    "description": (
        "The 'Bureaucrat Bot'. Built upon a foundational adaptation phase that mixed "
        "monolingual Maltese and Italian to bridge morphological roots. This version "
        "was fine-tuned on high-fidelity EU and governmental parallel corpora, "
        "optimizing it for extreme formal precision and administrative accuracy."
    ),
    "chrf": "60.18",
    "comet": "0.6431",
}

_PHASE4_ANCHORED = {
    "id": "st192011/Maltese-EuroLLM-1.7B-Phase4-Anchored",
    "description": (
        "The 'Native Speaker'. An evolution of Phase 2 utilizing a curriculum-based "
        "'Full Circle' approach. It integrates synthesized reasoning chains (CoT) "
        "that allow the model to process linguistic logic before translating. By mixing "
        "all previous data types, it anchors factual accuracy to native-level phrasing "
        "and cultural awareness."
    ),
    "chrf": "52.68",
    "comet": "0.6567",
}

# Keys double as UI labels (checkbox choices and column headings).
MODELS_CONFIG = {
    "Phase 2: Stable (Formal)": _PHASE2_STABLE,
    "Phase 4: Anchored (Native)": _PHASE4_ANCHORED,
}

# --- MODEL LOADING (Local CPU) ---
print("Loading models... this might take a minute.")

def _load_checkpoint(config_key):
    """Load the tokenizer and float32 CPU model for one MODELS_CONFIG entry.

    Args:
        config_key: a key of MODELS_CONFIG identifying the checkpoint.

    Returns:
        A ``(tokenizer, model)`` pair.
    """
    model_id = MODELS_CONFIG[config_key]["id"]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # float32 on CPU: these checkpoints are served without a GPU.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="cpu",
        torch_dtype=torch.float32,
    )
    return tokenizer, model

# Both checkpoints stay resident so the UI can compare them side by side.
tokenizer_p2, model_p2 = _load_checkpoint("Phase 2: Stable (Formal)")
tokenizer_p4, model_p4 = _load_checkpoint("Phase 4: Anchored (Native)")

def local_translate(model, tokenizer, text, temp):
    """Translate English *text* to Maltese with one local CPU model.

    Args:
        model: a causal LM exposing the Hugging Face ``generate`` API.
        tokenizer: the tokenizer matching ``model``.
        text: English source text; blank/whitespace input returns "".
        temp: sampling temperature; values <= 0.1 use greedy decoding.

    Returns:
        The cleaned Maltese translation as a string.
    """
    if not text.strip():
        return ""

    # Prompt format the checkpoints were fine-tuned on: English in, Maltese out.
    prompt = f"### INGLIŻ: {text}\n### MALTI:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    do_sample = temp > 0.1  # at the slider minimum, fall back to greedy decoding
    gen_kwargs = {
        "max_new_tokens": 150,
        "do_sample": do_sample,
        "repetition_penalty": 1.2,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Only pass temperature when sampling: greedy decoding ignores it and
        # transformers emits a warning if it is supplied with do_sample=False.
        gen_kwargs["temperature"] = temp

    with torch.no_grad():
        output_tokens = model.generate(**inputs, **gen_kwargs)

    # 1. Decode while skipping standard special tokens
    decoded_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    # 2. Extract only the Maltese part (everything after the last marker)
    if "### MALTI:" in decoded_text:
        maltese_text = decoded_text.split("### MALTI:")[-1]
    else:
        maltese_text = decoded_text

    # 3. CRITICAL: Manual cleaning of the end-of-text string if it still remains.
    # Some checkpoints emit these markers as literal text even with
    # skip_special_tokens=True.
    clean_text = maltese_text.replace("<|endoftext|>", "").replace("</s>", "").strip()

    return clean_text

def translate_logic(text, selected_models, temp):
    """Run each selected model on *text*.

    Args:
        text: English source text.
        selected_models: iterable of MODELS_CONFIG keys chosen in the UI.
        temp: sampling temperature forwarded to local_translate.

    Returns:
        A ``(phase2_output, phase4_output)`` pair; unselected models yield
        the placeholder string, failures yield an error message.
    """
    # Lambdas defer model access: unselected entries never touch the globals.
    runners = {
        "Phase 2: Stable (Formal)": lambda: local_translate(model_p2, tokenizer_p2, text, temp),
        "Phase 4: Anchored (Native)": lambda: local_translate(model_p4, tokenizer_p4, text, temp),
    }

    outputs = {}
    for label, run in runners.items():
        if label not in selected_models:
            outputs[label] = "Model not selected."
            continue
        try:
            outputs[label] = run()
        except Exception as e:
            outputs[label] = f"Error: {str(e)}"

    return outputs["Phase 2: Stable (Formal)"], outputs["Phase 4: Anchored (Native)"]

# --- GRADIO UI ---
# Declarative Blocks layout: an input panel on top, then one result column
# per model, canned examples, and the click wiring at the end.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🇲🇹 Maltese-MT Lab")
    gr.Markdown("Compare English-to-Maltese EuroLLM models running locally on CPU.")
    
    # Input panel: source text, model checkboxes (both pre-selected so a first
    # click compares the two phases), and the shared temperature slider.
    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(label="English Source Text", placeholder="Enter English text...", lines=4)
            model_selector = gr.CheckboxGroup(
                choices=list(MODELS_CONFIG.keys()), 
                value=list(MODELS_CONFIG.keys()), 
                label="Select Models"
            )
            # Minimum 0.1 matches local_translate's greedy-decoding cutoff.
            temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature")
            btn = gr.Button("🚀 Run Translation", variant="primary")

    # One output column per model, each showing the translation plus the
    # training description and metrics pulled from MODELS_CONFIG.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Phase 2: Stable (Formal)")
            p2_out = gr.Textbox(label="Output", interactive=False, lines=5)
            gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 2: Stable (Formal)']['description']}")
            gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['comet']}`")
            
        with gr.Column():
            gr.Markdown("### Phase 4: Anchored (Native)")
            p4_out = gr.Textbox(label="Output", interactive=False, lines=5)
            gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 4: Anchored (Native)']['description']}")
            gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['comet']}`")

    # Clickable example inputs covering casual, technical, and legal registers.
    gr.Examples(
        examples=[
            ["The ferry to Gozo leaves every 45 minutes."],
            ["We now have 4-month-old mice that are non-diabetic that used to be diabetic."],
            ["This regulation shall be binding in its entirety and directly applicable in all Member States."]
        ],
        inputs=input_text
    )

    # Wire the button: translate_logic returns a (phase2, phase4) pair that
    # maps positionally onto the two output textboxes.
    btn.click(
        fn=translate_logic, 
        inputs=[input_text, model_selector, temp_slider], 
        outputs=[p2_out, p4_out]
    )

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()