File size: 5,776 Bytes
bec47f7
a5ae8c5
 
bec47f7
f0a5f11
a5ae8c5
bec47f7
 
b8da437
 
 
 
 
 
bec47f7
 
 
 
 
b8da437
 
 
 
 
 
 
bec47f7
 
 
 
 
b8da437
f0a5f11
a5ae8c5
f0a5f11
a5ae8c5
 
 
 
 
 
 
f0a5f11
a5ae8c5
 
 
 
 
 
 
 
b8da437
 
 
a5ae8c5
 
 
 
 
 
 
 
 
 
 
 
 
f0a5f11
 
b8da437
f0a5f11
 
 
b8da437
f0a5f11
b8da437
f0a5f11
 
 
 
 
a5ae8c5
bec47f7
a5ae8c5
 
bec47f7
a5ae8c5
bec47f7
a5ae8c5
 
 
 
 
 
 
bec47f7
a5ae8c5
bec47f7
 
 
 
 
b8da437
 
bec47f7
 
 
f0a5f11
bec47f7
a5ae8c5
 
f0a5f11
b8da437
f0a5f11
bec47f7
 
 
 
b8da437
bec47f7
f0a5f11
b8da437
bec47f7
 
b8da437
bec47f7
f0a5f11
b8da437
bec47f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8da437
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- MODEL DATA ---
# Per-checkpoint metadata shown in the UI: Hugging Face model id, a prose
# description, and evaluation scores (kept as display strings).
_PHASE2_STABLE = {
    "id": "st192011/Maltese-EuroLLM-1.7B-Phase2-Stable",
    "description": (
        "The 'Bureaucrat Bot'. Built upon a foundational adaptation phase that mixed "
        "monolingual Maltese and Italian to bridge morphological roots. This version "
        "was fine-tuned on high-fidelity EU and governmental parallel corpora, "
        "optimizing it for extreme formal precision and administrative accuracy."
    ),
    "chrf": "60.18",
    "comet": "0.6431",
}

_PHASE4_ANCHORED = {
    "id": "st192011/Maltese-EuroLLM-1.7B-Phase4-Anchored",
    "description": (
        "The 'Native Speaker'. An evolution of Phase 2 utilizing a curriculum-based "
        "'Full Circle' approach. It integrates synthesized reasoning chains (CoT) "
        "that allow the model to process linguistic logic before translating. By mixing "
        "all previous data types, it anchors factual accuracy to native-level phrasing "
        "and cultural awareness."
    ),
    "chrf": "52.68",
    "comet": "0.6567",
}

# Keys double as UI labels (checkbox choices and column headings).
MODELS_CONFIG = {
    "Phase 2: Stable (Formal)": _PHASE2_STABLE,
    "Phase 4: Anchored (Native)": _PHASE4_ANCHORED,
}

# --- MODEL LOADING (Local CPU) ---
print("Loading models... this might take a minute.")

def _load_checkpoint(config_key):
    """Load the tokenizer and float32 CPU model for one MODELS_CONFIG entry.

    Args:
        config_key: a key of MODELS_CONFIG identifying the checkpoint.

    Returns:
        A ``(tokenizer, model)`` pair.
    """
    model_id = MODELS_CONFIG[config_key]["id"]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # float32 on CPU: these checkpoints are served without a GPU.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="cpu",
        torch_dtype=torch.float32,
    )
    return tokenizer, model

# Both checkpoints stay resident so the UI can compare them side by side.
tokenizer_p2, model_p2 = _load_checkpoint("Phase 2: Stable (Formal)")
tokenizer_p4, model_p4 = _load_checkpoint("Phase 4: Anchored (Native)")

def local_translate(model, tokenizer, text, temp):
    """Translate English *text* to Maltese with one local CPU model.

    Args:
        model: a causal LM exposing the Hugging Face ``generate`` API.
        tokenizer: the tokenizer matching ``model``.
        text: English source text; blank/whitespace input returns "".
        temp: sampling temperature; values <= 0.1 use greedy decoding.

    Returns:
        The cleaned Maltese translation as a string.
    """
    if not text.strip():
        return ""

    # Prompt format the checkpoints were fine-tuned on: English in, Maltese out.
    prompt = f"### INGLIŻ: {text}\n### MALTI:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    do_sample = temp > 0.1  # at the slider minimum, fall back to greedy decoding
    gen_kwargs = {
        "max_new_tokens": 150,
        "do_sample": do_sample,
        "repetition_penalty": 1.2,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Only pass temperature when sampling: greedy decoding ignores it and
        # transformers emits a warning if it is supplied with do_sample=False.
        gen_kwargs["temperature"] = temp

    with torch.no_grad():
        output_tokens = model.generate(**inputs, **gen_kwargs)

    # 1. Decode while skipping standard special tokens
    decoded_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

    # 2. Extract only the Maltese part (everything after the last marker)
    if "### MALTI:" in decoded_text:
        maltese_text = decoded_text.split("### MALTI:")[-1]
    else:
        maltese_text = decoded_text

    # 3. CRITICAL: Manual cleaning of the end-of-text string if it still remains.
    # Some checkpoints emit these markers as literal text even with
    # skip_special_tokens=True.
    clean_text = maltese_text.replace("<|endoftext|>", "").replace("</s>", "").strip()

    return clean_text

def translate_logic(text, selected_models, temp):
    """Run each selected model on *text*.

    Args:
        text: English source text.
        selected_models: iterable of MODELS_CONFIG keys chosen in the UI.
        temp: sampling temperature forwarded to local_translate.

    Returns:
        A ``(phase2_output, phase4_output)`` pair; unselected models yield
        the placeholder string, failures yield an error message.
    """
    # Lambdas defer model access: unselected entries never touch the globals.
    runners = {
        "Phase 2: Stable (Formal)": lambda: local_translate(model_p2, tokenizer_p2, text, temp),
        "Phase 4: Anchored (Native)": lambda: local_translate(model_p4, tokenizer_p4, text, temp),
    }

    outputs = {}
    for label, run in runners.items():
        if label not in selected_models:
            outputs[label] = "Model not selected."
            continue
        try:
            outputs[label] = run()
        except Exception as e:
            outputs[label] = f"Error: {str(e)}"

    return outputs["Phase 2: Stable (Formal)"], outputs["Phase 4: Anchored (Native)"]

# --- GRADIO UI ---
# Declarative Blocks layout: an input panel on top, then one result column
# per model, canned examples, and the click wiring at the end.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🇲🇹 Maltese-MT Lab")
    gr.Markdown("Compare English-to-Maltese EuroLLM models running locally on CPU.")
    
    # Input panel: source text, model checkboxes (both pre-selected so a first
    # click compares the two phases), and the shared temperature slider.
    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(label="English Source Text", placeholder="Enter English text...", lines=4)
            model_selector = gr.CheckboxGroup(
                choices=list(MODELS_CONFIG.keys()), 
                value=list(MODELS_CONFIG.keys()), 
                label="Select Models"
            )
            # Minimum 0.1 matches local_translate's greedy-decoding cutoff.
            temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature")
            btn = gr.Button("🚀 Run Translation", variant="primary")

    # One output column per model, each showing the translation plus the
    # training description and metrics pulled from MODELS_CONFIG.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Phase 2: Stable (Formal)")
            p2_out = gr.Textbox(label="Output", interactive=False, lines=5)
            gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 2: Stable (Formal)']['description']}")
            gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['comet']}`")
            
        with gr.Column():
            gr.Markdown("### Phase 4: Anchored (Native)")
            p4_out = gr.Textbox(label="Output", interactive=False, lines=5)
            gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 4: Anchored (Native)']['description']}")
            gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['comet']}`")

    # Clickable example inputs covering casual, technical, and legal registers.
    gr.Examples(
        examples=[
            ["The ferry to Gozo leaves every 45 minutes."],
            ["We now have 4-month-old mice that are non-diabetic that used to be diabetic."],
            ["This regulation shall be binding in its entirety and directly applicable in all Member States."]
        ],
        inputs=input_text
    )

    # Wire the button: translate_logic returns a (phase2, phase4) pair that
    # maps positionally onto the two output textboxes.
    btn.click(
        fn=translate_logic, 
        inputs=[input_text, model_selector, temp_slider], 
        outputs=[p2_out, p4_out]
    )

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()