import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# --- MODEL DATA ---
MODELS_CONFIG = {
"Phase 2: Stable (Formal)": {
"id": "st192011/Maltese-EuroLLM-1.7B-Phase2-Stable",
"description": (
"The 'Bureaucrat Bot'. Built upon a foundational adaptation phase that mixed "
"monolingual Maltese and Italian to bridge morphological roots. This version "
"was fine-tuned on high-fidelity EU and governmental parallel corpora, "
"optimizing it for extreme formal precision and administrative accuracy."
),
"chrf": "60.18",
"comet": "0.6431"
},
"Phase 4: Anchored (Native)": {
"id": "st192011/Maltese-EuroLLM-1.7B-Phase4-Anchored",
"description": (
"The 'Native Speaker'. An evolution of Phase 2 utilizing a curriculum-based "
"'Full Circle' approach. It integrates synthesized reasoning chains (CoT) "
"that allow the model to process linguistic logic before translating. By mixing "
"all previous data types, it anchors factual accuracy to native-level phrasing "
"and cultural awareness."
),
"chrf": "52.68",
"comet": "0.6567"
}
}
# --- MODEL LOADING (Local CPU) ---
print("Loading models... this might take a minute.")
# Load Phase 2
tokenizer_p2 = AutoTokenizer.from_pretrained(MODELS_CONFIG["Phase 2: Stable (Formal)"]["id"])
model_p2 = AutoModelForCausalLM.from_pretrained(
MODELS_CONFIG["Phase 2: Stable (Formal)"]["id"],
device_map="cpu",
torch_dtype=torch.float32
)
# Load Phase 4
tokenizer_p4 = AutoTokenizer.from_pretrained(MODELS_CONFIG["Phase 4: Anchored (Native)"]["id"])
model_p4 = AutoModelForCausalLM.from_pretrained(
MODELS_CONFIG["Phase 4: Anchored (Native)"]["id"],
device_map="cpu",
torch_dtype=torch.float32
)
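# Both checkpoints are loaded eagerly at startup in float32 on CPU, so the Space can take
# a while to become responsive after waking from sleep. On hosts with few cores, capping
# torch's thread pool is an optional tweak (illustrative value, not tuned for this Space):
# torch.set_num_threads(4)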
def local_translate(model, tokenizer, text, temp):
    if not text.strip():
        return ""
    prompt = f"### INGLIŻ: {text}\n### MALTI:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=temp,
            do_sample=temp > 0.1,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id
        )
    # 1. Decode while skipping standard special tokens
    decoded_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    # 2. Extract only the Maltese part
    if "### MALTI:" in decoded_text:
        maltese_text = decoded_text.split("### MALTI:")[-1]
    else:
        maltese_text = decoded_text
    # 3. CRITICAL: Manual cleaning of the end-of-text string if it still remains
    #    This removes <|endoftext|>, </s>, and any extra whitespace
    clean_text = maltese_text.replace("<|endoftext|>", "").replace("</s>", "").strip()
    return clean_text
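# Illustrative smoke test (an assumption, not part of the app flow): local_translate can be
# called directly to sanity-check a checkpoint outside the UI. Kept commented out so the
# Space only serves the Gradio demo.
# print(local_translate(model_p2, tokenizer_p2, "The ferry to Gozo leaves every 45 minutes.", 0.1))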
def translate_logic(text, selected_models, temp):
out_p2 = "Model not selected."
out_p4 = "Model not selected."
if "Phase 2: Stable (Formal)" in selected_models:
try:
out_p2 = local_translate(model_p2, tokenizer_p2, text, temp)
except Exception as e:
out_p2 = f"Error: {str(e)}"
if "Phase 4: Anchored (Native)" in selected_models:
try:
out_p4 = local_translate(model_p4, tokenizer_p4, text, temp)
except Exception as e:
out_p4 = f"Error: {str(e)}"
return out_p2, out_p4
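# The two return values are positional: they map, in order, onto the [p2_out, p4_out]
# outputs wired up in btn.click() below, so an unselected model simply keeps its placeholder text.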
# --- GRADIO UI ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown("# 🇲🇹 Maltese-MT Lab")
gr.Markdown("Compare English-to-Maltese EuroLLM models running locally on CPU.")
with gr.Row():
with gr.Column(scale=2):
input_text = gr.Textbox(label="English Source Text", placeholder="Enter English text...", lines=4)
model_selector = gr.CheckboxGroup(
choices=list(MODELS_CONFIG.keys()),
value=list(MODELS_CONFIG.keys()),
label="Select Models"
)
temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature")
btn = gr.Button("🚀 Run Translation", variant="primary")
with gr.Row():
with gr.Column():
gr.Markdown("### Phase 2: Stable (Formal)")
p2_out = gr.Textbox(label="Output", interactive=False, lines=5)
gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 2: Stable (Formal)']['description']}")
gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['comet']}`")
with gr.Column():
gr.Markdown("### Phase 4: Anchored (Native)")
p4_out = gr.Textbox(label="Output", interactive=False, lines=5)
gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 4: Anchored (Native)']['description']}")
gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['comet']}`")
gr.Examples(
examples=[
["The ferry to Gozo leaves every 45 minutes."],
["We now have 4-month-old mice that are non-diabetic that used to be diabetic."],
["This regulation shall be binding in its entirety and directly applicable in all Member States."]
],
inputs=input_text
)
btn.click(
fn=translate_logic,
inputs=[input_text, model_selector, temp_slider],
outputs=[p2_out, p4_out]
)
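# Optional, not enabled here: for a shared Space with many concurrent visitors, Gradio's
# request queue (demo.queue()) could be enabled before launching; light single-user use
# works fine without it.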
if __name__ == "__main__":
    demo.launch()