import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# --- MODEL DATA ---
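# Each entry maps a UI display name to its Hugging Face model id, the blurb
# shown beside the output box, and the ChrF++/COMET scores reported for that
# training phase.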
MODELS_CONFIG = {
"Phase 2: Stable (Formal)": {
"id": "st192011/Maltese-EuroLLM-1.7B-Phase2-Stable",
"description": (
"The 'Bureaucrat Bot'. Built upon a foundational adaptation phase that mixed "
"monolingual Maltese and Italian to bridge morphological roots. This version "
"was fine-tuned on high-fidelity EU and governmental parallel corpora, "
"optimizing it for extreme formal precision and administrative accuracy."
),
"chrf": "60.18",
"comet": "0.6431"
},
"Phase 4: Anchored (Native)": {
"id": "st192011/Maltese-EuroLLM-1.7B-Phase4-Anchored",
"description": (
"The 'Native Speaker'. An evolution of Phase 2 utilizing a curriculum-based "
"'Full Circle' approach. It integrates synthesized reasoning chains (CoT) "
"that allow the model to process linguistic logic before translating. By mixing "
"all previous data types, it anchors factual accuracy to native-level phrasing "
"and cultural awareness."
),
"chrf": "52.68",
"comet": "0.6567"
}
}
# --- MODEL LOADING (Local CPU) ---
print("Loading models... this might take a minute.")
# Load Phase 2
tokenizer_p2 = AutoTokenizer.from_pretrained(MODELS_CONFIG["Phase 2: Stable (Formal)"]["id"])
model_p2 = AutoModelForCausalLM.from_pretrained(
MODELS_CONFIG["Phase 2: Stable (Formal)"]["id"],
device_map="cpu",
torch_dtype=torch.float32
)
# Load Phase 4
tokenizer_p4 = AutoTokenizer.from_pretrained(MODELS_CONFIG["Phase 4: Anchored (Native)"]["id"])
model_p4 = AutoModelForCausalLM.from_pretrained(
MODELS_CONFIG["Phase 4: Anchored (Native)"]["id"],
device_map="cpu",
torch_dtype=torch.float32
)
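# Both checkpoints load eagerly at startup. In float32, a 1.7B-parameter model
# takes roughly 7 GB of RAM (1.7e9 params x 4 bytes), so ~14 GB for the pair.
# If memory is tight, bfloat16 roughly halves that footprint, at a possible
# cost in CPU speed and output quality (untested assumption for these
# checkpoints):
#   AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", torch_dtype=torch.bfloat16)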
def local_translate(model, tokenizer, text, temp):
    if not text.strip():
        return ""
    # Instruction-style prompt: the English source follows "### INGLIŻ:" and
    # the model completes the Maltese translation after "### MALTI:".
    prompt = f"### INGLIŻ: {text}\n### MALTI:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=temp,
            do_sample=temp > 0.1,  # greedy decoding at the slider minimum
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id
        )
    # 1. Decode while skipping standard special tokens
    decoded_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    # 2. Extract only the Maltese part of the completion
    if "### MALTI:" in decoded_text:
        maltese_text = decoded_text.split("### MALTI:")[-1]
    else:
        maltese_text = decoded_text
    # 3. Manually strip any end-of-text markers the tokenizer left behind
    #    (<|endoftext|>, </s>) and trim surrounding whitespace
    clean_text = maltese_text.replace("<|endoftext|>", "").replace("</s>", "").strip()
    return clean_text
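# Example call (hypothetical output; the actual text depends on the checkpoint
# and, when sampling, on the random seed):
#   local_translate(model_p2, tokenizer_p2, "Good morning.", 0.1)
#   -> a Maltese rendering along the lines of "Bonġu."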
def translate_logic(text, selected_models, temp):
    out_p2 = "Model not selected."
    out_p4 = "Model not selected."
    if "Phase 2: Stable (Formal)" in selected_models:
        try:
            out_p2 = local_translate(model_p2, tokenizer_p2, text, temp)
        except Exception as e:
            out_p2 = f"Error: {str(e)}"
    if "Phase 4: Anchored (Native)" in selected_models:
        try:
            out_p4 = local_translate(model_p4, tokenizer_p4, text, temp)
        except Exception as e:
            out_p4 = f"Error: {str(e)}"
    return out_p2, out_p4
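# translate_logic always returns a (p2, p4) pair so both Gradio output boxes
# stay wired up; a deselected model simply shows the placeholder string.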
# --- GRADIO UI ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🇲🇹 Maltese-MT Lab")
    gr.Markdown("Compare English-to-Maltese EuroLLM models running locally on CPU.")
    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(label="English Source Text", placeholder="Enter English text...", lines=4)
            model_selector = gr.CheckboxGroup(
                choices=list(MODELS_CONFIG.keys()),
                value=list(MODELS_CONFIG.keys()),
                label="Select Models"
            )
            temp_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature")
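            # The slider minimum (0.1) maps to greedy decoding in
            # local_translate; any higher value switches to sampling.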
            btn = gr.Button("🚀 Run Translation", variant="primary")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Phase 2: Stable (Formal)")
            p2_out = gr.Textbox(label="Output", interactive=False, lines=5)
            gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 2: Stable (Formal)']['description']}")
            gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 2: Stable (Formal)']['comet']}`")
        with gr.Column():
            gr.Markdown("### Phase 4: Anchored (Native)")
            p4_out = gr.Textbox(label="Output", interactive=False, lines=5)
            gr.Markdown(f"**Training:** {MODELS_CONFIG['Phase 4: Anchored (Native)']['description']}")
            gr.Markdown(f"**Metrics:** ChrF++: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['chrf']}` | COMET: `{MODELS_CONFIG['Phase 4: Anchored (Native)']['comet']}`")
    gr.Examples(
        examples=[
            ["The ferry to Gozo leaves every 45 minutes."],
            ["We now have 4-month-old mice that are non-diabetic that used to be diabetic."],
            ["This regulation shall be binding in its entirety and directly applicable in all Member States."]
        ],
        inputs=input_text
    )
    btn.click(
        fn=translate_logic,
        inputs=[input_text, model_selector, temp_slider],
        outputs=[p2_out, p4_out]
    )
if __name__ == "__main__":
    demo.launch()