# Source: Hugging Face Space by mamakobe -- "Update app.py" (commit e8098c7, verified)
# ================================================================
# GRADIO UI FOR LUHYA MULTILINGUAL TRANSLATION MODEL
# ================================================================
import gradio as gr
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import time
import json
class LuhyaTranslationInterface:
    """Gradio front-end for the fine-tuned M2M100 Luhya translation model."""

    def __init__(self, model_name: str):
        """Load the model/tokenizer onto the best available device and
        prepare the language/dialect lookup tables and UI examples."""
        self.model_name = model_name
        device_type = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_type)

        # Load model and tokenizer (inference only, so switch to eval mode).
        print(f"Loading model: {model_name}")
        self.tokenizer = M2M100Tokenizer.from_pretrained(model_name)
        self.model = M2M100ForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.model.eval()

        # UI display name -> language code consumed by translate_text.
        self.languages = {"English": "en", "Swahili": "sw", "Luhya (General)": "luy"}

        # Every dialect code is "luy_" + the lower-cased display name.
        dialect_names = [
            "Bukusu", "Wanga", "Kisa", "Maragoli", "Tachoni",
            "Kabras", "Tsotso", "Marachi", "Luwanga",
        ]
        self.dialects = {name: f"luy_{name.lower()}" for name in dialect_names}

        # (text, source language, target dialect, description) rows for gr.Examples.
        self.examples = [
            ["Good morning", "English", "Tsotso", "Basic greeting"],
            ["Hello, how are you?", "English", "Bukusu", "Common question"],
            ["Thank you very much", "English", "Wanga", "Gratitude expression"],
            ["What is your name?", "English", "Maragoli", "Personal question"],
            ["I love you", "English", "Kabras", "Emotional expression"],
            ["Where are you going?", "English", "Tachoni", "Direction question"],
        ]
def translate_text(self, text: str, source_lang: str, target_dialect: str, max_length: int = 128):
"""Translate text using the model"""
if not text.strip():
return "Please enter some text to translate.", "", 0.0
try:
start_time = time.time()
# Map language names to codes
source_code = self.languages.get(source_lang, "en")
target_code = self.dialects.get(target_dialect, "luy_bukusu")
# Set tokenizer languages
self.tokenizer.src_lang = source_code if source_code in ["en", "sw"] else "sw"
self.tokenizer.tgt_lang = "sw" # Use Swahili as base target
# Prepare input text with dialect token
if source_code != "en":
# For non-English input, add source dialect token
input_text = text
else:
# For English input, add target dialect token to guide translation
input_text = f"<{target_code}> {text}"
# Tokenize
inputs = self.tokenizer(input_text, return_tensors="pt", max_length=max_length, truncation=True).to(self.device)
# Generate translation
with torch.no_grad():
outputs = self.model.generate(
**inputs,
max_length=max_length,
num_beams=4,
early_stopping=True,
pad_token_id=self.tokenizer.pad_token_id,
eos_token_id=self.tokenizer.eos_token_id,
do_sample=False,
temperature=1.0
)
# Decode result
translation = self.tokenizer.decode(outputs[0], skip_special_tokens=False)
translation = translation.replace('<s>', '').replace('</s>', '').strip()
# Calculate translation time
translation_time = time.time() - start_time
# Simple confidence score based on presence of target dialect token and length
confidence = self.calculate_confidence(translation, target_code, text)
return translation, f"Translation completed in {translation_time:.2f} seconds", confidence
except Exception as e:
return f"Translation error: {str(e)}", "Error occurred during translation", 0.0
def calculate_confidence(self, translation: str, target_code: str, source_text: str) -> float:
"""Calculate a simple confidence score for the translation"""
score = 0.0
# Check if target dialect token is present
if f"<{target_code}>" in translation:
score += 0.4
# Check if translation is not just copying source
if source_text.lower() not in translation.lower():
score += 0.3
# Check reasonable length
words = translation.split()
if 1 <= len(words) <= 15:
score += 0.2
# Check for repetitive patterns
if not (".)" in translation or "..." in translation):
score += 0.1
return min(1.0, score)
    def create_interface(self):
        """Create the Gradio interface.

        Builds the full gr.Blocks layout (header, input/output columns,
        clickable examples, model-info accordion, footer) and wires the
        translate button to translate_text. Returns the Blocks app,
        ready for the caller to launch.
        """
        # Custom CSS for better styling
        css = """
.gradio-container {
font-family: 'Arial', sans-serif;
}
.title {
text-align: center;
color: #2E8B57;
margin-bottom: 20px;
}
.description {
text-align: center;
color: #666;
margin-bottom: 30px;
}
.confidence-high { color: #28a745; }
.confidence-medium { color: #ffc107; }
.confidence-low { color: #dc3545; }
"""
        # Create interface
        with gr.Blocks(css=css, title="Luhya Multilingual Translator") as demo:
            # Header
            gr.HTML("""
<div class="title">
<h1>🌍 Luhya Multilingual Translation Model</h1>
</div>
<div class="description">
<p>Translate between English, Swahili, and various Luhya dialects including Bukusu, Wanga, Maragoli, and more.</p>
<p><em>This model supports bidirectional translation and dialect-specific outputs.</em></p>
</div>
""")
            # Main interface: two equal-width columns (input | output)
            with gr.Row():
                with gr.Column(scale=1):
                    # Input section
                    gr.HTML("<h3>πŸ“ Input</h3>")
                    input_text = gr.Textbox(
                        label="Text to translate",
                        placeholder="Enter text in English, Swahili, or Luhya...",
                        lines=3,
                        max_lines=5
                    )
                    with gr.Row():
                        # Choices come from the tables built in __init__.
                        source_lang = gr.Dropdown(
                            choices=list(self.languages.keys()),
                            label="Source Language",
                            value="English"
                        )
                        target_dialect = gr.Dropdown(
                            choices=list(self.dialects.keys()),
                            label="Target Dialect",
                            value="Bukusu"
                        )
                    translate_btn = gr.Button("πŸ”„ Translate", variant="primary", size="lg")
                with gr.Column(scale=1):
                    # Output section (all fields read-only)
                    gr.HTML("<h3>✨ Translation</h3>")
                    output_text = gr.Textbox(
                        label="Translated text",
                        lines=3,
                        max_lines=5,
                        interactive=False
                    )
                    with gr.Row():
                        status_text = gr.Textbox(
                            label="Status",
                            interactive=False,
                            scale=2
                        )
                        confidence_score = gr.Number(
                            label="Confidence",
                            interactive=False,
                            scale=1
                        )
            # Examples section
            gr.HTML("<h3>πŸ’‘ Try these examples:</h3>")
            # The hidden Textbox absorbs the 4th ("description") column of each
            # example row; the lambda drops it before calling translate_text.
            examples_component = gr.Examples(
                examples=self.examples,
                inputs=[input_text, source_lang, target_dialect, gr.Textbox(visible=False)],
                outputs=[output_text, status_text, confidence_score],
                fn=lambda t, s, d, _: self.translate_text(t, s, d),
                cache_examples=False
            )
            # Information section
            with gr.Accordion("ℹ️ Model Information", open=False):
                gr.HTML(f"""
<div style="padding: 15px;">
<h4>Model Details</h4>
<ul>
<li><strong>Base Model:</strong> facebook/m2m100_418M</li>
<li><strong>Model Repository:</strong> <a href="https://huggingface.co/{self.model_name}" target="_blank">{self.model_name}</a></li>
<li><strong>Supported Languages:</strong> English, Swahili</li>
<li><strong>Supported Dialects:</strong> Bukusu, Wanga, Kisa, Maragoli, Tachoni, Kabras, Tsotso, Marachi, Luwanga</li>
<li><strong>Training:</strong> Fine-tuned on community-sourced Luhya translations</li>
</ul>
<h4>Usage Tips</h4>
<ul>
<li>Keep sentences reasonably short (under 100 words) for best results</li>
<li>The model works best with common phrases and everyday language</li>
<li>Confidence scores indicate model certainty about the translation</li>
<li>Try different dialects to see variations in translation</li>
</ul>
<h4>Cultural Context</h4>
<p>This model was developed to support Luhya language preservation and accessibility.
Luhya is a group of related Bantu languages spoken in western Kenya by the Luhya people.</p>
</div>
""")
            # Set up the translation function
            translate_btn.click(
                fn=self.translate_text,
                inputs=[input_text, source_lang, target_dialect],
                outputs=[output_text, status_text, confidence_score]
            )
            # Footer
            gr.HTML("""
<div style="text-align: center; margin-top: 30px; padding: 20px; background-color: #f8f9fa; border-radius: 10px;">
<p><strong>Luhya Multilingual Translation Model</strong></p>
<p>Built with ❀️ for language preservation and community accessibility</p>
<p><em>Part of the effort to digitize and preserve African languages</em></p>
</div>
""")
        return demo
# ================================================================
# STANDALONE GRADIO APP
# ================================================================
def create_luhya_translator_app(model_name: str = "your-username/luhya-multilingual-m2m100"):
    """Build the Gradio demo for the given model repository.

    Instantiates the translation interface (which loads the model) and
    returns the gr.Blocks app; the caller is responsible for launching it.
    """
    return LuhyaTranslationInterface(model_name).create_interface()
# ================================================================
# FOR HUGGINGFACE SPACES DEPLOYMENT
# ================================================================
# This is the main file that HuggingFace Spaces will run
if __name__ == "__main__":
    import os

    # Allow Spaces to override the model repository via an environment variable.
    model_name = os.getenv("MODEL_NAME", "mamakobe/luhya-multilingual-m2m100")
    # Create and launch the app
    demo = create_luhya_translator_app(model_name)
    # FIX: show_tips and enable_queue were removed from launch() in Gradio 4.x
    # and raise TypeError there; queueing is requested via demo.queue() instead.
    demo.queue()
    # Launch with specific settings for HuggingFace Spaces
    demo.launch(
        server_name="0.0.0.0",  # Bind all interfaces -- required on HF Spaces
        server_port=7860,       # Default port for HuggingFace Spaces
        share=False,            # Don't create a public tunnel when on Spaces
        show_error=True,        # Show errors in the interface
    )