# NOTE: removed Hugging Face Spaces page chrome that was captured when this
# file was scraped (runtime status, file size, commit hashes, line-number dump).
# ================================================================
# GRADIO UI FOR LUHYA MULTILINGUAL TRANSLATION MODEL
# ================================================================
import gradio as gr
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import time
import json
class LuhyaTranslationInterface:
    """Gradio interface for a fine-tuned M2M100 Luhya translation model.

    Loads the model and tokenizer once at construction, then exposes
    ``translate_text`` for inference and ``create_interface`` to build the
    Blocks UI used by the HuggingFace Space.
    """

    def __init__(self, model_name: str):
        """Load the model/tokenizer and move the model to GPU when available.

        Args:
            model_name: HuggingFace Hub repository id of the fine-tuned model.
        """
        self.model_name = model_name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load model and tokenizer
        print(f"Loading model: {model_name}")
        self.tokenizer = M2M100Tokenizer.from_pretrained(model_name)
        self.model = M2M100ForConditionalGeneration.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

        # Display name -> language code understood by the tokenizer.
        self.languages = {
            "English": "en",
            "Swahili": "sw",
            "Luhya (General)": "luy"
        }
        # Display name -> dialect control token used during fine-tuning
        # (presumably registered as a special token at training time — TODO confirm).
        self.dialects = {
            "Bukusu": "luy_bukusu",
            "Wanga": "luy_wanga",
            "Kisa": "luy_kisa",
            "Maragoli": "luy_maragoli",
            "Tachoni": "luy_tachoni",
            "Kabras": "luy_kabras",
            "Tsotso": "luy_tsotso",
            "Marachi": "luy_marachi",
            "Luwanga": "luy_luwanga"
        }
        # [text, source language, target dialect, description] rows for gr.Examples.
        self.examples = [
            ["Good morning", "English", "Tsotso", "Basic greeting"],
            ["Hello, how are you?", "English", "Bukusu", "Common question"],
            ["Thank you very much", "English", "Wanga", "Gratitude expression"],
            ["What is your name?", "English", "Maragoli", "Personal question"],
            ["I love you", "English", "Kabras", "Emotional expression"],
            ["Where are you going?", "English", "Tachoni", "Direction question"]
        ]

    def translate_text(self, text: str, source_lang: str, target_dialect: str, max_length: int = 128):
        """Translate *text* into the requested Luhya dialect.

        Args:
            text: Source sentence to translate.
            source_lang: Display name; looked up in ``self.languages``
                (defaults to English when unknown).
            target_dialect: Display name; looked up in ``self.dialects``
                (defaults to Bukusu when unknown).
            max_length: Token cap applied to both input truncation and generation.

        Returns:
            Tuple of (translation text, status message, confidence in [0, 1]).
            On failure the first element carries the error description.
        """
        if not text.strip():
            return "Please enter some text to translate.", "", 0.0
        try:
            start_time = time.time()

            # Map display names to codes, with safe fallbacks.
            source_code = self.languages.get(source_lang, "en")
            target_code = self.dialects.get(target_dialect, "luy_bukusu")

            # M2M100 only knows its pretrained language codes, so Luhya input
            # is routed through Swahili, and output is always decoded as Swahili.
            self.tokenizer.src_lang = source_code if source_code in ("en", "sw") else "sw"
            self.tokenizer.tgt_lang = "sw"  # Use Swahili as base target

            # English input is prefixed with the dialect token to steer
            # generation; non-English input is passed through unchanged.
            if source_code != "en":
                input_text = text
            else:
                input_text = f"<{target_code}> {text}"

            inputs = self.tokenizer(
                input_text, return_tensors="pt", max_length=max_length, truncation=True
            ).to(self.device)

            # NOTE(review): stock M2M100 generation usually sets
            # forced_bos_token_id=tokenizer.get_lang_id(tgt); this fine-tune
            # appears to rely on the dialect prefix instead — confirm.
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_length=max_length,
                    num_beams=4,
                    early_stopping=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    # Deterministic beam search; the redundant temperature=1.0
                    # was removed (temperature is ignored when do_sample=False).
                    do_sample=False
                )

            # Keep special tokens so the dialect marker survives for the
            # confidence check, then strip only the BOS/EOS wrappers.
            translation = self.tokenizer.decode(outputs[0], skip_special_tokens=False)
            translation = translation.replace('<s>', '').replace('</s>', '').strip()

            translation_time = time.time() - start_time

            # Heuristic confidence based on dialect token, length, etc.
            confidence = self.calculate_confidence(translation, target_code, text)

            return translation, f"Translation completed in {translation_time:.2f} seconds", confidence
        except Exception as e:
            # Surface the failure in the UI instead of crashing the app.
            return f"Translation error: {str(e)}", "Error occurred during translation", 0.0

    def calculate_confidence(self, translation: str, target_code: str, source_text: str) -> float:
        """Compute a simple heuristic confidence score in [0, 1].

        The score is additive: +0.4 if the target dialect token appears,
        +0.3 if the output is not a copy of the input, +0.2 for a
        reasonable word count, +0.1 if no repetitive/garbled patterns.
        """
        score = 0.0
        # Check if target dialect token is present
        if f"<{target_code}>" in translation:
            score += 0.4
        # Check if translation is not just copying source
        if source_text.lower() not in translation.lower():
            score += 0.3
        # Check reasonable length
        words = translation.split()
        if 1 <= len(words) <= 15:
            score += 0.2
        # Check for repetitive patterns
        if not (".)" in translation or "..." in translation):
            score += 0.1
        return min(1.0, score)

    def create_interface(self):
        """Build and return the Gradio Blocks UI (does not launch it)."""
        # Custom CSS for better styling
        css = """
        .gradio-container {
            font-family: 'Arial', sans-serif;
        }
        .title {
            text-align: center;
            color: #2E8B57;
            margin-bottom: 20px;
        }
        .description {
            text-align: center;
            color: #666;
            margin-bottom: 30px;
        }
        .confidence-high { color: #28a745; }
        .confidence-medium { color: #ffc107; }
        .confidence-low { color: #dc3545; }
        """

        with gr.Blocks(css=css, title="Luhya Multilingual Translator") as demo:
            # Header (emoji below were reconstructed from mojibake in the source).
            gr.HTML("""
            <div class="title">
                <h1>🌍 Luhya Multilingual Translation Model</h1>
            </div>
            <div class="description">
                <p>Translate between English, Swahili, and various Luhya dialects including Bukusu, Wanga, Maragoli, and more.</p>
                <p><em>This model supports bidirectional translation and dialect-specific outputs.</em></p>
            </div>
            """)

            # Main interface
            with gr.Row():
                with gr.Column(scale=1):
                    # Input section
                    gr.HTML("<h3>📝 Input</h3>")
                    input_text = gr.Textbox(
                        label="Text to translate",
                        placeholder="Enter text in English, Swahili, or Luhya...",
                        lines=3,
                        max_lines=5
                    )
                    with gr.Row():
                        source_lang = gr.Dropdown(
                            choices=list(self.languages.keys()),
                            label="Source Language",
                            value="English"
                        )
                        target_dialect = gr.Dropdown(
                            choices=list(self.dialects.keys()),
                            label="Target Dialect",
                            value="Bukusu"
                        )
                    translate_btn = gr.Button("🔄 Translate", variant="primary", size="lg")

                with gr.Column(scale=1):
                    # Output section
                    gr.HTML("<h3>✨ Translation</h3>")
                    output_text = gr.Textbox(
                        label="Translated text",
                        lines=3,
                        max_lines=5,
                        interactive=False
                    )
                    with gr.Row():
                        status_text = gr.Textbox(
                            label="Status",
                            interactive=False,
                            scale=2
                        )
                        confidence_score = gr.Number(
                            label="Confidence",
                            interactive=False,
                            scale=1
                        )

            # Examples section (4th hidden input carries the description column).
            gr.HTML("<h3>💡 Try these examples:</h3>")
            gr.Examples(
                examples=self.examples,
                inputs=[input_text, source_lang, target_dialect, gr.Textbox(visible=False)],
                outputs=[output_text, status_text, confidence_score],
                fn=lambda t, s, d, _: self.translate_text(t, s, d),
                cache_examples=False
            )

            # Information section
            with gr.Accordion("ℹ️ Model Information", open=False):
                gr.HTML(f"""
                <div style="padding: 15px;">
                    <h4>Model Details</h4>
                    <ul>
                        <li><strong>Base Model:</strong> facebook/m2m100_418M</li>
                        <li><strong>Model Repository:</strong> <a href="https://huggingface.co/{self.model_name}" target="_blank">{self.model_name}</a></li>
                        <li><strong>Supported Languages:</strong> English, Swahili</li>
                        <li><strong>Supported Dialects:</strong> Bukusu, Wanga, Kisa, Maragoli, Tachoni, Kabras, Tsotso, Marachi, Luwanga</li>
                        <li><strong>Training:</strong> Fine-tuned on community-sourced Luhya translations</li>
                    </ul>
                    <h4>Usage Tips</h4>
                    <ul>
                        <li>Keep sentences reasonably short (under 100 words) for best results</li>
                        <li>The model works best with common phrases and everyday language</li>
                        <li>Confidence scores indicate model certainty about the translation</li>
                        <li>Try different dialects to see variations in translation</li>
                    </ul>
                    <h4>Cultural Context</h4>
                    <p>This model was developed to support Luhya language preservation and accessibility.
                    Luhya is a group of related Bantu languages spoken in western Kenya by the Luhya people.</p>
                </div>
                """)

            # Wire the translate button to the inference method.
            translate_btn.click(
                fn=self.translate_text,
                inputs=[input_text, source_lang, target_dialect],
                outputs=[output_text, status_text, confidence_score]
            )

            # Footer
            gr.HTML("""
            <div style="text-align: center; margin-top: 30px; padding: 20px; background-color: #f8f9fa; border-radius: 10px;">
                <p><strong>Luhya Multilingual Translation Model</strong></p>
                <p>Built with ❤️ for language preservation and community accessibility</p>
                <p><em>Part of the effort to digitize and preserve African languages</em></p>
            </div>
            """)

        return demo
# ================================================================
# STANDALONE GRADIO APP
# ================================================================
def create_luhya_translator_app(model_name: str = "your-username/luhya-multilingual-m2m100"):
    """Build the Gradio app for the given model repository.

    Args:
        model_name: HuggingFace Hub repository id of the fine-tuned model.

    Returns:
        The constructed (not yet launched) Gradio Blocks demo.
    """
    translator_ui = LuhyaTranslationInterface(model_name)
    return translator_ui.create_interface()
# ================================================================
# FOR HUGGINGFACE SPACES DEPLOYMENT
# ================================================================
# This is the main file that HuggingFace Spaces will run
# This is the main file that HuggingFace Spaces will run
if __name__ == "__main__":
    import os

    # Model repo can be overridden per-deployment via the MODEL_NAME env var.
    model_name = os.getenv("MODEL_NAME", "mamakobe/luhya-multilingual-m2m100")

    # Create and launch the app
    demo = create_luhya_translator_app(model_name)

    # BUGFIX: `enable_queue` and `show_tips` were removed from Blocks.launch()
    # in Gradio 4.x and raise TypeError at startup (the Space's "Runtime
    # error"). Queueing is now enabled via demo.queue() before launch.
    demo.queue()
    demo.launch(
        server_name="0.0.0.0",  # Required for HuggingFace Spaces
        server_port=7860,       # Default port for HuggingFace Spaces
        share=False,            # Don't create public link when on Spaces
        show_error=True         # Show errors in interface
    )
# (end of file — stray scraper artifact removed)