mamakobe committed on
Commit
d0d33dd
·
verified ·
1 Parent(s): abfa033

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +300 -0
app.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ================================================================
2
+ # GRADIO UI FOR LUHYA MULTILINGUAL TRANSLATION MODEL
3
+ # ================================================================
4
+
5
+ import gradio as gr
6
+ import torch
7
+ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
8
+ import time
9
+ import json
10
+
11
+ class LuhyaTranslationInterface:
12
+ """Gradio interface for Luhya translation model"""
13
+
14
+ def __init__(self, model_name: str):
15
+ self.model_name = model_name
16
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
+
18
+ # Load model and tokenizer
19
+ print(f"Loading model: {model_name}")
20
+ self.tokenizer = M2M100Tokenizer.from_pretrained(model_name)
21
+ self.model = M2M100ForConditionalGeneration.from_pretrained(model_name)
22
+ self.model.to(self.device)
23
+ self.model.eval()
24
+
25
+ # Language and dialect mappings
26
+ self.languages = {
27
+ "English": "en",
28
+ "Swahili": "sw",
29
+ "Luhya (General)": "luy"
30
+ }
31
+
32
+ self.dialects = {
33
+ "Bukusu": "luy_bukusu",
34
+ "Wanga": "luy_wanga",
35
+ "Kisa": "luy_kisa",
36
+ "Maragoli": "luy_maragoli",
37
+ "Tachoni": "luy_tachoni",
38
+ "Kabras": "luy_kabras",
39
+ "Tsotso": "luy_tsotso",
40
+ "Marachi": "luy_marachi",
41
+ "Luwanga": "luy_luwanga"
42
+ }
43
+
44
+ # Example translations for quick testing
45
+ self.examples = [
46
+ ["Good morning", "English", "Tsotso", "Basic greeting"],
47
+ ["Hello, how are you?", "English", "Bukusu", "Common question"],
48
+ ["Thank you very much", "English", "Wanga", "Gratitude expression"],
49
+ ["What is your name?", "English", "Maragoli", "Personal question"],
50
+ ["I love you", "English", "Kabras", "Emotional expression"],
51
+ ["Where are you going?", "English", "Tachoni", "Direction question"]
52
+ ]
53
+
54
+ def translate_text(self, text: str, source_lang: str, target_dialect: str, max_length: int = 128):
55
+ """Translate text using the model"""
56
+
57
+ if not text.strip():
58
+ return "Please enter some text to translate.", "", 0.0
59
+
60
+ try:
61
+ start_time = time.time()
62
+
63
+ # Map language names to codes
64
+ source_code = self.languages.get(source_lang, "en")
65
+ target_code = self.dialects.get(target_dialect, "luy_bukusu")
66
+
67
+ # Set tokenizer languages
68
+ self.tokenizer.src_lang = source_code if source_code in ["en", "sw"] else "sw"
69
+ self.tokenizer.tgt_lang = "sw" # Use Swahili as base target
70
+
71
+ # Prepare input text with dialect token
72
+ if source_code != "en":
73
+ # For non-English input, add source dialect token
74
+ input_text = text
75
+ else:
76
+ # For English input, add target dialect token to guide translation
77
+ input_text = f"<{target_code}> {text}"
78
+
79
+ # Tokenize
80
+ inputs = self.tokenizer(input_text, return_tensors="pt", max_length=max_length, truncation=True).to(self.device)
81
+
82
+ # Generate translation
83
+ with torch.no_grad():
84
+ outputs = self.model.generate(
85
+ **inputs,
86
+ max_length=max_length,
87
+ num_beams=4,
88
+ early_stopping=True,
89
+ pad_token_id=self.tokenizer.pad_token_id,
90
+ eos_token_id=self.tokenizer.eos_token_id,
91
+ do_sample=False,
92
+ temperature=1.0
93
+ )
94
+
95
+ # Decode result
96
+ translation = self.tokenizer.decode(outputs[0], skip_special_tokens=False)
97
+ translation = translation.replace('<s>', '').replace('</s>', '').strip()
98
+
99
+ # Calculate translation time
100
+ translation_time = time.time() - start_time
101
+
102
+ # Simple confidence score based on presence of target dialect token and length
103
+ confidence = self.calculate_confidence(translation, target_code, text)
104
+
105
+ return translation, f"Translation completed in {translation_time:.2f} seconds", confidence
106
+
107
+ except Exception as e:
108
+ return f"Translation error: {str(e)}", "Error occurred during translation", 0.0
109
+
110
+ def calculate_confidence(self, translation: str, target_code: str, source_text: str) -> float:
111
+ """Calculate a simple confidence score for the translation"""
112
+ score = 0.0
113
+
114
+ # Check if target dialect token is present
115
+ if f"<{target_code}>" in translation:
116
+ score += 0.4
117
+
118
+ # Check if translation is not just copying source
119
+ if source_text.lower() not in translation.lower():
120
+ score += 0.3
121
+
122
+ # Check reasonable length
123
+ words = translation.split()
124
+ if 1 <= len(words) <= 15:
125
+ score += 0.2
126
+
127
+ # Check for repetitive patterns
128
+ if not (".)" in translation or "..." in translation):
129
+ score += 0.1
130
+
131
+ return min(1.0, score)
132
+
133
+ def create_interface(self):
134
+ """Create the Gradio interface"""
135
+
136
+ # Custom CSS for better styling
137
+ css = """
138
+ .gradio-container {
139
+ font-family: 'Arial', sans-serif;
140
+ }
141
+ .title {
142
+ text-align: center;
143
+ color: #2E8B57;
144
+ margin-bottom: 20px;
145
+ }
146
+ .description {
147
+ text-align: center;
148
+ color: #666;
149
+ margin-bottom: 30px;
150
+ }
151
+ .confidence-high { color: #28a745; }
152
+ .confidence-medium { color: #ffc107; }
153
+ .confidence-low { color: #dc3545; }
154
+ """
155
+
156
+ # Create interface
157
+ with gr.Blocks(css=css, title="Luhya Multilingual Translator") as demo:
158
+
159
+ # Header
160
+ gr.HTML("""
161
+ <div class="title">
162
+ <h1>🌍 Luhya Multilingual Translation Model</h1>
163
+ </div>
164
+ <div class="description">
165
+ <p>Translate between English, Swahili, and various Luhya dialects including Bukusu, Wanga, Maragoli, and more.</p>
166
+ <p><em>This model supports bidirectional translation and dialect-specific outputs.</em></p>
167
+ </div>
168
+ """)
169
+
170
+ # Main interface
171
+ with gr.Row():
172
+ with gr.Column(scale=1):
173
+ # Input section
174
+ gr.HTML("<h3>πŸ“ Input</h3>")
175
+
176
+ input_text = gr.Textbox(
177
+ label="Text to translate",
178
+ placeholder="Enter text in English, Swahili, or Luhya...",
179
+ lines=3,
180
+ max_lines=5
181
+ )
182
+
183
+ with gr.Row():
184
+ source_lang = gr.Dropdown(
185
+ choices=list(self.languages.keys()),
186
+ label="Source Language",
187
+ value="English"
188
+ )
189
+
190
+ target_dialect = gr.Dropdown(
191
+ choices=list(self.dialects.keys()),
192
+ label="Target Dialect",
193
+ value="Bukusu"
194
+ )
195
+
196
+ translate_btn = gr.Button("πŸ”„ Translate", variant="primary", size="lg")
197
+
198
+ with gr.Column(scale=1):
199
+ # Output section
200
+ gr.HTML("<h3>✨ Translation</h3>")
201
+
202
+ output_text = gr.Textbox(
203
+ label="Translated text",
204
+ lines=3,
205
+ max_lines=5,
206
+ interactive=False
207
+ )
208
+
209
+ with gr.Row():
210
+ status_text = gr.Textbox(
211
+ label="Status",
212
+ interactive=False,
213
+ scale=2
214
+ )
215
+
216
+ confidence_score = gr.Number(
217
+ label="Confidence",
218
+ interactive=False,
219
+ scale=1
220
+ )
221
+
222
+ # Examples section
223
+ gr.HTML("<h3>πŸ’‘ Try these examples:</h3>")
224
+
225
+ examples_component = gr.Examples(
226
+ examples=self.examples,
227
+ inputs=[input_text, source_lang, target_dialect, gr.Textbox(visible=False)],
228
+ outputs=[output_text, status_text, confidence_score],
229
+ fn=lambda t, s, d, _: self.translate_text(t, s, d),
230
+ cache_examples=False
231
+ )
232
+
233
+ # Information section
234
+ with gr.Accordion("ℹ️ Model Information", open=False):
235
+ gr.HTML(f"""
236
+ <div style="padding: 15px;">
237
+ <h4>Model Details</h4>
238
+ <ul>
239
+ <li><strong>Base Model:</strong> facebook/m2m100_418M</li>
240
+ <li><strong>Model Repository:</strong> <a href="https://huggingface.co/{self.model_name}" target="_blank">{self.model_name}</a></li>
241
+ <li><strong>Supported Languages:</strong> English, Swahili</li>
242
+ <li><strong>Supported Dialects:</strong> Bukusu, Wanga, Kisa, Maragoli, Tachoni, Kabras, Tsotso, Marachi, Luwanga</li>
243
+ <li><strong>Training:</strong> Fine-tuned on community-sourced Luhya translations</li>
244
+ </ul>
245
+
246
+ <h4>Usage Tips</h4>
247
+ <ul>
248
+ <li>Keep sentences reasonably short (under 100 words) for best results</li>
249
+ <li>The model works best with common phrases and everyday language</li>
250
+ <li>Confidence scores indicate model certainty about the translation</li>
251
+ <li>Try different dialects to see variations in translation</li>
252
+ </ul>
253
+
254
+ <h4>Cultural Context</h4>
255
+ <p>This model was developed to support Luhya language preservation and accessibility.
256
+ Luhya is a group of related Bantu languages spoken in western Kenya by the Luhya people.</p>
257
+ </div>
258
+ """)
259
+
260
+ # Set up the translation function
261
+ translate_btn.click(
262
+ fn=self.translate_text,
263
+ inputs=[input_text, source_lang, target_dialect],
264
+ outputs=[output_text, status_text, confidence_score]
265
+ )
266
+
267
+ # Footer
268
+ gr.HTML("""
269
+ <div style="text-align: center; margin-top: 30px; padding: 20px; background-color: #f8f9fa; border-radius: 10px;">
270
+ <p><strong>Luhya Multilingual Translation Model</strong></p>
271
+ <p>Built with ❀️ for language preservation and community accessibility</p>
272
+ <p><em>Part of the effort to digitize and preserve African languages</em></p>
273
+ </div>
274
+ """)
275
+
276
+ return demo
277
+
278
# ================================================================
# FOR HUGGINGFACE SPACES DEPLOYMENT
# ================================================================

# This is the main file that HuggingFace Spaces will run.
if __name__ == "__main__":
    import os

    # Get model name from environment variable or use default.
    model_name = os.getenv("MODEL_NAME", "mamakobe/luhya-multilingual-m2m100")

    # BUG FIX: the original called `create_luhya_translator_app(model_name)`,
    # which is not defined anywhere in this file and raised NameError on
    # startup. Build the app via the class defined above instead.
    app = LuhyaTranslationInterface(model_name)
    demo = app.create_interface()

    # BUG FIX: `enable_queue` and `show_tips` were removed from
    # Gradio's launch() signature; queueing is now enabled via queue().
    demo.queue()

    # Launch with settings appropriate for HuggingFace Spaces.
    demo.launch(
        server_name="0.0.0.0",  # Bind to all interfaces (required on Spaces)
        server_port=7860,       # Default port for HuggingFace Spaces
        share=False,            # Don't create a public share link on Spaces
        show_error=True         # Show errors in the interface
    )