File size: 13,009 Bytes
d0d33dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8098c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0d33dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8098c7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# ================================================================
# GRADIO UI FOR LUHYA MULTILINGUAL TRANSLATION MODEL
# ================================================================

import gradio as gr
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import time
import json

class LuhyaTranslationInterface:
    """Gradio interface for a fine-tuned Luhya M2M100 translation model.

    Loads the checkpoint once at construction time, then builds a Gradio
    Blocks UI with a source-language dropdown, a target-dialect dropdown,
    clickable examples, and a heuristic confidence score for each
    translation.
    """
    
    def __init__(self, model_name: str) -> None:
        """Load model/tokenizer onto the best device and set up lookup tables.

        Args:
            model_name: HuggingFace Hub repository id of the fine-tuned
                M2M100 model (e.g. "user/luhya-multilingual-m2m100").
        """
        self.model_name = model_name
        # Prefer GPU when available; inputs are moved to the same device later.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Load model and tokenizer (blocking; may download from the Hub).
        print(f"Loading model: {model_name}")
        self.tokenizer = M2M100Tokenizer.from_pretrained(model_name)
        self.model = M2M100ForConditionalGeneration.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()
        
        # UI label -> language code used by the tokenizer / prompt builder.
        self.languages = {
            "English": "en",
            "Swahili": "sw",
            "Luhya (General)": "luy"
        }
        
        # UI label -> dialect token string injected into the model input
        # (presumably these tokens were added to the vocab during
        # fine-tuning — not verifiable from this file).
        self.dialects = {
            "Bukusu": "luy_bukusu",
            "Wanga": "luy_wanga", 
            "Kisa": "luy_kisa",
            "Maragoli": "luy_maragoli",
            "Tachoni": "luy_tachoni",
            "Kabras": "luy_kabras",
            "Tsotso": "luy_tsotso",
            "Marachi": "luy_marachi",
            "Luwanga": "luy_luwanga"
        }
        
        # Example rows for the gr.Examples widget: (text, source language,
        # target dialect, free-text description). The fourth column is fed
        # to a hidden Textbox and ignored by translate_text.
        self.examples = [
            ["Good morning", "English", "Tsotso", "Basic greeting"],
            ["Hello, how are you?", "English", "Bukusu", "Common question"],
            ["Thank you very much", "English", "Wanga", "Gratitude expression"],
            ["What is your name?", "English", "Maragoli", "Personal question"],
            ["I love you", "English", "Kabras", "Emotional expression"],
            ["Where are you going?", "English", "Tachoni", "Direction question"]
        ]
    
    def translate_text(self, text: str, source_lang: str, target_dialect: str, max_length: int = 128) -> "tuple[str, str, float]":
        """Translate ``text`` into the requested Luhya dialect.

        Args:
            text: Source text to translate.
            source_lang: UI label, looked up in ``self.languages``
                (unknown labels fall back to English).
            target_dialect: UI label, looked up in ``self.dialects``
                (unknown labels fall back to Bukusu).
            max_length: Token budget for both tokenization and generation.

        Returns:
            A ``(translation, status_message, confidence)`` triple; on error
            the first two slots carry the error text and confidence is 0.0.
        """
        
        # Guard clause: empty/whitespace-only input short-circuits cleanly.
        if not text.strip():
            return "Please enter some text to translate.", "", 0.0
        
        try:
            start_time = time.time()
            
            # Map UI labels to codes, with safe fallbacks.
            source_code = self.languages.get(source_lang, "en")
            target_code = self.dialects.get(target_dialect, "luy_bukusu")
            
            # M2M100's vocab has no "luy" code, so anything that is not
            # en/sw is routed through "sw". The target side is always "sw";
            # presumably the fine-tune emits dialect text within the Swahili
            # token space — TODO confirm against the training setup.
            self.tokenizer.src_lang = source_code if source_code in ["en", "sw"] else "sw"
            self.tokenizer.tgt_lang = "sw"  # Use Swahili as base target
            
            # Steer the decoder with an inline dialect token for English
            # input; non-English input is passed through untouched.
            if source_code != "en":
                # For non-English input, add source dialect token
                input_text = text
            else:
                # For English input, add target dialect token to guide translation
                input_text = f"<{target_code}> {text}"
            
            # Tokenize (truncating to max_length) and move to the model device.
            inputs = self.tokenizer(input_text, return_tensors="pt", max_length=max_length, truncation=True).to(self.device)
            
            # Beam-search generation; no gradients needed at inference time.
            # NOTE(review): no forced_bos_token_id is set — stock M2M100
            # normally requires forced_bos_token_id=tokenizer.get_lang_id(tgt)
            # to fix the output language; presumably the fine-tune compensates
            # via the inline dialect token — confirm with the model card.
            # temperature=1.0 is a no-op while do_sample=False (greedy/beam);
            # newer transformers versions warn about this combination.
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_length=max_length,
                    num_beams=4,
                    early_stopping=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    do_sample=False,
                    temperature=1.0
                )
            
            # Decode WITHOUT skipping special tokens so any emitted
            # <luy_*> dialect marker survives for calculate_confidence;
            # only the sentence markers are stripped by hand.
            translation = self.tokenizer.decode(outputs[0], skip_special_tokens=False)
            translation = translation.replace('<s>', '').replace('</s>', '').strip()
            
            # Wall-clock latency reported in the status message.
            translation_time = time.time() - start_time
            
            # Heuristic confidence (see calculate_confidence for weights).
            confidence = self.calculate_confidence(translation, target_code, text)
            
            return translation, f"Translation completed in {translation_time:.2f} seconds", confidence
            
        except Exception as e:
            # Broad catch is deliberate: the UI must always get a 3-tuple
            # back instead of crashing the Gradio worker.
            return f"Translation error: {str(e)}", "Error occurred during translation", 0.0
    
    def calculate_confidence(self, translation: str, target_code: str, source_text: str) -> float:
        """Score a translation in [0, 1] with additive heuristics.

        Weights: dialect token present (+0.4), output is not a copy of the
        input (+0.3), plausible word count (+0.2), no degenerate repetition
        markers (+0.1). This is a rough proxy, not a model probability.
        """
        score = 0.0
        
        # +0.4: the expected dialect marker made it into the output.
        if f"<{target_code}>" in translation:
            score += 0.4
        
        # +0.3: the model did not simply echo the source text.
        if source_text.lower() not in translation.lower():
            score += 0.3
        
        # +0.2: output length is in a plausible range for short phrases.
        words = translation.split()
        if 1 <= len(words) <= 15:
            score += 0.2
        
        # +0.1: none of the patterns that tend to mark degenerate output
        # (".)" / "...") — presumably chosen from observed failure modes.
        if not (".)" in translation or "..." in translation):
            score += 0.1
        
        return min(1.0, score)
    
    def create_interface(self):
        """Build and return the Gradio Blocks demo (not launched here)."""
        
        # Custom CSS for better styling
        css = """
        .gradio-container {
            font-family: 'Arial', sans-serif;
        }
        .title {
            text-align: center;
            color: #2E8B57;
            margin-bottom: 20px;
        }
        .description {
            text-align: center;
            color: #666;
            margin-bottom: 30px;
        }
        .confidence-high { color: #28a745; }
        .confidence-medium { color: #ffc107; }
        .confidence-low { color: #dc3545; }
        """
        
        # Create interface
        with gr.Blocks(css=css, title="Luhya Multilingual Translator") as demo:
            
            # Header banner (static HTML).
            gr.HTML("""
            <div class="title">
                <h1>🌍 Luhya Multilingual Translation Model</h1>
            </div>
            <div class="description">
                <p>Translate between English, Swahili, and various Luhya dialects including Bukusu, Wanga, Maragoli, and more.</p>
                <p><em>This model supports bidirectional translation and dialect-specific outputs.</em></p>
            </div>
            """)
            
            # Two-column layout: input controls on the left, results right.
            with gr.Row():
                with gr.Column(scale=1):
                    # Input section
                    gr.HTML("<h3>πŸ“ Input</h3>")
                    
                    input_text = gr.Textbox(
                        label="Text to translate",
                        placeholder="Enter text in English, Swahili, or Luhya...",
                        lines=3,
                        max_lines=5
                    )
                    
                    with gr.Row():
                        source_lang = gr.Dropdown(
                            choices=list(self.languages.keys()),
                            label="Source Language",
                            value="English"
                        )
                        
                        target_dialect = gr.Dropdown(
                            choices=list(self.dialects.keys()),
                            label="Target Dialect",
                            value="Bukusu"
                        )
                    
                    translate_btn = gr.Button("πŸ”„ Translate", variant="primary", size="lg")
                
                with gr.Column(scale=1):
                    # Output section
                    gr.HTML("<h3>✨ Translation</h3>")
                    
                    output_text = gr.Textbox(
                        label="Translated text",
                        lines=3,
                        max_lines=5,
                        interactive=False
                    )
                    
                    with gr.Row():
                        status_text = gr.Textbox(
                            label="Status",
                            interactive=False,
                            scale=2
                        )
                        
                        confidence_score = gr.Number(
                            label="Confidence",
                            interactive=False,
                            scale=1
                        )
            
            # Examples section
            gr.HTML("<h3>πŸ’‘ Try these examples:</h3>")
            
            # The hidden Textbox absorbs the 4th example column (description);
            # the lambda drops it before delegating to translate_text.
            examples_component = gr.Examples(
                examples=self.examples,
                inputs=[input_text, source_lang, target_dialect, gr.Textbox(visible=False)],
                outputs=[output_text, status_text, confidence_score],
                fn=lambda t, s, d, _: self.translate_text(t, s, d),
                cache_examples=False
            )
            
            # Collapsible model/usage information panel.
            with gr.Accordion("ℹ️ Model Information", open=False):
                gr.HTML(f"""
                <div style="padding: 15px;">
                    <h4>Model Details</h4>
                    <ul>
                        <li><strong>Base Model:</strong> facebook/m2m100_418M</li>
                        <li><strong>Model Repository:</strong> <a href="https://huggingface.co/{self.model_name}" target="_blank">{self.model_name}</a></li>
                        <li><strong>Supported Languages:</strong> English, Swahili</li>
                        <li><strong>Supported Dialects:</strong> Bukusu, Wanga, Kisa, Maragoli, Tachoni, Kabras, Tsotso, Marachi, Luwanga</li>
                        <li><strong>Training:</strong> Fine-tuned on community-sourced Luhya translations</li>
                    </ul>
                    
                    <h4>Usage Tips</h4>
                    <ul>
                        <li>Keep sentences reasonably short (under 100 words) for best results</li>
                        <li>The model works best with common phrases and everyday language</li>
                        <li>Confidence scores indicate model certainty about the translation</li>
                        <li>Try different dialects to see variations in translation</li>
                    </ul>
                    
                    <h4>Cultural Context</h4>
                    <p>This model was developed to support Luhya language preservation and accessibility. 
                    Luhya is a group of related Bantu languages spoken in western Kenya by the Luhya people.</p>
                </div>
                """)
            
            # Wire the button to the translation method.
            translate_btn.click(
                fn=self.translate_text,
                inputs=[input_text, source_lang, target_dialect],
                outputs=[output_text, status_text, confidence_score]
            )
            
            # Footer
            gr.HTML("""
            <div style="text-align: center; margin-top: 30px; padding: 20px; background-color: #f8f9fa; border-radius: 10px;">
                <p><strong>Luhya Multilingual Translation Model</strong></p>
                <p>Built with ❀️ for language preservation and community accessibility</p>
                <p><em>Part of the effort to digitize and preserve African languages</em></p>
            </div>
            """)
        
        return demo

# ================================================================
# STANDALONE GRADIO APP
# ================================================================

def create_luhya_translator_app(model_name: str = "your-username/luhya-multilingual-m2m100"):
    """Build the Gradio demo for the Luhya translator.

    Args:
        model_name: HuggingFace Hub repository id of the fine-tuned model;
            the default is a placeholder meant to be overridden.

    Returns:
        A ``gr.Blocks`` demo, ready to be ``launch()``-ed by the caller.
    """
    # Construct the interface wrapper (loads the model) and hand back
    # the assembled Blocks object in one expression.
    return LuhyaTranslationInterface(model_name).create_interface()

# ================================================================
# FOR HUGGINGFACE SPACES DEPLOYMENT
# ================================================================

# This is the main file that HuggingFace Spaces will run
if __name__ == "__main__":
    import os
    
    # The model repo id is environment-configurable so a Space can swap
    # checkpoints without a code change; falls back to the published model.
    model_name = os.getenv("MODEL_NAME", "mamakobe/luhya-multilingual-m2m100")
    
    # Create and launch the app
    demo = create_luhya_translator_app(model_name)
    
    # Enable request queueing explicitly. The old `enable_queue=True` and
    # `show_tips=True` launch() kwargs were removed in Gradio 4.x and make
    # launch() raise TypeError on current installs.
    demo.queue()
    
    # Launch with specific settings for HuggingFace Spaces
    demo.launch(
        server_name="0.0.0.0",  # Bind all interfaces — required on HF Spaces
        server_port=7860,       # Default port for HuggingFace Spaces
        share=False,            # Spaces already provides the public URL
        show_error=True         # Surface exceptions in the interface
    )