jb100 committed
Commit 1ef1e17 · verified · 1 Parent(s): bd6ee78

Create app.py

Files changed (1)
  1. app.py +1033 -0
app.py ADDED
@@ -0,0 +1,1033 @@
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import PyPDF2
from docx import Document
import tempfile
import os
from typing import Optional, Tuple
import logging
import spaces
import time
import re

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Authentication credentials from environment variables
VALID_USERNAME = os.getenv("USERNAME", "admin")
VALID_PASSWORD = os.getenv("PASSWORD", "password123")

# Session management
authenticated_sessions = set()

def authenticate(username: str, password: str) -> tuple:
    """Authenticate user credentials and return session info"""
    if username == VALID_USERNAME and password == VALID_PASSWORD:
        session_id = f"session_{int(time.time())}_{hash(username)}"
        authenticated_sessions.add(session_id)
        logger.info(f"Successful login for user: {username}")
        return True, session_id
    else:
        logger.warning(f"Failed login attempt for user: {username}")
        return False, None

def is_authenticated(session_id: str) -> bool:
    """Check if session is authenticated"""
    return session_id in authenticated_sessions

def logout_session(session_id: str):
    """Remove session from authenticated sessions"""
    if session_id in authenticated_sessions:
        authenticated_sessions.remove(session_id)
        logger.info(f"Session logged out: {session_id}")

class NLLBTranslator:
    def __init__(self, model_size="600M"):
        self.model = None
        self.tokenizer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_size = model_size
        self.load_model()

    def load_model(self):
        """Load the NLLB model and tokenizer"""
        try:
            # Use the smaller, more stable model by default
            if self.model_size == "600M":
                model_name = "facebook/nllb-200-distilled-600M"
            elif self.model_size == "1.3B":
                model_name = "facebook/nllb-200-1.3B"
            else:  # 3.3B
                model_name = "facebook/nllb-200-3.3B"

            logger.info(f"Loading NLLB model: {model_name}")

            if torch.cuda.is_available():
                logger.info(f"CUDA available: {torch.cuda.get_device_name(0)}")
                torch_dtype = torch.float16
            else:
                logger.warning("CUDA not available, using CPU")
                torch_dtype = torch.float32

            # Load tokenizer
            logger.info("Loading NLLB tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)

            # Load model
            logger.info("Loading NLLB model...")
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                low_cpu_mem_usage=True
            )
            self.model = self.model.to(self.device)
            self.model.eval()

            logger.info("NLLB model loaded successfully!")

        except Exception as e:
            logger.error(f"Error loading NLLB model: {str(e)}")
            raise e

    def split_into_sentences(self, text: str) -> tuple:
        """Split text into sentences while preserving paragraph structure"""
        paragraphs = re.split(r'\n\s*\n', text)

        sentence_list = []
        paragraph_markers = []

        for para_idx, paragraph in enumerate(paragraphs):
            if not paragraph.strip():
                continue

            sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())

            for sent_idx, sentence in enumerate(sentences):
                if sentence.strip():
                    sentence_list.append(sentence.strip())
                    is_para_end = (sent_idx == len(sentences) - 1)
                    is_last_para = (para_idx == len(paragraphs) - 1)
                    paragraph_markers.append({
                        'is_paragraph_end': is_para_end and not is_last_para,
                        'original_sentence': sentence.strip()
                    })

        return sentence_list, paragraph_markers

    def reconstruct_formatting(self, translated_sentences: list, paragraph_markers: list) -> str:
        """Reconstruct text with original paragraph formatting"""
        if len(translated_sentences) != len(paragraph_markers):
            return ' '.join(translated_sentences)

        result = []
        for i, (translation, marker) in enumerate(zip(translated_sentences, paragraph_markers)):
            result.append(translation)

            if marker['is_paragraph_end']:
                result.append('\n\n')
            elif i < len(translated_sentences) - 1:
                result.append(' ')

        return ''.join(result)

    @spaces.GPU
    def translate_text(self, text: str, source_lang: str, target_lang: str) -> str:
        """Translate text from source language to target language"""
        try:
            source_code = LANGUAGE_CODES.get(source_lang)
            target_code = LANGUAGE_CODES.get(target_lang)

            if not source_code or not target_code:
                return f"Unsupported language: {source_lang} or {target_lang}"

            if source_lang == target_lang:
                return text

            logger.info(f"Translating from {source_lang} to {target_lang}")

            # Check if simple or complex text
            if '\n' not in text and len(text.split('.')) <= 2:
                input_sentences = [text.strip()]
                paragraph_markers = None
            else:
                input_sentences, paragraph_markers = self.split_into_sentences(text)
                if not input_sentences:
                    return "No valid text found to translate."

            return self.perform_translation(input_sentences, source_code, target_code, paragraph_markers)

        except Exception as e:
            logger.error(f"Translation error: {str(e)}")
            return f"Error during translation: {str(e)}"

    def perform_translation(self, input_sentences: list, source_code: str, target_code: str, paragraph_markers: list) -> str:
        """Perform the actual translation using NLLB model"""
        batch_size = 2  # Conservative batch size for stability

        # For very long sentences, use single processing
        avg_sentence_length = sum(len(s.split()) for s in input_sentences) / len(input_sentences) if input_sentences else 0
        if avg_sentence_length > 100:
            batch_size = 1

        logger.info(f"Using batch size {batch_size} for average sentence length {avg_sentence_length:.1f} words")

        # Tell the tokenizer which language the input is in so it prepends the
        # correct source-language token (otherwise NLLB defaults to English)
        self.tokenizer.src_lang = source_code

        all_translations = []

        for i in range(0, len(input_sentences), batch_size):
            batch_sentences = input_sentences[i:i + batch_size]

            try:
                # Tokenize input
                inputs = self.tokenizer(
                    batch_sentences,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=512
                ).to(self.device)

                # Generate translation; convert_tokens_to_ids resolves the target
                # language token on both current and older transformers releases
                # (lang_code_to_id was removed from the NLLB tokenizer)
                with torch.no_grad():
                    translated_tokens = self.model.generate(
                        **inputs,
                        forced_bos_token_id=self.tokenizer.convert_tokens_to_ids(target_code),
                        max_length=512,
                        num_beams=4,
                        early_stopping=True,
                        do_sample=False
                    )

                # Decode translations
                translations = self.tokenizer.batch_decode(
                    translated_tokens,
                    skip_special_tokens=True
                )

                all_translations.extend(translations)

                # Progress logging
                if len(input_sentences) > 10:
                    progress = min(100, int(((i + batch_size) / len(input_sentences)) * 100))
                    logger.info(f"Translation progress: {progress}%")

            except Exception as e:
                logger.error(f"Translation error in batch: {str(e)}")

                # Fallback: process sentences individually
                for single_sentence in batch_sentences:
                    try:
                        inputs = self.tokenizer(
                            single_sentence,
                            return_tensors="pt",
                            truncation=True,
                            max_length=512
                        ).to(self.device)

                        with torch.no_grad():
                            translated_tokens = self.model.generate(
                                **inputs,
                                forced_bos_token_id=self.tokenizer.convert_tokens_to_ids(target_code),
                                max_length=512,
                                num_beams=4,
                                early_stopping=True
                            )

                        translation = self.tokenizer.decode(
                            translated_tokens[0],
                            skip_special_tokens=True
                        )

                        all_translations.append(translation)

                    except Exception as single_e:
                        logger.error(f"Failed to translate sentence: {str(single_e)}")
                        all_translations.append(f"[Translation failed for: {single_sentence[:50]}...]")

        # Reconstruct formatting
        if paragraph_markers and len(all_translations) == len(paragraph_markers):
            final_translation = self.reconstruct_formatting(all_translations, paragraph_markers)
        else:
            final_translation = ' '.join(all_translations) if all_translations else "Translation failed"

        return final_translation

# NLLB-200 supported languages (comprehensive list)
LANGUAGE_CODES = {
    # Major European Languages
    "English": "eng_Latn",
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Spanish": "spa_Latn",
    "Italian": "ita_Latn",
    "Portuguese": "por_Latn",
    "Russian": "rus_Cyrl",
    "Dutch": "nld_Latn",
    "Polish": "pol_Latn",
    "Czech": "ces_Latn",
    "Swedish": "swe_Latn",
    "Danish": "dan_Latn",
    "Norwegian": "nob_Latn",
    "Finnish": "fin_Latn",
    "Greek": "ell_Grek",
    "Hungarian": "hun_Latn",
    "Romanian": "ron_Latn",
    "Bulgarian": "bul_Cyrl",
    "Croatian": "hrv_Latn",
    "Slovak": "slk_Latn",
    "Ukrainian": "ukr_Cyrl",
    "Belarusian": "bel_Cyrl",
    "Serbian": "srp_Cyrl",
    "Slovenian": "slv_Latn",
    "Estonian": "est_Latn",
    "Latvian": "lav_Latn",
    "Lithuanian": "lit_Latn",
    "Macedonian": "mkd_Cyrl",
    "Albanian": "als_Latn",
    "Bosnian": "bos_Latn",
    "Montenegrin": "cnr_Latn",
    "Maltese": "mlt_Latn",
    "Luxembourgish": "ltz_Latn",

    # Asian Languages - East Asian
    "Chinese (Simplified)": "zho_Hans",
    "Chinese (Traditional)": "zho_Hant",
    "Japanese": "jpn_Jpan",
    "Korean": "kor_Hang",
    "Mongolian": "khk_Cyrl",

    # Asian Languages - Southeast Asian
    "Vietnamese": "vie_Latn",
    "Thai": "tha_Thai",
    "Indonesian": "ind_Latn",
    "Malay": "zsm_Latn",
    "Filipino": "fil_Latn",
    "Tagalog": "tgl_Latn",
    "Javanese": "jav_Latn",
    "Sundanese": "sun_Latn",
    "Burmese": "mya_Mymr",
    "Khmer": "khm_Khmr",
    "Lao": "lao_Laoo",
    "Cebuano": "ceb_Latn",
    "Minangkabau": "min_Latn",
    "Acehnese": "ace_Latn",
    "Balinese": "ban_Latn",
    "Banjarese": "bjn_Latn",
    "Bugis": "bug_Latn",
    "Madurese": "mad_Latn",

    # Asian Languages - South Asian
    "Hindi": "hin_Deva",
    "Bengali": "ben_Beng",
    "Tamil": "tam_Taml",
    "Telugu": "tel_Telu",
    "Marathi": "mar_Deva",
    "Gujarati": "guj_Gujr",
    "Kannada": "kan_Knda",
    "Malayalam": "mal_Mlym",
    "Punjabi": "pan_Guru",
    "Urdu": "urd_Arab",
    "Nepali": "nep_Deva",
    "Sinhala": "sin_Sinh",
    "Assamese": "asm_Beng",
    "Oriya": "ory_Orya",
    "Sanskrit": "san_Deva",
    "Kashmiri": "kas_Arab",
    "Sindhi": "snd_Arab",
    "Maithili": "mai_Deva",
    "Santali": "sat_Olck",
    "Manipuri": "mni_Beng",
    "Bodo": "brx_Deva",
    "Dogri": "doi_Deva",
    "Konkani": "gom_Deva",

    # Middle Eastern Languages
    "Arabic": "arb_Arab",
    "Hebrew": "heb_Hebr",
    "Persian": "pes_Arab",
    "Turkish": "tur_Latn",
    "Kurdish": "ckb_Arab",
    "Pashto": "pbt_Arab",
    "Dari": "prs_Arab",
    "Azerbaijani": "azj_Latn",
    "Kazakh": "kaz_Cyrl",
    "Kyrgyz": "kir_Cyrl",
    "Uzbek": "uzn_Latn",
    "Tajik": "tgk_Cyrl",
    "Turkmen": "tuk_Latn",
    "Uighur": "uig_Arab",
    "Armenian": "hye_Armn",
    "Georgian": "kat_Geor",
    "Amharic": "amh_Ethi",
    "Tigrinya": "tir_Ethi",
    "Oromo": "orm_Ethi",

    # African Languages
    "Swahili": "swh_Latn",
    "Yoruba": "yor_Latn",
    "Igbo": "ibo_Latn",
    "Hausa": "hau_Latn",
    "Zulu": "zul_Latn",
    "Xhosa": "xho_Latn",
    "Afrikaans": "afr_Latn",
    "Somali": "som_Latn",
    "Shona": "sna_Latn",
    "Kinyarwanda": "kin_Latn",
    "Rundi": "run_Latn",
    "Chichewa": "nya_Latn",
    "Luganda": "lug_Latn",
    "Wolof": "wol_Latn",
    "Fula": "fuv_Latn",
    "Twi": "twi_Latn",
    "Lingala": "lin_Latn",
    "Bambara": "bam_Latn",
    "Mossi": "mos_Latn",
    "Ewe": "ewe_Latn",
    "Akan": "aka_Latn",
    "Malagasy": "plt_Latn",
    "Sesotho": "sot_Latn",
    "Tswana": "tsn_Latn",
    "Venda": "ven_Latn",
    "Tsonga": "tso_Latn",
    "Ndebele": "nso_Latn",
    "Swati": "ssw_Latn",

    # European Celtic & Regional Languages
    "Welsh": "cym_Latn",
    "Irish": "gle_Latn",
    "Scottish Gaelic": "gla_Latn",
    "Breton": "bre_Latn",
    "Cornish": "cor_Latn",
    "Manx": "glv_Latn",
    "Basque": "eus_Latn",
    "Catalan": "cat_Latn",
    "Galician": "glg_Latn",
    "Occitan": "oci_Latn",
    "Sardinian": "srd_Latn",
    "Corsican": "cos_Latn",
    "Faroese": "fao_Latn",
    "Icelandic": "isl_Latn",
    "Frisian": "fry_Latn",
    "Kashubian": "csb_Latn",
    "Sorbian": "hsb_Latn",
    "Romansh": "roh_Latn",

    # Americas Indigenous Languages
    "Quechua": "quy_Latn",
    "Guarani": "grn_Latn",
    "Aymara": "ayr_Latn",
    "Nahuatl": "nah_Latn",
    "Maya": "mam_Latn",
    "Wayuu": "guc_Latn",
    "Otomi": "oto_Latn",
    "Zapotec": "zap_Latn",
    "Mixe": "mie_Latn",
    "Tzeltal": "tzh_Latn",
    "Tzotzil": "tzo_Latn",
    "Tarahumara": "tar_Latn",
    "Huichol": "hch_Latn",
    "Mazatec": "maz_Latn",
    "Chatino": "ctp_Latn",
    "Chinantec": "chq_Latn",
    "Mixtec": "mxt_Latn",
    "Triqui": "trc_Latn",
    "Mazahua": "maz_Latn",
    "Purépecha": "tsz_Latn",
    "Totonac": "top_Latn",
    "Huastec": "hus_Latn",
    "Zoque": "zos_Latn",
    "Chol": "ctu_Latn",
    "Mam": "mam_Latn",
    "Kʼicheʼ": "quc_Latn",
    "Kaqchikel": "cak_Latn",
    "Achuar": "acu_Latn",
    "Shuar": "jiv_Latn",
    "Awajún": "agr_Latn",
    "Shipibo": "shp_Latn",
    "Asháninka": "cni_Latn",

    # Pacific Languages
    "Māori": "mri_Latn",
    "Samoan": "smo_Latn",
    "Tongan": "ton_Latn",
    "Fijian": "fij_Latn",
    "Hawaiian": "haw_Latn",
    "Tahitian": "tah_Latn",
    "Chamorro": "cha_Latn",
    "Palauan": "pau_Latn",
    "Marshallese": "mah_Latn",
    "Chuukese": "chk_Latn",
    "Kosraean": "kos_Latn",
    "Pohnpeian": "pon_Latn",
    "Yapese": "yap_Latn",

    # Additional Asian Languages
    "Tibetan": "bod_Tibt",
    "Dzongkha": "dzo_Tibt",
    "Ladakhi": "lbj_Tibt",
    "Sherpa": "xsr_Deva",
    "Newari": "new_Deva",
    "Maithili": "mai_Deva",
    "Bhojpuri": "bho_Deva",
    "Magahi": "mag_Deva",
    "Angika": "anp_Deva",
    "Bajjika": "bpy_Beng",
    "Chittagonian": "ctg_Beng",
    "Sylheti": "syl_Beng",
    "Rohingya": "rhg_Arab",
    "Meitei": "mni_Beng",
    "Tripuri": "trp_Latn",
    "Garo": "grt_Beng",
    "Kokborok": "trp_Latn",
    "Mizo": "lus_Latn",
    "Nagamese": "nag_Latn",
    "Khasi": "kha_Latn",
    "Balochi": "bal_Arab",
    "Brahui": "brh_Arab",
    "Burushaski": "bsk_Arab",
    "Gilgiti": "shx_Arab",
    "Hindko": "hno_Arab",
    "Pahari": "phr_Deva",
    "Garhwali": "gbm_Deva",
    "Kumaoni": "kfy_Deva",

    # Additional African Languages
    "Berber": "ber_Latn",
    "Tamazight": "tzm_Latn",
    "Kabyle": "kab_Latn",
    "Tuareg": "taq_Latn",
    "Nuer": "nus_Latn",
    "Dinka": "din_Latn",
    "Kanuri": "knc_Latn",
    "Tiv": "tiv_Latn",
    "Efik": "efi_Latn",
    "Ibibio": "ibb_Latn",
    "Annang": "anw_Latn",
    "Ijaw": "ijc_Latn",
    "Urhobo": "urh_Latn",
    "Edo": "bin_Latn",
    "Igala": "igl_Latn",
    "Idoma": "idu_Latn",
    "Berom": "bom_Latn",
    "Gbagyi": "gbr_Latn",
    "Nupe": "nup_Latn",
    "Jukun": "jbu_Latn",
    "Chadic": "cdc_Latn",
    "Adamawa": "adm_Latn",
    "Gur": "gur_Latn",
    "Kru": "kru_Latn",
    "Mande": "mnd_Latn",
    "Nilotic": "nil_Latn",
    "Cushitic": "cus_Latn",
    "Omotic": "omo_Latn",
    "Khoisan": "khi_Latn",

    # Sign Languages (limited support)
    "American Sign Language": "ase_Sgnw",
    "British Sign Language": "bfi_Sgnw",
    "French Sign Language": "fsl_Sgnw",
    "German Sign Language": "gsg_Sgnw",
    "Japanese Sign Language": "jsl_Sgnw",
    "Chinese Sign Language": "csl_Sgnw",

    # Historical and Classical Languages
    "Latin": "lat_Latn",
    "Ancient Greek": "grc_Grek",
    "Old Church Slavonic": "chu_Cyrl",
    "Middle English": "enm_Latn",
    "Old English": "ang_Latn",
    "Old Norse": "non_Latn",
    "Gothic": "got_Goth",
    "Aramaic": "arc_Armi",
    "Coptic": "cop_Copt",
    "Ge'ez": "gez_Ethi",
    "Akkadian": "akk_Xsux",
    "Sumerian": "sux_Xsux",
    "Hittite": "hit_Xsux",
    "Phoenician": "phn_Phnx",
    "Ugaritic": "uga_Ugar",
    "Pahlavi": "pal_Phlv",
    "Avestan": "ave_Avst",
    "Old Persian": "peo_Xpeo",
    "Sogdian": "sog_Sogd",
    "Tocharian": "txb_Latn",
    "Khotanese": "kho_Brah",
    "Gandhari": "pgd_Khar",
    "Prakrit": "prc_Brah",
    "Pali": "pli_Latn",
}

# Create a sorted list for better UI
LANGUAGE_NAMES = sorted(LANGUAGE_CODES.keys())

def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from PDF file while preserving paragraph structure"""
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            paragraphs = []

            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text.strip():
                    page_paragraphs = [p.strip() for p in page_text.split('\n\n') if p.strip()]
                    paragraphs.extend(page_paragraphs)

            return '\n\n'.join(paragraphs)
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {str(e)}")
        return f"Error reading PDF: {str(e)}"

def extract_text_from_docx(file_path: str) -> Tuple[str, list]:
    """Extract text from DOCX file while preserving paragraph structure and formatting info"""
    try:
        doc = Document(file_path)
        paragraphs = []
        formatting_info = []

        for para in doc.paragraphs:
            text = para.text.strip()
            if text:
                paragraphs.append(text)

                para_format = {
                    'alignment': para.alignment,
                    'runs': []
                }

                for run in para.runs:
                    if run.text.strip():
                        run_format = {
                            'text': run.text,
                            'bold': run.bold,
                            'italic': run.italic,
                            'underline': run.underline,
                            'font_name': run.font.name,
                            'font_size': run.font.size
                        }
                        para_format['runs'].append(run_format)

                formatting_info.append(para_format)

        text = '\n\n'.join(paragraphs)
        return text, formatting_info

    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {str(e)}")
        return f"Error reading DOCX: {str(e)}", []

def create_formatted_docx(translated_paragraphs: list, formatting_info: list, filename: str) -> str:
    """Create a DOCX file with translated text while preserving original formatting"""
    try:
        doc = Document()

        # Remove default paragraph
        if doc.paragraphs:
            p = doc.paragraphs[0]
            p._element.getparent().remove(p._element)

        for i, (para_text, para_format) in enumerate(zip(translated_paragraphs, formatting_info)):
            if not para_text.strip():
                continue

            paragraph = doc.add_paragraph()

            # Apply paragraph formatting
            try:
                if para_format.get('alignment') is not None:
                    paragraph.alignment = para_format['alignment']
            except Exception as e:
                logger.warning(f"Could not apply paragraph formatting: {e}")

            # Apply run formatting
            runs_info = para_format.get('runs', [])

            if runs_info:
                # Get dominant formatting
                total_runs = len(runs_info)
                bold_count = sum(1 for r in runs_info if r.get('bold'))
                italic_count = sum(1 for r in runs_info if r.get('italic'))
                underline_count = sum(1 for r in runs_info if r.get('underline'))

                run = paragraph.add_run(para_text)

                try:
                    if bold_count > total_runs / 2:
                        run.bold = True
                    if italic_count > total_runs / 2:
                        run.italic = True
                    if underline_count > total_runs / 2:
                        run.underline = True
                except Exception as e:
                    logger.warning(f"Could not apply run formatting: {e}")
            else:
                paragraph.add_run(para_text)

        doc.save(filename)
        return filename

    except Exception as e:
        logger.error(f"Error creating formatted DOCX: {str(e)}")
        return create_docx_with_text('\n\n'.join(translated_paragraphs), filename)

def create_docx_with_text(text: str, filename: str) -> str:
    """Create a DOCX file with the given text"""
    try:
        doc = Document()
        paragraphs = text.split('\n\n')

        for para_text in paragraphs:
            if para_text.strip():
                cleaned_text = para_text.replace('\n', ' ').strip()
                doc.add_paragraph(cleaned_text)

        doc.save(filename)
        return filename
    except Exception as e:
        logger.error(f"Error creating DOCX: {str(e)}")
        return None

@spaces.GPU
def translate_text_input(text: str, source_lang: str, target_lang: str, session_id: str = "") -> str:
    """Handle text input translation"""
    if not is_authenticated(session_id):
        return "❌ Please log in to use this feature."

    if not text.strip():
        return "Please enter some text to translate."

    if source_lang not in LANGUAGE_CODES or target_lang not in LANGUAGE_CODES:
        return "Invalid language selection."

    return translator.translate_text(text, source_lang, target_lang)

@spaces.GPU
def translate_document(file, source_lang: str, target_lang: str, session_id: str = "") -> Tuple[Optional[str], str]:
    """Handle document translation while preserving original formatting"""
    if not is_authenticated(session_id):
        return None, "❌ Please log in to use this feature."

    if file is None:
        return None, "Please upload a document."

    if source_lang not in LANGUAGE_CODES or target_lang not in LANGUAGE_CODES:
        return None, "Invalid language selection."

    start_time = time.time()

    try:
        # gr.File(type="filepath") passes a plain path string; fall back to .name
        # for Gradio versions that pass a tempfile-like object instead
        file_path = file if isinstance(file, str) else file.name
        file_extension = os.path.splitext(file_path)[1].lower()
        formatting_info = None

        logger.info(f"Starting document translation: {source_lang} → {target_lang}")

        if file_extension == '.pdf':
            text = extract_text_from_pdf(file_path)
        elif file_extension == '.docx':
            text, formatting_info = extract_text_from_docx(file_path)
        else:
            return None, "Unsupported file format. Please upload PDF or DOCX files only."

        if text.startswith("Error"):
            return None, text

        word_count = len(text.split())
        char_count = len(text)
        logger.info(f"Document stats: {word_count} words, {char_count} characters")

        # Translate the text
        translate_start = time.time()
        translated_text = translator.translate_text(text, source_lang, target_lang)
        translate_end = time.time()

        translate_duration = translate_end - translate_start
        logger.info(f"Core translation took: {translate_duration:.2f} seconds")

        # Create output file
        output_filename = f"translated_{os.path.splitext(os.path.basename(file_path))[0]}.docx"
        output_path = os.path.join(tempfile.gettempdir(), output_filename)

        # Create formatted output
        if formatting_info and file_extension == '.docx':
            translated_paragraphs = translated_text.split('\n\n')

            if len(translated_paragraphs) == len(formatting_info):
                create_formatted_docx(translated_paragraphs, formatting_info, output_path)
            else:
                logger.warning("Paragraph count mismatch, using fallback")
                create_docx_with_text(translated_text, output_path)
        else:
            create_docx_with_text(translated_text, output_path)

        # Calculate timing
        end_time = time.time()
        total_duration = end_time - start_time

        minutes = int(total_duration // 60)
        seconds = int(total_duration % 60)
        time_str = f"{minutes}m {seconds}s" if minutes > 0 else f"{seconds}s"

        # Calculate speed
        if word_count > 0 and total_duration > 0:
            words_per_minute = int((word_count / total_duration) * 60)
            speed_info = f" • Speed: {words_per_minute} words/min"
        else:
            speed_info = ""

        translation_type = "Same language processed" if source_lang == target_lang else "NLLB translation"

        status_message = (
            f"✅ Translation completed successfully!\n"
            f"⏱️ Time taken: {time_str}\n"
            f"📄 Document: {word_count} words, {char_count} characters\n"
            f"🔄 Type: {translation_type}{speed_info}\n"
            f"📝 Original formatting preserved in output file."
        )

        logger.info(f"Document translation completed in {total_duration:.2f} seconds")

        return output_path, status_message

    except Exception as e:
        end_time = time.time()
        total_duration = end_time - start_time
        minutes = int(total_duration // 60)
        seconds = int(total_duration % 60)
        time_str = f"{minutes}m {seconds}s" if minutes > 0 else f"{seconds}s"

        logger.error(f"Document translation error after {time_str}: {str(e)}")
        return None, f"❌ Error during document translation (after {time_str}): {str(e)}"

# Initialize translator
print("Initializing NLLB Translator...")
translator = NLLBTranslator(model_size="600M")  # Use smaller model for stability

# Create the Gradio app
with gr.Blocks(title="NLLB Universal Translator", theme=gr.themes.Soft()) as demo:
    session_state = gr.State("")

    # Login interface
    with gr.Column(visible=True) as login_column:
        gr.Markdown("""
        # 🌍 NLLB Universal Translator - Authentication Required

        Translate between **200+ languages** using Meta's NLLB (No Language Left Behind) model.
        Please enter your credentials to access the translation tool.
        """)

        with gr.Row():
            with gr.Column(scale=1):
                pass

            with gr.Column(scale=2):
                with gr.Group():
                    gr.Markdown("### Login")
                    username_input = gr.Textbox(
                        label="Username",
                        placeholder="Enter username",
                        type="text"
                    )
                    password_input = gr.Textbox(
                        label="Password",
                        placeholder="Enter password",
                        type="password"
                    )
                    login_btn = gr.Button("Login", variant="primary", size="lg")
                    login_status = gr.Markdown("")

            with gr.Column(scale=1):
                pass

        gr.Markdown("""
        ---

        **Features:**
        - 🔒 Secure authentication system
        - 🌍 Support for **200+ languages** using Meta's NLLB model
        - 📄 Document translation with formatting preservation
        - 🚀 High-quality neural machine translation
        - 💾 Preserves original document formatting and styling
        - 🗺️ Includes indigenous, regional, and low-resource languages
        - 📚 Historical and classical languages support
        """)

    # Main translator interface
    with gr.Column(visible=False) as main_column:
        gr.Markdown("""
        # 🌍 NLLB Universal Translator

        Translate text and documents between **200+ languages** using Meta's NLLB model.
        Supports major world languages plus indigenous, regional, and low-resource languages.
        """)

        with gr.Tabs():
            # Text Translation Tab
            with gr.TabItem("📝 Text Translation"):
                with gr.Row():
                    with gr.Column():
                        text_input = gr.Textbox(
                            label="Input Text",
                            placeholder="Enter text to translate...",
                            lines=6
                        )
                        with gr.Row():
                            source_lang_text = gr.Dropdown(
                                choices=LANGUAGE_NAMES,
                                label="Source Language",
                                value="English",
                                filterable=True
                            )
                            target_lang_text = gr.Dropdown(
                                choices=LANGUAGE_NAMES,
                                label="Target Language",
                                value="Spanish",
                                filterable=True
                            )
                        translate_text_btn = gr.Button("🔄 Translate Text", variant="primary", size="lg")

                    with gr.Column():
                        text_output = gr.Textbox(
                            label="Translated Text",
                            lines=6,
                            interactive=False
                        )

                gr.Markdown("""
                **Supported Languages (200+):**
                - 🇪🇺 **European**: English, Spanish, French, German, Italian, Russian, etc.
                - 🇨🇳 **East Asian**: Chinese, Japanese, Korean, Mongolian
                - 🇮🇳 **South Asian**: Hindi, Bengali, Tamil, Telugu, Urdu, Sanskrit, etc.
                - 🇸🇦 **Middle Eastern**: Arabic, Persian, Hebrew, Turkish, Kurdish
                - 🌍 **African**: Swahili, Yoruba, Hausa, Zulu, Amharic, Berber
                - 🇻🇳 **Southeast Asian**: Vietnamese, Thai, Indonesian, Filipino, Burmese
                - 🏝️ **Pacific**: Māori, Samoan, Hawaiian, Fijian, Tahitian
                - 🏛️ **Historical**: Latin, Ancient Greek, Sanskrit, Aramaic
                - 🗺️ **Indigenous**: Quechua, Guarani, Nahuatl, Maya, and many more
                - 🔤 **Regional**: Welsh, Basque, Catalan, Breton, Faroese
                """)

            # Document Translation Tab
            with gr.TabItem("📄 Document Translation"):
                with gr.Row():
                    with gr.Column():
                        file_input = gr.File(
                            label="📁 Upload Document",
                            file_types=[".pdf", ".docx"],
                            type="filepath"
                        )
                        with gr.Row():
                            source_lang_doc = gr.Dropdown(
                                choices=LANGUAGE_NAMES,
                                label="Source Language",
                                value="English",
                                filterable=True
                            )
                            target_lang_doc = gr.Dropdown(
                                choices=LANGUAGE_NAMES,
                                label="Target Language",
                                value="French",
                                filterable=True
                            )
                        translate_doc_btn = gr.Button("🔄 Translate Document", variant="primary", size="lg")

                        gr.Markdown("""
                        **Document Features:**
                        - 📝 Preserves original formatting
                        - 📋 Maintains paragraph structure
                        - 🎨 Keeps basic styling (bold, italic, underline)
                        - 📊 Supports PDF and DOCX files
                        - 💾 Outputs formatted DOCX file
                        """)

                    with gr.Column():
                        doc_status = gr.Textbox(
                            label="📊 Translation Status",
                            lines=6,
                            interactive=False
                        )
                        doc_output = gr.File(
                            label="📥 Download Translated Document"
                        )

        # Examples
        gr.Examples(
            examples=[
                ["Hello, how are you today?", "English", "Spanish"],
                ["Bonjour, comment allez-vous?", "French", "English"],
                ["你好，你今天好吗？", "Chinese (Simplified)", "English"],
                ["नमस्ते, आप कैसे हैं?", "Hindi", "English"],
                ["مرحبا، كيف حالك؟", "Arabic", "English"],
                ["Machine learning is transforming the world.", "English", "French"],
            ],
            inputs=[text_input, source_lang_text, target_lang_text],
            outputs=[text_output],
            fn=lambda text, src, tgt: translate_text_input(text, src, tgt, ""),
            cache_examples=False,
            label="Try these examples:"
        )

        # Logout functionality
        with gr.Row():
            logout_btn = gr.Button("🔓 Logout", variant="secondary", size="sm")

    def handle_login(username, password):
        success, session_id = authenticate(username, password)
        if success:
            return (
                gr.Markdown("✅ **Login successful!** Welcome to the NLLB Universal Translator."),
                gr.Column(visible=False),
                gr.Column(visible=True),
                session_id
            )
        else:
            return (
                gr.Markdown("❌ **Invalid credentials.** Please check your username and password."),
                gr.Column(visible=True),
                gr.Column(visible=False),
                ""
            )

    def handle_logout(session_id):
        if session_id:
            logout_session(session_id)
        return (
            gr.Column(visible=True),
            gr.Column(visible=False),
            "",
            gr.Textbox(value=""),
            gr.Textbox(value=""),
            gr.Markdown("🔓 **Logged out successfully.** Please login again to continue.")
        )

    # Event handlers
    login_btn.click(
        fn=handle_login,
        inputs=[username_input, password_input],
        outputs=[login_status, login_column, main_column, session_state]
    )

    logout_btn.click(
        fn=handle_logout,
        inputs=[session_state],
        outputs=[login_column, main_column, session_state, username_input, password_input, login_status]
    )

    translate_text_btn.click(
        fn=lambda text, src, tgt, session: translate_text_input(text, src, tgt, session),
        inputs=[text_input, source_lang_text, target_lang_text, session_state],
        outputs=[text_output]
    )

    translate_doc_btn.click(
        fn=lambda file, src, tgt, session: translate_document(file, src, tgt, session),
        inputs=[file_input, source_lang_doc, target_lang_doc, session_state],
        outputs=[doc_output, doc_status]
    )

print("NLLB Universal Translator initialized successfully!")

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)