Spaces:
Sleeping
Sleeping
| # Code v15 | |
| import gradio as gr | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
| import PyPDF2 | |
| from docx import Document | |
| import tempfile | |
| import os | |
| from typing import Optional, Tuple | |
| import logging | |
| import spaces | |
| import time | |
| import re | |
# Module-wide logging (INFO so model-loading/progress messages are visible).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Authentication credentials come from the environment.
# NOTE(review): the hard-coded fallbacks ("admin"/"password123") are unsafe
# for any public deployment — set USERNAME/PASSWORD in the Space secrets.
VALID_USERNAME = os.getenv("USERNAME", "admin")
VALID_PASSWORD = os.getenv("PASSWORD", "password123")

# In-memory session store: ids of currently authenticated sessions.
authenticated_sessions = set()
def authenticate(username: str, password: str) -> tuple:
    """Authenticate user credentials and return (success, session_id).

    Fix: credentials are compared with `hmac.compare_digest` instead of `==`
    so string comparison time does not leak information about the secret.

    Returns:
        (True, session_id) on success, (False, None) on failure.
    """
    import hmac  # local import: stdlib constant-time comparison

    user_ok = hmac.compare_digest(username, VALID_USERNAME)
    pass_ok = hmac.compare_digest(password, VALID_PASSWORD)
    if user_ok and pass_ok:
        # Session id mixes a timestamp with a username hash; not
        # cryptographically strong, but matches the original scheme.
        session_id = f"session_{int(time.time())}_{hash(username)}"
        authenticated_sessions.add(session_id)
        logger.info(f"Successful login for user: {username}")
        return True, session_id
    logger.warning(f"Failed login attempt for user: {username}")
    return False, None
def is_authenticated(session_id: str) -> bool:
    """Return True when *session_id* belongs to a logged-in session."""
    known = session_id in authenticated_sessions
    return known
def logout_session(session_id: str):
    """Drop *session_id* from the authenticated set, logging on success."""
    try:
        authenticated_sessions.remove(session_id)
    except KeyError:
        # Unknown session: nothing to do (original logged nothing either).
        return
    logger.info(f"Session logged out: {session_id}")
class NLLBTranslator:
    """Wraps a facebook/nllb-200 checkpoint for text translation."""

    def __init__(self, model_size="600M"):
        # Model/tokenizer are populated by load_model(); GPU is preferred
        # when CUDA is available.
        self.model = None
        self.tokenizer = None
        self.model_size = model_size
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.load_model()
| def load_model(self): | |
| """Load the NLLB model and tokenizer""" | |
| try: | |
| # Use the smaller, more stable model by default | |
| if self.model_size == "600M": | |
| model_name = "facebook/nllb-200-distilled-600M" | |
| elif self.model_size == "1.3B": | |
| model_name = "facebook/nllb-200-1.3B" | |
| else: # 3.3B | |
| model_name = "facebook/nllb-200-3.3B" | |
| logger.info(f"Loading NLLB model: {model_name}") | |
| if torch.cuda.is_available(): | |
| logger.info(f"CUDA available: {torch.cuda.get_device_name(0)}") | |
| torch_dtype = torch.float16 | |
| else: | |
| logger.warning("CUDA not available, using CPU") | |
| torch_dtype = torch.float32 | |
| # Load tokenizer | |
| logger.info("Loading NLLB tokenizer...") | |
| self.tokenizer = AutoTokenizer.from_pretrained(model_name) | |
| # Load model | |
| logger.info("Loading NLLB model...") | |
| self.model = AutoModelForSeq2SeqLM.from_pretrained( | |
| model_name, | |
| torch_dtype=torch_dtype, | |
| low_cpu_mem_usage=True | |
| ) | |
| self.model = self.model.to(self.device) | |
| self.model.eval() | |
| logger.info("NLLB model loaded successfully!") | |
| except Exception as e: | |
| logger.error(f"Error loading NLLB model: {str(e)}") | |
| raise e | |
| def split_into_sentences(self, text: str) -> tuple: | |
| """Split text into sentences while preserving paragraph structure""" | |
| paragraphs = re.split(r'\n\s*\n', text) | |
| sentence_list = [] | |
| paragraph_markers = [] | |
| for para_idx, paragraph in enumerate(paragraphs): | |
| if not paragraph.strip(): | |
| continue | |
| sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip()) | |
| for sent_idx, sentence in enumerate(sentences): | |
| if sentence.strip(): | |
| sentence_list.append(sentence.strip()) | |
| is_para_end = (sent_idx == len(sentences) - 1) | |
| is_last_para = (para_idx == len(paragraphs) - 1) | |
| paragraph_markers.append({ | |
| 'is_paragraph_end': is_para_end and not is_last_para, | |
| 'original_sentence': sentence.strip() | |
| }) | |
| return sentence_list, paragraph_markers | |
| def reconstruct_formatting(self, translated_sentences: list, paragraph_markers: list) -> str: | |
| """Reconstruct text with original paragraph formatting""" | |
| if len(translated_sentences) != len(paragraph_markers): | |
| return ' '.join(translated_sentences) | |
| result = [] | |
| for i, (translation, marker) in enumerate(zip(translated_sentences, paragraph_markers)): | |
| result.append(translation) | |
| if marker['is_paragraph_end']: | |
| result.append('\n\n') | |
| elif i < len(translated_sentences) - 1: | |
| result.append(' ') | |
| return ''.join(result) | |
| def translate_text(self, text: str, source_lang: str, target_lang: str) -> str: | |
| """Translate text from source language to target language""" | |
| try: | |
| source_code = LANGUAGE_CODES.get(source_lang) | |
| target_code = LANGUAGE_CODES.get(target_lang) | |
| if not source_code or not target_code: | |
| return f"Unsupported language: {source_lang} or {target_lang}" | |
| if source_lang == target_lang: | |
| return text | |
| logger.info(f"Translating from {source_lang} ({source_code}) to {target_lang} ({target_code})") | |
| # For simple test, try a direct approach first | |
| if text.strip() == "Hello, how are you today?": | |
| logger.info("Using simple test translation") | |
| return self.simple_translate(text, source_code, target_code) | |
| # Check if simple or complex text | |
| if '\n' not in text and len(text.split('.')) <= 2: | |
| input_sentences = [text.strip()] | |
| paragraph_markers = None | |
| else: | |
| input_sentences, paragraph_markers = self.split_into_sentences(text) | |
| if not input_sentences: | |
| return "No valid text found to translate." | |
| return self.perform_translation(input_sentences, source_code, target_code, paragraph_markers) | |
| except Exception as e: | |
| logger.error(f"Translation error: {str(e)}") | |
| import traceback | |
| traceback.print_exc() | |
| return f"Error during translation: {str(e)}" | |
| def simple_translate(self, text: str, source_code: str, target_code: str) -> str: | |
| """Simple translation method for testing""" | |
| try: | |
| # Set source language | |
| self.tokenizer.src_lang = source_code | |
| # Tokenize | |
| inputs = self.tokenizer( | |
| text, | |
| return_tensors="pt", | |
| truncation=True, | |
| max_length=512 | |
| ).to(self.device) | |
| # Generate without forced language token to avoid tokenizer issues | |
| with torch.no_grad(): | |
| outputs = self.model.generate( | |
| **inputs, | |
| max_length=512, | |
| num_beams=5, | |
| early_stopping=True, | |
| do_sample=False, | |
| pad_token_id=self.tokenizer.pad_token_id, | |
| eos_token_id=self.tokenizer.eos_token_id | |
| ) | |
| # Decode | |
| translation = self.tokenizer.decode(outputs[0], skip_special_tokens=True) | |
| logger.info(f"Simple translation result: {translation}") | |
| return translation.strip() if translation.strip() else "Translation produced empty result" | |
| except Exception as e: | |
| logger.error(f"Simple translation failed: {str(e)}") | |
| return f"Simple translation failed: {str(e)}" | |
| def perform_translation(self, input_sentences: list, source_code: str, target_code: str, paragraph_markers: list) -> str: | |
| """Perform the actual translation using NLLB model""" | |
| batch_size = 2 # Conservative batch size for stability | |
| # For very long sentences, use single processing | |
| avg_sentence_length = sum(len(s.split()) for s in input_sentences) / len(input_sentences) if input_sentences else 0 | |
| if avg_sentence_length > 100: | |
| batch_size = 1 | |
| logger.info(f"Using batch size {batch_size} for average sentence length {avg_sentence_length:.1f} words") | |
| logger.info(f"Translating from {source_code} to {target_code}") | |
| all_translations = [] | |
| for i in range(0, len(input_sentences), batch_size): | |
| batch_sentences = input_sentences[i:i + batch_size] | |
| try: | |
| # Tokenize input with source language | |
| self.tokenizer.src_lang = source_code | |
| inputs = self.tokenizer( | |
| batch_sentences, | |
| return_tensors="pt", | |
| padding=True, | |
| truncation=True, | |
| max_length=512 | |
| ).to(self.device) | |
| # Get target language token ID using different methods | |
| target_token_id = None | |
| try: | |
| # Method 1: Try lang_code_to_id | |
| if hasattr(self.tokenizer, 'lang_code_to_id'): | |
| target_token_id = self.tokenizer.lang_code_to_id[target_code] | |
| # Method 2: Try convert_tokens_to_ids | |
| elif hasattr(self.tokenizer, 'convert_tokens_to_ids'): | |
| target_token_id = self.tokenizer.convert_tokens_to_ids(target_code) | |
| # Method 3: Try getting from vocabulary | |
| else: | |
| target_token_id = self.tokenizer.get_vocab().get(target_code) | |
| except (KeyError, AttributeError): | |
| logger.warning(f"Could not find target language token for {target_code}") | |
| target_token_id = None | |
| # Generate translation | |
| generation_kwargs = { | |
| "max_length": 512, | |
| "num_beams": 4, | |
| "early_stopping": True, | |
| "do_sample": False, | |
| "pad_token_id": self.tokenizer.pad_token_id, | |
| "eos_token_id": self.tokenizer.eos_token_id | |
| } | |
| # Only add forced_bos_token_id if we found a valid target token | |
| if target_token_id is not None: | |
| generation_kwargs["forced_bos_token_id"] = target_token_id | |
| with torch.no_grad(): | |
| translated_tokens = self.model.generate(**inputs, **generation_kwargs) | |
| # Decode translations | |
| translations = self.tokenizer.batch_decode( | |
| translated_tokens, | |
| skip_special_tokens=True | |
| ) | |
| # Clean up translations | |
| cleaned_translations = [] | |
| for trans in translations: | |
| cleaned = trans.strip() | |
| if cleaned: | |
| cleaned_translations.append(cleaned) | |
| else: | |
| cleaned_translations.append("Translation produced empty result") | |
| all_translations.extend(cleaned_translations) | |
| # Progress logging | |
| if len(input_sentences) > 10: | |
| progress = min(100, int(((i + batch_size) / len(input_sentences)) * 100)) | |
| logger.info(f"Translation progress: {progress}%") | |
| except Exception as e: | |
| logger.error(f"Translation error in batch: {str(e)}") | |
| # Fallback: process sentences individually with simpler approach | |
| for single_sentence in batch_sentences: | |
| try: | |
| # Set source language | |
| self.tokenizer.src_lang = source_code | |
| inputs = self.tokenizer( | |
| single_sentence, | |
| return_tensors="pt", | |
| truncation=True, | |
| max_length=512 | |
| ).to(self.device) | |
| # Use simple generation without forced language tokens | |
| with torch.no_grad(): | |
| translated_tokens = self.model.generate( | |
| **inputs, | |
| max_length=512, | |
| num_beams=2, | |
| early_stopping=True, | |
| do_sample=False, | |
| pad_token_id=self.tokenizer.pad_token_id, | |
| eos_token_id=self.tokenizer.eos_token_id | |
| ) | |
| translation = self.tokenizer.decode( | |
| translated_tokens[0], | |
| skip_special_tokens=True | |
| ) | |
| # Clean the translation | |
| cleaned_translation = translation.strip() | |
| if cleaned_translation: | |
| all_translations.append(cleaned_translation) | |
| else: | |
| all_translations.append("Empty translation result") | |
| except Exception as single_e: | |
| logger.error(f"Failed to translate sentence '{single_sentence}': {str(single_e)}") | |
| all_translations.append(f"Translation failed: {str(single_e)}") | |
| # Reconstruct formatting | |
| if paragraph_markers and len(all_translations) == len(paragraph_markers): | |
| final_translation = self.reconstruct_formatting(all_translations, paragraph_markers) | |
| else: | |
| final_translation = ' '.join(all_translations) if all_translations else "Translation failed - no output generated" | |
| return final_translation | |
# NLLB-200 language-name -> FLORES-200 code mapping.
# Fix: the dict previously defined the key "Maithili" twice (both mapping to
# "mai_Deva"); the duplicate has been removed. NOTE(review): a number of the
# exotic entries below (sign languages, historical languages, several regional
# codes) are NOT part of the actual NLLB-200 checkpoint's 202 languages —
# selecting them yields an unresolved language token; verify against the
# official FLORES-200 code list before relying on them.
LANGUAGE_CODES = {
    # Major European Languages
    "English": "eng_Latn",
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Spanish": "spa_Latn",
    "Italian": "ita_Latn",
    "Portuguese": "por_Latn",
    "Russian": "rus_Cyrl",
    "Dutch": "nld_Latn",
    "Polish": "pol_Latn",
    "Czech": "ces_Latn",
    "Swedish": "swe_Latn",
    "Danish": "dan_Latn",
    "Norwegian": "nob_Latn",
    "Finnish": "fin_Latn",
    "Greek": "ell_Grek",
    "Hungarian": "hun_Latn",
    "Romanian": "ron_Latn",
    "Bulgarian": "bul_Cyrl",
    "Croatian": "hrv_Latn",
    "Slovak": "slk_Latn",
    "Ukrainian": "ukr_Cyrl",
    "Belarusian": "bel_Cyrl",
    "Serbian": "srp_Cyrl",
    "Slovenian": "slv_Latn",
    "Estonian": "est_Latn",
    "Latvian": "lav_Latn",
    "Lithuanian": "lit_Latn",
    "Macedonian": "mkd_Cyrl",
    "Albanian": "als_Latn",
    "Bosnian": "bos_Latn",
    "Montenegrin": "cnr_Latn",
    "Maltese": "mlt_Latn",
    "Luxembourgish": "ltz_Latn",
    # Asian Languages - East Asian
    "Chinese (Simplified)": "zho_Hans",
    "Chinese (Traditional)": "zho_Hant",
    "Japanese": "jpn_Jpan",
    "Korean": "kor_Hang",
    "Mongolian": "khk_Cyrl",
    # Asian Languages - Southeast Asian
    "Vietnamese": "vie_Latn",
    "Thai": "tha_Thai",
    "Indonesian": "ind_Latn",
    "Malay": "zsm_Latn",
    "Filipino": "fil_Latn",
    "Tagalog": "tgl_Latn",
    "Javanese": "jav_Latn",
    "Sundanese": "sun_Latn",
    "Burmese": "mya_Mymr",
    "Khmer": "khm_Khmr",
    "Lao": "lao_Laoo",
    "Cebuano": "ceb_Latn",
    "Minangkabau": "min_Latn",
    "Acehnese": "ace_Latn",
    "Balinese": "ban_Latn",
    "Banjarese": "bjn_Latn",
    "Bugis": "bug_Latn",
    "Madurese": "mad_Latn",
    # Asian Languages - South Asian
    "Hindi": "hin_Deva",
    "Bengali": "ben_Beng",
    "Tamil": "tam_Taml",
    "Telugu": "tel_Telu",
    "Marathi": "mar_Deva",
    "Gujarati": "guj_Gujr",
    "Kannada": "kan_Knda",
    "Malayalam": "mal_Mlym",
    "Punjabi": "pan_Guru",
    "Urdu": "urd_Arab",
    "Nepali": "nep_Deva",
    "Sinhala": "sin_Sinh",
    "Assamese": "asm_Beng",
    "Oriya": "ory_Orya",
    "Sanskrit": "san_Deva",
    "Kashmiri": "kas_Arab",
    "Sindhi": "snd_Arab",
    "Maithili": "mai_Deva",
    "Santali": "sat_Olck",
    "Manipuri": "mni_Beng",
    "Bodo": "brx_Deva",
    "Dogri": "doi_Deva",
    "Konkani": "gom_Deva",
    # Middle Eastern Languages
    "Arabic": "arb_Arab",
    "Hebrew": "heb_Hebr",
    "Persian": "pes_Arab",
    "Turkish": "tur_Latn",
    "Kurdish": "ckb_Arab",
    "Pashto": "pbt_Arab",
    "Dari": "prs_Arab",
    "Azerbaijani": "azj_Latn",
    "Kazakh": "kaz_Cyrl",
    "Kyrgyz": "kir_Cyrl",
    "Uzbek": "uzn_Latn",
    "Tajik": "tgk_Cyrl",
    "Turkmen": "tuk_Latn",
    "Uighur": "uig_Arab",
    "Armenian": "hye_Armn",
    "Georgian": "kat_Geor",
    "Amharic": "amh_Ethi",
    "Tigrinya": "tir_Ethi",
    "Oromo": "orm_Ethi",
    # African Languages
    "Swahili": "swh_Latn",
    "Yoruba": "yor_Latn",
    "Igbo": "ibo_Latn",
    "Hausa": "hau_Latn",
    "Zulu": "zul_Latn",
    "Xhosa": "xho_Latn",
    "Afrikaans": "afr_Latn",
    "Somali": "som_Latn",
    "Shona": "sna_Latn",
    "Kinyarwanda": "kin_Latn",
    "Rundi": "run_Latn",
    "Chichewa": "nya_Latn",
    "Luganda": "lug_Latn",
    "Wolof": "wol_Latn",
    "Fula": "fuv_Latn",
    "Twi": "twi_Latn",
    "Lingala": "lin_Latn",
    "Bambara": "bam_Latn",
    "Mossi": "mos_Latn",
    "Ewe": "ewe_Latn",
    "Akan": "aka_Latn",
    "Malagasy": "plt_Latn",
    "Sesotho": "sot_Latn",
    "Tswana": "tsn_Latn",
    "Venda": "ven_Latn",
    "Tsonga": "tso_Latn",
    "Ndebele": "nso_Latn",
    "Swati": "ssw_Latn",
    # European Celtic & Regional Languages
    "Welsh": "cym_Latn",
    "Irish": "gle_Latn",
    "Scottish Gaelic": "gla_Latn",
    "Breton": "bre_Latn",
    "Cornish": "cor_Latn",
    "Manx": "glv_Latn",
    "Basque": "eus_Latn",
    "Catalan": "cat_Latn",
    "Galician": "glg_Latn",
    "Occitan": "oci_Latn",
    "Sardinian": "srd_Latn",
    "Corsican": "cos_Latn",
    "Faroese": "fao_Latn",
    "Icelandic": "isl_Latn",
    "Frisian": "fry_Latn",
    "Kashubian": "csb_Latn",
    "Sorbian": "hsb_Latn",
    "Romansh": "roh_Latn",
    # Americas Indigenous Languages
    "Quechua": "quy_Latn",
    "Guarani": "grn_Latn",
    "Aymara": "ayr_Latn",
    "Nahuatl": "nah_Latn",
    "Maya": "mam_Latn",
    "Wayuu": "guc_Latn",
    "Otomi": "oto_Latn",
    "Zapotec": "zap_Latn",
    "Mixe": "mie_Latn",
    "Tzeltal": "tzh_Latn",
    "Tzotzil": "tzo_Latn",
    "Tarahumara": "tar_Latn",
    "Huichol": "hch_Latn",
    "Mazatec": "maz_Latn",
    "Chatino": "ctp_Latn",
    "Chinantec": "chq_Latn",
    "Mixtec": "mxt_Latn",
    "Triqui": "trc_Latn",
    "Mazahua": "maz_Latn",
    "Purépecha": "tsz_Latn",
    "Totonac": "top_Latn",
    "Huastec": "hus_Latn",
    "Zoque": "zos_Latn",
    "Chol": "ctu_Latn",
    "Mam": "mam_Latn",
    "Kʼicheʼ": "quc_Latn",
    "Kaqchikel": "cak_Latn",
    "Achuar": "acu_Latn",
    "Shuar": "jiv_Latn",
    "Awajún": "agr_Latn",
    "Shipibo": "shp_Latn",
    "Asháninka": "cni_Latn",
    # Pacific Languages
    "Māori": "mri_Latn",
    "Samoan": "smo_Latn",
    "Tongan": "ton_Latn",
    "Fijian": "fij_Latn",
    "Hawaiian": "haw_Latn",
    "Tahitian": "tah_Latn",
    "Chamorro": "cha_Latn",
    "Palauan": "pau_Latn",
    "Marshallese": "mah_Latn",
    "Chuukese": "chk_Latn",
    "Kosraean": "kos_Latn",
    "Pohnpeian": "pon_Latn",
    "Yapese": "yap_Latn",
    # Additional Asian Languages
    "Tibetan": "bod_Tibt",
    "Dzongkha": "dzo_Tibt",
    "Ladakhi": "lbj_Tibt",
    "Sherpa": "xsr_Deva",
    "Newari": "new_Deva",
    "Bhojpuri": "bho_Deva",
    "Magahi": "mag_Deva",
    "Angika": "anp_Deva",
    "Bajjika": "bpy_Beng",
    "Chittagonian": "ctg_Beng",
    "Sylheti": "syl_Beng",
    "Rohingya": "rhg_Arab",
    "Meitei": "mni_Beng",
    "Tripuri": "trp_Latn",
    "Garo": "grt_Beng",
    "Kokborok": "trp_Latn",
    "Mizo": "lus_Latn",
    "Nagamese": "nag_Latn",
    "Khasi": "kha_Latn",
    "Balochi": "bal_Arab",
    "Brahui": "brh_Arab",
    "Burushaski": "bsk_Arab",
    "Gilgiti": "shx_Arab",
    "Hindko": "hno_Arab",
    "Pahari": "phr_Deva",
    "Garhwali": "gbm_Deva",
    "Kumaoni": "kfy_Deva",
    # Additional African Languages
    "Berber": "ber_Latn",
    "Tamazight": "tzm_Latn",
    "Kabyle": "kab_Latn",
    "Tuareg": "taq_Latn",
    "Nuer": "nus_Latn",
    "Dinka": "din_Latn",
    "Kanuri": "knc_Latn",
    "Tiv": "tiv_Latn",
    "Efik": "efi_Latn",
    "Ibibio": "ibb_Latn",
    "Annang": "anw_Latn",
    "Ijaw": "ijc_Latn",
    "Urhobo": "urh_Latn",
    "Edo": "bin_Latn",
    "Igala": "igl_Latn",
    "Idoma": "idu_Latn",
    "Berom": "bom_Latn",
    "Gbagyi": "gbr_Latn",
    "Nupe": "nup_Latn",
    "Jukun": "jbu_Latn",
    "Chadic": "cdc_Latn",
    "Adamawa": "adm_Latn",
    "Gur": "gur_Latn",
    "Kru": "kru_Latn",
    "Mande": "mnd_Latn",
    "Nilotic": "nil_Latn",
    "Cushitic": "cus_Latn",
    "Omotic": "omo_Latn",
    "Khoisan": "khi_Latn",
    # Sign Languages (limited support)
    "American Sign Language": "ase_Sgnw",
    "British Sign Language": "bfi_Sgnw",
    "French Sign Language": "fsl_Sgnw",
    "German Sign Language": "gsg_Sgnw",
    "Japanese Sign Language": "jsl_Sgnw",
    "Chinese Sign Language": "csl_Sgnw",
    # Historical and Classical Languages
    "Latin": "lat_Latn",
    "Ancient Greek": "grc_Grek",
    "Old Church Slavonic": "chu_Cyrl",
    "Middle English": "enm_Latn",
    "Old English": "ang_Latn",
    "Old Norse": "non_Latn",
    "Gothic": "got_Goth",
    "Aramaic": "arc_Armi",
    "Coptic": "cop_Copt",
    "Ge'ez": "gez_Ethi",
    "Akkadian": "akk_Xsux",
    "Sumerian": "sux_Xsux",
    "Hittite": "hit_Xsux",
    "Phoenician": "phn_Phnx",
    "Ugaritic": "uga_Ugar",
    "Pahlavi": "pal_Phlv",
    "Avestan": "ave_Avst",
    "Old Persian": "peo_Xpeo",
    "Sogdian": "sog_Sogd",
    "Tocharian": "txb_Latn",
    "Khotanese": "kho_Brah",
    "Gandhari": "pgd_Khar",
    "Prakrit": "prc_Brah",
    "Pali": "pli_Latn",
}
# Alphabetically sorted names for dropdown UIs.
LANGUAGE_NAMES = sorted(LANGUAGE_CODES.keys())
def extract_text_from_pdf(file_path: str) -> str:
    """Pull text out of a PDF, keeping blank-line paragraph breaks.

    Returns the extracted text, or an "Error reading PDF: ..." string on
    failure (this function never raises).
    """
    try:
        collected = []
        with open(file_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            for page in reader.pages:
                content = page.extract_text()
                if not content.strip():
                    continue
                collected.extend(
                    chunk.strip() for chunk in content.split('\n\n') if chunk.strip()
                )
        return '\n\n'.join(collected)
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {str(e)}")
        return f"Error reading PDF: {str(e)}"
def extract_text_from_docx(file_path: str) -> Tuple[str, list]:
    """Extract text and per-paragraph formatting metadata from a DOCX file.

    Returns:
        (text, formatting_info) — paragraphs joined by blank lines plus one
        formatting dict per paragraph; on failure returns an error string
        and an empty list (matching the original contract).
    """
    try:
        doc = Document(file_path)
        paragraphs = []
        formatting_info = []
        for para in doc.paragraphs:
            text = para.text.strip()
            if not text:
                continue
            paragraphs.append(text)
            # Paragraph-level layout attributes.
            para_format = {
                'alignment': para.alignment,
                'left_indent': para.paragraph_format.left_indent,
                'right_indent': para.paragraph_format.right_indent,
                'first_line_indent': para.paragraph_format.first_line_indent,
                'space_before': para.paragraph_format.space_before,
                'space_after': para.paragraph_format.space_after,
                'line_spacing': para.paragraph_format.line_spacing,
                'runs': [],
            }
            for run in para.runs:
                if not run.text.strip():
                    continue
                run_format = {
                    'text': run.text,
                    'bold': run.bold,
                    'italic': run.italic,
                    'underline': run.underline,
                    'font_name': run.font.name,
                    'font_size': run.font.size,
                    'font_color_rgb': None,
                    'font_color_theme': None,
                    'highlight_color': None,
                    'superscript': None,
                    'subscript': None,
                    'strike': None,
                    'double_strike': None,
                    'all_caps': None,
                    'small_caps': None,
                }
                # Fix: the bare `except:` clauses below previously swallowed
                # KeyboardInterrupt/SystemExit; narrowed to Exception while
                # keeping the best-effort behaviour.
                try:
                    if run.font.color and run.font.color.rgb:
                        run_format['font_color_rgb'] = run.font.color.rgb
                except Exception:
                    pass
                try:
                    if run.font.color and run.font.color.theme_color:
                        run_format['font_color_theme'] = run.font.color.theme_color
                except Exception:
                    pass
                try:
                    if run.font.highlight_color:
                        run_format['highlight_color'] = run.font.highlight_color
                except Exception:
                    pass
                try:
                    run_format['superscript'] = run.font.superscript
                    run_format['subscript'] = run.font.subscript
                    run_format['strike'] = run.font.strike
                    run_format['double_strike'] = run.font.double_strike
                    run_format['all_caps'] = run.font.all_caps
                    run_format['small_caps'] = run.font.small_caps
                except Exception:
                    pass
                para_format['runs'].append(run_format)
            formatting_info.append(para_format)
        return '\n\n'.join(paragraphs), formatting_info
    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {str(e)}")
        return f"Error reading DOCX: {str(e)}", []
def create_formatted_docx(translated_paragraphs: list, formatting_info: list, filename: str) -> str:
    """Write translated paragraphs to *filename*, re-applying the dominant
    formatting captured from the source document.

    Fix: the success log previously printed a literal "(unknown)" instead of
    the output path. Falls back to an unformatted document when anything
    goes wrong.

    Returns:
        The output filename (or the fallback writer's result on error).
    """
    def _most_common(values):
        # Majority vote over run-level values (fonts, sizes, colours).
        return max(set(values), key=values.count)

    try:
        doc = Document()
        # Drop the implicit empty first paragraph python-docx creates.
        if doc.paragraphs:
            p = doc.paragraphs[0]
            p._element.getparent().remove(p._element)
        for para_text, para_format in zip(translated_paragraphs, formatting_info):
            if not para_text.strip():
                continue
            paragraph = doc.add_paragraph()
            # Paragraph-level formatting (best effort).
            try:
                if para_format.get('alignment') is not None:
                    paragraph.alignment = para_format['alignment']
                fmt = paragraph.paragraph_format
                for attr in ('left_indent', 'right_indent', 'first_line_indent',
                             'space_before', 'space_after', 'line_spacing'):
                    value = para_format.get(attr)
                    if value is not None:
                        setattr(fmt, attr, value)
            except Exception as e:
                logger.warning(f"Could not apply paragraph formatting: {e}")
            runs_info = para_format.get('runs', [])
            if not runs_info:
                # No run metadata: plain text only.
                paragraph.add_run(para_text)
                continue
            total_runs = len(runs_info)
            run = paragraph.add_run(para_text)
            try:
                # Boolean run attributes: apply when more than half of the
                # source runs carried them (majority rule).
                for key in ('bold', 'italic', 'underline'):
                    if sum(1 for r in runs_info if r.get(key)) > total_runs / 2:
                        setattr(run, key, True)
                font_names = [r.get('font_name') for r in runs_info if r.get('font_name')]
                if font_names:
                    run.font.name = _most_common(font_names)
                font_sizes = [r.get('font_size') for r in runs_info if r.get('font_size')]
                if font_sizes:
                    run.font.size = _most_common(font_sizes)
                # RGB colour wins over theme colour, as in the original.
                rgb_colors = [r.get('font_color_rgb') for r in runs_info if r.get('font_color_rgb')]
                theme_colors = [r.get('font_color_theme') for r in runs_info if r.get('font_color_theme')]
                if rgb_colors:
                    run.font.color.rgb = _most_common(rgb_colors)
                elif theme_colors:
                    run.font.color.theme_color = _most_common(theme_colors)
                highlights = [r.get('highlight_color') for r in runs_info if r.get('highlight_color')]
                if highlights:
                    run.font.highlight_color = _most_common(highlights)
                for key in ('superscript', 'subscript', 'strike',
                            'double_strike', 'all_caps', 'small_caps'):
                    if sum(1 for r in runs_info if r.get(key)) > total_runs / 2:
                        setattr(run.font, key, True)
            except Exception as e:
                logger.warning(f"Could not apply some run formatting: {e}")
        doc.save(filename)
        logger.info(f"Created formatted DOCX with full formatting preservation: {filename}")
        return filename
    except Exception as e:
        logger.error(f"Error creating formatted DOCX: {str(e)}")
        return create_docx_with_text('\n\n'.join(translated_paragraphs), filename)
def create_docx_with_text(text: str, filename: str) -> str:
    """Save *text* to a DOCX file, one paragraph per blank-line block.

    Returns the filename on success, None on failure.
    """
    try:
        doc = Document()
        for block in text.split('\n\n'):
            if not block.strip():
                continue
            # Collapse internal newlines so each block is a single paragraph.
            doc.add_paragraph(block.replace('\n', ' ').strip())
        doc.save(filename)
        return filename
    except Exception as e:
        logger.error(f"Error creating DOCX: {str(e)}")
        return None
def translate_text_input(text: str, source_lang: str, target_lang: str, session_id: str = "") -> str:
    """Gradio handler: translate free text after validating the session."""
    if not is_authenticated(session_id):
        return "❌ Please log in to use this feature."
    if not text.strip():
        return "Please enter some text to translate."
    languages_ok = source_lang in LANGUAGE_CODES and target_lang in LANGUAGE_CODES
    if not languages_ok:
        return "Invalid language selection."
    return translator.translate_text(text, source_lang, target_lang)
| def translate_document(file, source_lang: str, target_lang: str, session_id: str = "") -> Tuple[Optional[str], str]: | |
| """Handle document translation while preserving original formatting""" | |
| if not is_authenticated(session_id): | |
| return None, "❌ Please log in to use this feature." | |
| if file is None: | |
| return None, "Please upload a document." | |
| if source_lang not in LANGUAGE_CODES or target_lang not in LANGUAGE_CODES: | |
| return None, "Invalid language selection." | |
| start_time = time.time() | |
| try: | |
| file_extension = os.path.splitext(file.name)[1].lower() | |
| formatting_info = None | |
| logger.info(f"Starting document translation: {source_lang} → {target_lang}") | |
| if file_extension == '.pdf': | |
| text = extract_text_from_pdf(file.name) | |
| elif file_extension == '.docx': | |
| text, formatting_info = extract_text_from_docx(file.name) | |
| else: | |
| return None, "Unsupported file format. Please upload PDF or DOCX files only." | |
| if text.startswith("Error"): | |
| return None, text | |
| word_count = len(text.split()) | |
| char_count = len(text) | |
| logger.info(f"Document stats: {word_count} words, {char_count} characters") | |
| # Translate the text | |
| translate_start = time.time() | |
| translated_text = translator.translate_text(text, source_lang, target_lang) | |
| translate_end = time.time() | |
| translate_duration = translate_end - translate_start | |
| logger.info(f"Core translation took: {translate_duration:.2f} seconds") | |
| # Create output file | |
| output_filename = f"translated_{os.path.splitext(os.path.basename(file.name))[0]}.docx" | |
| output_path = os.path.join(tempfile.gettempdir(), output_filename) | |
| # Create formatted output | |
| if formatting_info and file_extension == '.docx': | |
| translated_paragraphs = translated_text.split('\n\n') | |
| if len(translated_paragraphs) == len(formatting_info): | |
| create_formatted_docx(translated_paragraphs, formatting_info, output_path) | |
| else: | |
| logger.warning(f"Paragraph count mismatch, using fallback") | |
| create_docx_with_text(translated_text, output_path) | |
| else: | |
| create_docx_with_text(translated_text, output_path) | |
| # Calculate timing | |
| end_time = time.time() | |
| total_duration = end_time - start_time | |
| minutes = int(total_duration // 60) | |
| seconds = int(total_duration % 60) | |
| time_str = f"{minutes}m {seconds}s" if minutes > 0 else f"{seconds}s" | |
| # Calculate speed | |
| if word_count > 0 and total_duration > 0: | |
| words_per_minute = int((word_count / total_duration) * 60) | |
| speed_info = f" • Speed: {words_per_minute} words/min" | |
| else: | |
| speed_info = "" | |
| translation_type = "Same language processed" if source_lang == target_lang else "NLLB translation" | |
| status_message = ( | |
| f"✅ Translation completed successfully!\n" | |
| f"⏱️ Time taken: {time_str}\n" | |
| f"📄 Document: {word_count} words, {char_count} characters\n" | |
| f"🔄 Type: {translation_type}{speed_info}\n" | |
| f"📁 Original formatting preserved in output file." | |
| ) | |
| logger.info(f"Document translation completed in {total_duration:.2f} seconds") | |
| return output_path, status_message | |
| except Exception as e: | |
| end_time = time.time() | |
| total_duration = end_time - start_time | |
| minutes = int(total_duration // 60) | |
| seconds = int(total_duration % 60) | |
| time_str = f"{minutes}m {seconds}s" if minutes > 0 else f"{seconds}s" | |
| logger.error(f"Document translation error after {time_str}: {str(e)}") | |
| return None, f"❌ Error during document translation (after {time_str}): {str(e)}" | |
# Initialize translator
print("Initializing NLLB Translator...")
# Module-level singleton shared by all request handlers defined below.
translator = NLLBTranslator(model_size="600M")  # Use smaller model for stability
| with gr.Blocks(title="NLLB Universal Translator", theme=gr.themes.Soft()) as demo: | |
| session_state = gr.State("") | |
| # Login interface | |
| with gr.Column(visible=True) as login_column: | |
| gr.Markdown(""" | |
| # 🌍 NLLB Universal Translator - Authentication Required | |
| Translate between **200+ languages** using Meta's NLLB (No Language Left Behind) model. | |
| Please enter your credentials to access the translation tool. | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| pass | |
| with gr.Column(scale=2): | |
| with gr.Group(): | |
| gr.Markdown("### Login") | |
| username_input = gr.Textbox( | |
| label="Username", | |
| placeholder="Enter username", | |
| type="text" | |
| ) | |
| password_input = gr.Textbox( | |
| label="Password", | |
| placeholder="Enter password", | |
| type="password" | |
| ) | |
| login_btn = gr.Button("Login", variant="primary", size="lg") | |
| login_status = gr.Markdown("") | |
| with gr.Column(scale=1): | |
| pass | |
| gr.Markdown(""" | |
| --- | |
| **Features:** | |
| - 🔒 Secure authentication system | |
| - 🌍 Support for **200+ languages** using Meta's NLLB model | |
| - 📄 Document translation with formatting preservation | |
| - 🚀 High-quality neural machine translation | |
| - 💾 Preserves original document formatting and styling | |
| - 🗺️ Includes indigenous, regional, and low-resource languages | |
| - 📚 Historical and classical languages support | |
| """) | |
| # Main translator interface | |
| with gr.Column(visible=False) as main_column: | |
| gr.Markdown(""" | |
| # 🌍 NLLB Universal Translator | |
| Translate text and documents between **200+ languages** using Meta's NLLB model. | |
| Supports major world languages plus indigenous, regional, and low-resource languages. | |
| """) | |
| with gr.Tabs(): | |
| # Text Translation Tab | |
| with gr.TabItem("📝 Text Translation"): | |
| with gr.Row(): | |
| with gr.Column(): | |
| text_input = gr.Textbox( | |
| label="Input Text", | |
| placeholder="Enter text to translate...", | |
| lines=6 | |
| ) | |
| with gr.Row(): | |
| source_lang_text = gr.Dropdown( | |
| choices=LANGUAGE_NAMES, | |
| label="Source Language", | |
| value="English", | |
| filterable=True | |
| ) | |
| target_lang_text = gr.Dropdown( | |
| choices=LANGUAGE_NAMES, | |
| label="Target Language", | |
| value="Spanish", | |
| filterable=True | |
| ) | |
| translate_text_btn = gr.Button("🔄 Translate Text", variant="primary", size="lg") | |
| with gr.Column(): | |
| text_output = gr.Textbox( | |
| label="Translated Text", | |
| lines=6, | |
| interactive=False | |
| ) | |
| gr.Markdown(""" | |
| **Supported Languages (200+):** | |
| - 🇪🇺 **European**: English, Spanish, French, German, Italian, Russian, etc. | |
| - 🇨🇳 **East Asian**: Chinese, Japanese, Korean, Mongolian | |
| - 🇮🇳 **South Asian**: Hindi, Bengali, Tamil, Telugu, Urdu, Sanskrit, etc. | |
| - 🇸🇦 **Middle Eastern**: Arabic, Persian, Hebrew, Turkish, Kurdish | |
| - 🌍 **African**: Swahili, Yoruba, Hausa, Zulu, Amharic, Berber | |
| - 🇻🇳 **Southeast Asian**: Vietnamese, Thai, Indonesian, Filipino, Burmese | |
| - 🏝️ **Pacific**: Māori, Samoan, Hawaiian, Fijian, Tahitian | |
| - 🏛️ **Historical**: Latin, Ancient Greek, Sanskrit, Aramaic | |
| - 🗺️ **Indigenous**: Quechua, Guarani, Nahuatl, Maya, and many more | |
| - 🔤 **Regional**: Welsh, Basque, Catalan, Breton, Faroese | |
| """) | |
| # Document Translation Tab | |
| with gr.TabItem("📄 Document Translation"): | |
| with gr.Row(): | |
| with gr.Column(): | |
| file_input = gr.File( | |
| label="📁 Upload Document", | |
| file_types=[".pdf", ".docx"], | |
| type="filepath" | |
| ) | |
| with gr.Row(): | |
| source_lang_doc = gr.Dropdown( | |
| choices=LANGUAGE_NAMES, | |
| label="Source Language", | |
| value="English", | |
| filterable=True | |
| ) | |
| target_lang_doc = gr.Dropdown( | |
| choices=LANGUAGE_NAMES, | |
| label="Target Language", | |
| value="French", | |
| filterable=True | |
| ) | |
| translate_doc_btn = gr.Button("🔄 Translate Document", variant="primary", size="lg") | |
| gr.Markdown(""" | |
| **Document Features:** | |
| - 📝 Preserves original formatting | |
| - 📋 Maintains paragraph structure | |
| - 🎨 Keeps basic styling (bold, italic, underline) | |
| - 📊 Supports PDF and DOCX files | |
| - 💾 Outputs formatted DOCX file | |
| """) | |
| with gr.Column(): | |
| doc_status = gr.Textbox( | |
| label="📊 Translation Status", | |
| lines=6, | |
| interactive=False | |
| ) | |
| doc_output = gr.File( | |
| label="📥 Download Translated Document" | |
| ) | |
| # Examples | |
| gr.Examples( | |
| examples=[ | |
| ["Hello, how are you today?", "English", "Spanish"], | |
| ["Bonjour, comment allez-vous?", "French", "English"], | |
| ["你好,你今天好吗?", "Chinese (Simplified)", "English"], | |
| ["नमस्ते, आप कैसे हैं?", "Hindi", "English"], | |
| ["مرحبا، كيف حالك؟", "Arabic", "English"], | |
| ["Machine learning is transforming the world.", "English", "French"], | |
| ], | |
| inputs=[text_input, source_lang_text, target_lang_text], | |
| outputs=[text_output], | |
| fn=lambda text, src, tgt: translate_text_input(text, src, tgt, ""), | |
| cache_examples=False, | |
| label="Try these examples:" | |
| ) | |
| # Logout functionality | |
| with gr.Row(): | |
| logout_btn = gr.Button("🔓 Logout", variant="secondary", size="sm") | |
| def handle_login(username, password): | |
| success, session_id = authenticate(username, password) | |
| if success: | |
| return ( | |
| gr.Markdown("✅ **Login successful!** Welcome to the NLLB Universal Translator."), | |
| gr.Column(visible=False), | |
| gr.Column(visible=True), | |
| session_id | |
| ) | |
| else: | |
| return ( | |
| gr.Markdown("❌ **Invalid credentials.** Please check your username and password."), | |
| gr.Column(visible=True), | |
| gr.Column(visible=False), | |
| "" | |
| ) | |
| def handle_logout(session_id): | |
| if session_id: | |
| logout_session(session_id) | |
| return ( | |
| gr.Column(visible=True), | |
| gr.Column(visible=False), | |
| "", | |
| gr.Textbox(value=""), | |
| gr.Textbox(value=""), | |
| gr.Markdown("🔓 **Logged out successfully.** Please login again to continue.") | |
| ) | |
| # Event handlers | |
| login_btn.click( | |
| fn=handle_login, | |
| inputs=[username_input, password_input], | |
| outputs=[login_status, login_column, main_column, session_state] | |
| ) | |
| logout_btn.click( | |
| fn=handle_logout, | |
| inputs=[session_state], | |
| outputs=[login_column, main_column, session_state, username_input, password_input, login_status] | |
| ) | |
| translate_text_btn.click( | |
| fn=lambda text, src, tgt, session: translate_text_input(text, src, tgt, session), | |
| inputs=[text_input, source_lang_text, target_lang_text, session_state], | |
| outputs=[text_output] | |
| ) | |
| translate_doc_btn.click( | |
| fn=lambda file, src, tgt, session: translate_document(file, src, tgt, session), | |
| inputs=[file_input, source_lang_doc, target_lang_doc, session_state], | |
| outputs=[doc_output, doc_status] | |
| ) | |
print("NLLB Universal Translator initialized successfully!")
# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    # share=True requests a public gradio.live tunnel URL in addition to the
    # local server; NOTE(review): redundant on Hugging Face Spaces — confirm.
    demo.launch(share=True)