# Code v15
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import PyPDF2
from docx import Document
import tempfile
import os
import secrets
from typing import Optional, Tuple
import logging
import spaces
import time
import re

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Authentication credentials from environment variables
VALID_USERNAME = os.getenv("USERNAME", "admin")
VALID_PASSWORD = os.getenv("PASSWORD", "password123")

# Session management
authenticated_sessions = set()


def authenticate(username: str, password: str) -> tuple:
    """Authenticate user credentials and return session info"""
    if username == VALID_USERNAME and password == VALID_PASSWORD:
        # Use an unguessable random token; a timestamp + hash() scheme is
        # predictable and therefore forgeable.
        session_id = f"session_{secrets.token_hex(16)}"
        authenticated_sessions.add(session_id)
        logger.info(f"Successful login for user: {username}")
        return True, session_id
    else:
        logger.warning(f"Failed login attempt for user: {username}")
        return False, None


def is_authenticated(session_id: str) -> bool:
    """Check if session is authenticated"""
    return session_id in authenticated_sessions


def logout_session(session_id: str):
    """Remove session from authenticated sessions"""
    if session_id in authenticated_sessions:
        authenticated_sessions.remove(session_id)
        logger.info(f"Session logged out: {session_id}")


class NLLBTranslator:
    def __init__(self, model_size="600M"):
        self.model = None
        self.tokenizer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_size = model_size
        self.load_model()

    def load_model(self):
        """Load the NLLB model and tokenizer"""
        try:
            # Use the smaller, more stable model by default
            if self.model_size == "600M":
                model_name = "facebook/nllb-200-distilled-600M"
            elif self.model_size == "1.3B":
                model_name = "facebook/nllb-200-1.3B"
            else:  # 3.3B
                model_name = "facebook/nllb-200-3.3B"

            logger.info(f"Loading NLLB model: {model_name}")

            if torch.cuda.is_available():
                logger.info(f"CUDA available: {torch.cuda.get_device_name(0)}")
                torch_dtype = torch.float16
            else:
                logger.warning("CUDA not available, using CPU")
                torch_dtype = torch.float32

            # Load tokenizer
            logger.info("Loading NLLB tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)

            # Load model
            logger.info("Loading NLLB model...")
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                low_cpu_mem_usage=True
            )
            self.model = self.model.to(self.device)
            self.model.eval()

            logger.info("NLLB model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading NLLB model: {str(e)}")
            raise e

    def split_into_sentences(self, text: str) -> tuple:
        """Split text into sentences while preserving paragraph structure"""
        paragraphs = re.split(r'\n\s*\n', text)
        sentence_list = []
        paragraph_markers = []

        for para_idx, paragraph in enumerate(paragraphs):
            if not paragraph.strip():
                continue
            sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())
            for sent_idx, sentence in enumerate(sentences):
                if sentence.strip():
                    sentence_list.append(sentence.strip())
                    is_para_end = (sent_idx == len(sentences) - 1)
                    is_last_para = (para_idx == len(paragraphs) - 1)
                    paragraph_markers.append({
                        'is_paragraph_end': is_para_end and not is_last_para,
                        'original_sentence': sentence.strip()
                    })
        return sentence_list, paragraph_markers

    def reconstruct_formatting(self, translated_sentences: list, paragraph_markers: list) -> str:
        """Reconstruct text with original paragraph formatting"""
        if len(translated_sentences) != len(paragraph_markers):
            return ' '.join(translated_sentences)
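        # Stitch the pieces back together: each translated sentence is
        # followed either by the paragraph break recorded during splitting
        # or by a single space, so the output mirrors the source layout.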
        result = []
        for i, (translation, marker) in enumerate(zip(translated_sentences, paragraph_markers)):
            result.append(translation)
            if marker['is_paragraph_end']:
                result.append('\n\n')
            elif i < len(translated_sentences) - 1:
                result.append(' ')
        return ''.join(result)

    @spaces.GPU
    def translate_text(self, text: str, source_lang: str, target_lang: str) -> str:
        """Translate text from source language to target language"""
        try:
            source_code = LANGUAGE_CODES.get(source_lang)
            target_code = LANGUAGE_CODES.get(target_lang)
            if not source_code or not target_code:
                return f"Unsupported language: {source_lang} or {target_lang}"
            if source_lang == target_lang:
                return text

            logger.info(f"Translating from {source_lang} ({source_code}) to {target_lang} ({target_code})")

            # For simple test, try a direct approach first
            if text.strip() == "Hello, how are you today?":
                logger.info("Using simple test translation")
                return self.simple_translate(text, source_code, target_code)

            # Check if simple or complex text
            if '\n' not in text and len(text.split('.')) <= 2:
                input_sentences = [text.strip()]
                paragraph_markers = None
            else:
                input_sentences, paragraph_markers = self.split_into_sentences(text)

            if not input_sentences:
                return "No valid text found to translate."

            return self.perform_translation(input_sentences, source_code, target_code, paragraph_markers)
        except Exception as e:
            logger.error(f"Translation error: {str(e)}")
            import traceback
            traceback.print_exc()
            return f"Error during translation: {str(e)}"

    def simple_translate(self, text: str, source_code: str, target_code: str) -> str:
        """Simple translation method for testing"""
        try:
            # Set source language
            self.tokenizer.src_lang = source_code

            # Tokenize
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(self.device)

            # Generate without forced language token to avoid tokenizer issues
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_length=512,
                    num_beams=5,
                    early_stopping=True,
                    do_sample=False,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            # Decode
            translation = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            logger.info(f"Simple translation result: {translation}")
            return translation.strip() if translation.strip() else "Translation produced empty result"
        except Exception as e:
            logger.error(f"Simple translation failed: {str(e)}")
            return f"Simple translation failed: {str(e)}"
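    # NLLB selects the output language by forcing the target language token
    # to be the first generated token (forced_bos_token_id). The lookup in
    # perform_translation below is defensive because the tokenizer API has
    # shifted across transformers releases: older versions expose
    # lang_code_to_id, newer ones only convert_tokens_to_ids / get_vocab.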
    def perform_translation(self, input_sentences: list, source_code: str, target_code: str, paragraph_markers: list) -> str:
        """Perform the actual translation using NLLB model"""
        batch_size = 2  # Conservative batch size for stability

        # For very long sentences, use single processing
        avg_sentence_length = sum(len(s.split()) for s in input_sentences) / len(input_sentences) if input_sentences else 0
        if avg_sentence_length > 100:
            batch_size = 1

        logger.info(f"Using batch size {batch_size} for average sentence length {avg_sentence_length:.1f} words")
        logger.info(f"Translating from {source_code} to {target_code}")

        all_translations = []
        for i in range(0, len(input_sentences), batch_size):
            batch_sentences = input_sentences[i:i + batch_size]
            try:
                # Tokenize input with source language
                self.tokenizer.src_lang = source_code
                inputs = self.tokenizer(
                    batch_sentences,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=512
                ).to(self.device)

                # Get target language token ID using different methods
                target_token_id = None
                try:
                    # Method 1: Try lang_code_to_id
                    if hasattr(self.tokenizer, 'lang_code_to_id'):
                        target_token_id = self.tokenizer.lang_code_to_id[target_code]
                    # Method 2: Try convert_tokens_to_ids
                    elif hasattr(self.tokenizer, 'convert_tokens_to_ids'):
                        target_token_id = self.tokenizer.convert_tokens_to_ids(target_code)
                    # Method 3: Try getting from vocabulary
                    else:
                        target_token_id = self.tokenizer.get_vocab().get(target_code)
                except (KeyError, AttributeError):
                    logger.warning(f"Could not find target language token for {target_code}")
                    target_token_id = None

                # Generate translation
                generation_kwargs = {
                    "max_length": 512,
                    "num_beams": 4,
                    "early_stopping": True,
                    "do_sample": False,
                    "pad_token_id": self.tokenizer.pad_token_id,
                    "eos_token_id": self.tokenizer.eos_token_id
                }
                # Only add forced_bos_token_id if we found a valid target token
                if target_token_id is not None:
                    generation_kwargs["forced_bos_token_id"] = target_token_id

                with torch.no_grad():
                    translated_tokens = self.model.generate(**inputs, **generation_kwargs)

                # Decode translations
                translations = self.tokenizer.batch_decode(
                    translated_tokens,
                    skip_special_tokens=True
                )

                # Clean up translations
                cleaned_translations = []
                for trans in translations:
                    cleaned = trans.strip()
                    if cleaned:
                        cleaned_translations.append(cleaned)
                    else:
                        cleaned_translations.append("Translation produced empty result")
                all_translations.extend(cleaned_translations)

                # Progress logging
                if len(input_sentences) > 10:
                    progress = min(100, int(((i + batch_size) / len(input_sentences)) * 100))
                    logger.info(f"Translation progress: {progress}%")
            except Exception as e:
                logger.error(f"Translation error in batch: {str(e)}")
                # Fallback: process sentences individually with simpler approach
                for single_sentence in batch_sentences:
                    try:
                        # Set source language
                        self.tokenizer.src_lang = source_code
                        inputs = self.tokenizer(
                            single_sentence,
                            return_tensors="pt",
                            truncation=True,
                            max_length=512
                        ).to(self.device)

                        # Use simple generation without forced language tokens
                        with torch.no_grad():
                            translated_tokens = self.model.generate(
                                **inputs,
                                max_length=512,
                                num_beams=2,
                                early_stopping=True,
                                do_sample=False,
                                pad_token_id=self.tokenizer.pad_token_id,
                                eos_token_id=self.tokenizer.eos_token_id
                            )

                        translation = self.tokenizer.decode(
                            translated_tokens[0],
                            skip_special_tokens=True
                        )

                        # Clean the translation
                        cleaned_translation = translation.strip()
                        if cleaned_translation:
                            all_translations.append(cleaned_translation)
                        else:
                            all_translations.append("Empty translation result")
                    except Exception as single_e:
                        logger.error(f"Failed to translate sentence '{single_sentence}': {str(single_e)}")
                        all_translations.append(f"Translation failed: {str(single_e)}")

        # Reconstruct formatting
        if paragraph_markers and len(all_translations) == len(paragraph_markers):
            final_translation = self.reconstruct_formatting(all_translations, paragraph_markers)
        else:
            final_translation = ' '.join(all_translations) if all_translations else "Translation failed - no output generated"

        return final_translation


# NLLB-style language codes. This is a broad list: not every entry below is
# actually covered by the public NLLB-200 checkpoints.
LANGUAGE_CODES = {
    # Major European Languages
    "English": "eng_Latn", "French": "fra_Latn", "German": "deu_Latn",
    "Spanish": "spa_Latn", "Italian": "ita_Latn", "Portuguese": "por_Latn",
    "Russian": "rus_Cyrl", "Dutch": "nld_Latn", "Polish": "pol_Latn",
    "Czech": "ces_Latn", "Swedish": "swe_Latn", "Danish": "dan_Latn",
    "Norwegian": "nob_Latn", "Finnish": "fin_Latn", "Greek": "ell_Grek",
    "Hungarian": "hun_Latn", "Romanian": "ron_Latn", "Bulgarian": "bul_Cyrl",
    "Croatian": "hrv_Latn", "Slovak": "slk_Latn", "Ukrainian": "ukr_Cyrl",
    "Belarusian": "bel_Cyrl", "Serbian": "srp_Cyrl",
"srp_Cyrl", "Slovenian": "slv_Latn", "Estonian": "est_Latn", "Latvian": "lav_Latn", "Lithuanian": "lit_Latn", "Macedonian": "mkd_Cyrl", "Albanian": "als_Latn", "Bosnian": "bos_Latn", "Montenegrin": "cnr_Latn", "Maltese": "mlt_Latn", "Luxembourgish": "ltz_Latn", # Asian Languages - East Asian "Chinese (Simplified)": "zho_Hans", "Chinese (Traditional)": "zho_Hant", "Japanese": "jpn_Jpan", "Korean": "kor_Hang", "Mongolian": "khk_Cyrl", # Asian Languages - Southeast Asian "Vietnamese": "vie_Latn", "Thai": "tha_Thai", "Indonesian": "ind_Latn", "Malay": "zsm_Latn", "Filipino": "fil_Latn", "Tagalog": "tgl_Latn", "Javanese": "jav_Latn", "Sundanese": "sun_Latn", "Burmese": "mya_Mymr", "Khmer": "khm_Khmr", "Lao": "lao_Laoo", "Cebuano": "ceb_Latn", "Minangkabau": "min_Latn", "Acehnese": "ace_Latn", "Balinese": "ban_Latn", "Banjarese": "bjn_Latn", "Bugis": "bug_Latn", "Madurese": "mad_Latn", # Asian Languages - South Asian "Hindi": "hin_Deva", "Bengali": "ben_Beng", "Tamil": "tam_Taml", "Telugu": "tel_Telu", "Marathi": "mar_Deva", "Gujarati": "guj_Gujr", "Kannada": "kan_Knda", "Malayalam": "mal_Mlym", "Punjabi": "pan_Guru", "Urdu": "urd_Arab", "Nepali": "nep_Deva", "Sinhala": "sin_Sinh", "Assamese": "asm_Beng", "Oriya": "ory_Orya", "Sanskrit": "san_Deva", "Kashmiri": "kas_Arab", "Sindhi": "snd_Arab", "Maithili": "mai_Deva", "Santali": "sat_Olck", "Manipuri": "mni_Beng", "Bodo": "brx_Deva", "Dogri": "doi_Deva", "Konkani": "gom_Deva", # Middle Eastern Languages "Arabic": "arb_Arab", "Hebrew": "heb_Hebr", "Persian": "pes_Arab", "Turkish": "tur_Latn", "Kurdish": "ckb_Arab", "Pashto": "pbt_Arab", "Dari": "prs_Arab", "Azerbaijani": "azj_Latn", "Kazakh": "kaz_Cyrl", "Kyrgyz": "kir_Cyrl", "Uzbek": "uzn_Latn", "Tajik": "tgk_Cyrl", "Turkmen": "tuk_Latn", "Uighur": "uig_Arab", "Armenian": "hye_Armn", "Georgian": "kat_Geor", "Amharic": "amh_Ethi", "Tigrinya": "tir_Ethi", "Oromo": "orm_Ethi", # African Languages "Swahili": "swh_Latn", "Yoruba": "yor_Latn", "Igbo": "ibo_Latn", "Hausa": "hau_Latn", "Zulu": "zul_Latn", "Xhosa": "xho_Latn", "Afrikaans": "afr_Latn", "Somali": "som_Latn", "Shona": "sna_Latn", "Kinyarwanda": "kin_Latn", "Rundi": "run_Latn", "Chichewa": "nya_Latn", "Luganda": "lug_Latn", "Wolof": "wol_Latn", "Fula": "fuv_Latn", "Twi": "twi_Latn", "Lingala": "lin_Latn", "Bambara": "bam_Latn", "Mossi": "mos_Latn", "Ewe": "ewe_Latn", "Akan": "aka_Latn", "Malagasy": "plt_Latn", "Sesotho": "sot_Latn", "Tswana": "tsn_Latn", "Venda": "ven_Latn", "Tsonga": "tso_Latn", "Ndebele": "nso_Latn", "Swati": "ssw_Latn", # European Celtic & Regional Languages "Welsh": "cym_Latn", "Irish": "gle_Latn", "Scottish Gaelic": "gla_Latn", "Breton": "bre_Latn", "Cornish": "cor_Latn", "Manx": "glv_Latn", "Basque": "eus_Latn", "Catalan": "cat_Latn", "Galician": "glg_Latn", "Occitan": "oci_Latn", "Sardinian": "srd_Latn", "Corsican": "cos_Latn", "Faroese": "fao_Latn", "Icelandic": "isl_Latn", "Frisian": "fry_Latn", "Kashubian": "csb_Latn", "Sorbian": "hsb_Latn", "Romansh": "roh_Latn", # Americas Indigenous Languages "Quechua": "quy_Latn", "Guarani": "grn_Latn", "Aymara": "ayr_Latn", "Nahuatl": "nah_Latn", "Maya": "mam_Latn", "Wayuu": "guc_Latn", "Otomi": "oto_Latn", "Zapotec": "zap_Latn", "Mixe": "mie_Latn", "Tzeltal": "tzh_Latn", "Tzotzil": "tzo_Latn", "Tarahumara": "tar_Latn", "Huichol": "hch_Latn", "Mazatec": "maz_Latn", "Chatino": "ctp_Latn", "Chinantec": "chq_Latn", "Mixtec": "mxt_Latn", "Triqui": "trc_Latn", "Mazahua": "maz_Latn", "Purépecha": "tsz_Latn", "Totonac": "top_Latn", "Huastec": "hus_Latn", "Zoque": "zos_Latn", "Chol": 
"ctu_Latn", "Mam": "mam_Latn", "Kʼicheʼ": "quc_Latn", "Kaqchikel": "cak_Latn", "Achuar": "acu_Latn", "Shuar": "jiv_Latn", "Awajún": "agr_Latn", "Shipibo": "shp_Latn", "Asháninka": "cni_Latn", # Pacific Languages "Māori": "mri_Latn", "Samoan": "smo_Latn", "Tongan": "ton_Latn", "Fijian": "fij_Latn", "Hawaiian": "haw_Latn", "Tahitian": "tah_Latn", "Chamorro": "cha_Latn", "Palauan": "pau_Latn", "Marshallese": "mah_Latn", "Chuukese": "chk_Latn", "Kosraean": "kos_Latn", "Pohnpeian": "pon_Latn", "Yapese": "yap_Latn", # Additional Asian Languages "Tibetan": "bod_Tibt", "Dzongkha": "dzo_Tibt", "Ladakhi": "lbj_Tibt", "Sherpa": "xsr_Deva", "Newari": "new_Deva", "Maithili": "mai_Deva", "Bhojpuri": "bho_Deva", "Magahi": "mag_Deva", "Angika": "anp_Deva", "Bajjika": "bpy_Beng", "Chittagonian": "ctg_Beng", "Sylheti": "syl_Beng", "Rohingya": "rhg_Arab", "Meitei": "mni_Beng", "Tripuri": "trp_Latn", "Garo": "grt_Beng", "Kokborok": "trp_Latn", "Mizo": "lus_Latn", "Nagamese": "nag_Latn", "Khasi": "kha_Latn", "Balochi": "bal_Arab", "Brahui": "brh_Arab", "Burushaski": "bsk_Arab", "Gilgiti": "shx_Arab", "Hindko": "hno_Arab", "Pahari": "phr_Deva", "Garhwali": "gbm_Deva", "Kumaoni": "kfy_Deva", # Additional African Languages "Berber": "ber_Latn", "Tamazight": "tzm_Latn", "Kabyle": "kab_Latn", "Tuareg": "taq_Latn", "Nuer": "nus_Latn", "Dinka": "din_Latn", "Kanuri": "knc_Latn", "Tiv": "tiv_Latn", "Efik": "efi_Latn", "Ibibio": "ibb_Latn", "Annang": "anw_Latn", "Ijaw": "ijc_Latn", "Urhobo": "urh_Latn", "Edo": "bin_Latn", "Igala": "igl_Latn", "Idoma": "idu_Latn", "Berom": "bom_Latn", "Gbagyi": "gbr_Latn", "Nupe": "nup_Latn", "Jukun": "jbu_Latn", "Chadic": "cdc_Latn", "Adamawa": "adm_Latn", "Gur": "gur_Latn", "Kru": "kru_Latn", "Mande": "mnd_Latn", "Nilotic": "nil_Latn", "Cushitic": "cus_Latn", "Omotic": "omo_Latn", "Khoisan": "khi_Latn", # Sign Languages (limited support) "American Sign Language": "ase_Sgnw", "British Sign Language": "bfi_Sgnw", "French Sign Language": "fsl_Sgnw", "German Sign Language": "gsg_Sgnw", "Japanese Sign Language": "jsl_Sgnw", "Chinese Sign Language": "csl_Sgnw", # Historical and Classical Languages "Latin": "lat_Latn", "Ancient Greek": "grc_Grek", "Old Church Slavonic": "chu_Cyrl", "Middle English": "enm_Latn", "Old English": "ang_Latn", "Old Norse": "non_Latn", "Gothic": "got_Goth", "Aramaic": "arc_Armi", "Coptic": "cop_Copt", "Ge'ez": "gez_Ethi", "Akkadian": "akk_Xsux", "Sumerian": "sux_Xsux", "Hittite": "hit_Xsux", "Phoenician": "phn_Phnx", "Ugaritic": "uga_Ugar", "Pahlavi": "pal_Phlv", "Avestan": "ave_Avst", "Old Persian": "peo_Xpeo", "Sogdian": "sog_Sogd", "Tocharian": "txb_Latn", "Khotanese": "kho_Brah", "Gandhari": "pgd_Khar", "Prakrit": "prc_Brah", "Pali": "pli_Latn", } # Create a sorted list for better UI LANGUAGE_NAMES = sorted(LANGUAGE_CODES.keys()) def extract_text_from_pdf(file_path: str) -> str: """Extract text from PDF file while preserving paragraph structure""" try: with open(file_path, 'rb') as file: pdf_reader = PyPDF2.PdfReader(file) paragraphs = [] for page in pdf_reader.pages: page_text = page.extract_text() if page_text.strip(): page_paragraphs = [p.strip() for p in page_text.split('\n\n') if p.strip()] paragraphs.extend(page_paragraphs) return '\n\n'.join(paragraphs) except Exception as e: logger.error(f"Error extracting text from PDF: {str(e)}") return f"Error reading PDF: {str(e)}" def extract_text_from_docx(file_path: str) -> Tuple[str, list]: """Extract text from DOCX file while preserving paragraph structure and formatting info""" try: doc = Document(file_path) 
        paragraphs = []
        formatting_info = []
        for para in doc.paragraphs:
            text = para.text.strip()
            if text:
                paragraphs.append(text)
                # Store comprehensive paragraph formatting
                para_format = {
                    'alignment': para.alignment,
                    'left_indent': para.paragraph_format.left_indent,
                    'right_indent': para.paragraph_format.right_indent,
                    'first_line_indent': para.paragraph_format.first_line_indent,
                    'space_before': para.paragraph_format.space_before,
                    'space_after': para.paragraph_format.space_after,
                    'line_spacing': para.paragraph_format.line_spacing,
                    'runs': []
                }
                # Store detailed run-level formatting
                for run in para.runs:
                    if run.text.strip():
                        run_format = {
                            'text': run.text,
                            'bold': run.bold,
                            'italic': run.italic,
                            'underline': run.underline,
                            'font_name': run.font.name,
                            'font_size': run.font.size,
                            'font_color_rgb': None,
                            'font_color_theme': None,
                            'highlight_color': None,
                            'superscript': None,
                            'subscript': None,
                            'strike': None,
                            'double_strike': None,
                            'all_caps': None,
                            'small_caps': None
                        }
                        # Get font color (RGB)
                        try:
                            if run.font.color and run.font.color.rgb:
                                run_format['font_color_rgb'] = run.font.color.rgb
                        except Exception:
                            pass
                        # Get font color (theme color)
                        try:
                            if run.font.color and run.font.color.theme_color:
                                run_format['font_color_theme'] = run.font.color.theme_color
                        except Exception:
                            pass
                        # Get highlight color
                        try:
                            if run.font.highlight_color:
                                run_format['highlight_color'] = run.font.highlight_color
                        except Exception:
                            pass
                        # Get additional formatting
                        try:
                            run_format['superscript'] = run.font.superscript
                            run_format['subscript'] = run.font.subscript
                            run_format['strike'] = run.font.strike
                            run_format['double_strike'] = run.font.double_strike
                            run_format['all_caps'] = run.font.all_caps
                            run_format['small_caps'] = run.font.small_caps
                        except Exception:
                            pass
                        para_format['runs'].append(run_format)
                formatting_info.append(para_format)

        text = '\n\n'.join(paragraphs)
        return text, formatting_info
    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {str(e)}")
        return f"Error reading DOCX: {str(e)}", []


def create_formatted_docx(translated_paragraphs: list, formatting_info: list, filename: str) -> str:
    """Create a DOCX file with translated text while preserving original formatting"""
    try:
        doc = Document()
        # Remove default paragraph
        if doc.paragraphs:
            p = doc.paragraphs[0]
            p._element.getparent().remove(p._element)

        for i, (para_text, para_format) in enumerate(zip(translated_paragraphs, formatting_info)):
            if not para_text.strip():
                continue
            paragraph = doc.add_paragraph()

            # Apply paragraph-level formatting
            try:
                if para_format.get('alignment') is not None:
                    paragraph.alignment = para_format['alignment']
                if para_format.get('left_indent') is not None:
                    paragraph.paragraph_format.left_indent = para_format['left_indent']
                if para_format.get('right_indent') is not None:
                    paragraph.paragraph_format.right_indent = para_format['right_indent']
                if para_format.get('first_line_indent') is not None:
                    paragraph.paragraph_format.first_line_indent = para_format['first_line_indent']
                if para_format.get('space_before') is not None:
                    paragraph.paragraph_format.space_before = para_format['space_before']
                if para_format.get('space_after') is not None:
                    paragraph.paragraph_format.space_after = para_format['space_after']
                if para_format.get('line_spacing') is not None:
                    paragraph.paragraph_format.line_spacing = para_format['line_spacing']
            except Exception as e:
                logger.warning(f"Could not apply paragraph formatting: {e}")

            # Apply run-level formatting with full preservation
            runs_info = para_format.get('runs', [])
            if runs_info:
                # Analyze the dominant formatting for the paragraph
                total_runs = len(runs_info)
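                # Translation changes word boundaries, so the original runs
                # cannot be mapped 1:1 onto the translated text. Instead, the
                # majority formatting across the paragraph's runs is applied
                # to a single run holding the whole translated paragraph.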
                # Count formatting occurrences
                bold_count = sum(1 for r in runs_info if r.get('bold'))
                italic_count = sum(1 for r in runs_info if r.get('italic'))
                underline_count = sum(1 for r in runs_info if r.get('underline'))

                # Get most common formatting values
                font_names = [r.get('font_name') for r in runs_info if r.get('font_name')]
                font_sizes = [r.get('font_size') for r in runs_info if r.get('font_size')]
                font_colors_rgb = [r.get('font_color_rgb') for r in runs_info if r.get('font_color_rgb')]
                font_colors_theme = [r.get('font_color_theme') for r in runs_info if r.get('font_color_theme')]
                highlight_colors = [r.get('highlight_color') for r in runs_info if r.get('highlight_color')]

                # Create run with translated text
                run = paragraph.add_run(para_text)
                try:
                    # Apply basic formatting (use majority rule)
                    if bold_count > total_runs / 2:
                        run.bold = True
                    if italic_count > total_runs / 2:
                        run.italic = True
                    if underline_count > total_runs / 2:
                        run.underline = True

                    # Apply font name (most common)
                    if font_names:
                        most_common_font = max(set(font_names), key=font_names.count)
                        run.font.name = most_common_font
                    # Apply font size (most common)
                    if font_sizes:
                        most_common_size = max(set(font_sizes), key=font_sizes.count)
                        run.font.size = most_common_size
                    # Apply font color (RGB - most common)
                    if font_colors_rgb:
                        most_common_color = max(set(font_colors_rgb), key=font_colors_rgb.count)
                        run.font.color.rgb = most_common_color
                    # Apply font color (theme - most common)
                    elif font_colors_theme:
                        most_common_theme = max(set(font_colors_theme), key=font_colors_theme.count)
                        run.font.color.theme_color = most_common_theme
                    # Apply highlight color (most common)
                    if highlight_colors:
                        most_common_highlight = max(set(highlight_colors), key=highlight_colors.count)
                        run.font.highlight_color = most_common_highlight

                    # Apply additional formatting if majority of runs have it
                    superscript_count = sum(1 for r in runs_info if r.get('superscript'))
                    subscript_count = sum(1 for r in runs_info if r.get('subscript'))
                    strike_count = sum(1 for r in runs_info if r.get('strike'))
                    double_strike_count = sum(1 for r in runs_info if r.get('double_strike'))
                    all_caps_count = sum(1 for r in runs_info if r.get('all_caps'))
                    small_caps_count = sum(1 for r in runs_info if r.get('small_caps'))

                    if superscript_count > total_runs / 2:
                        run.font.superscript = True
                    if subscript_count > total_runs / 2:
                        run.font.subscript = True
                    if strike_count > total_runs / 2:
                        run.font.strike = True
                    if double_strike_count > total_runs / 2:
                        run.font.double_strike = True
                    if all_caps_count > total_runs / 2:
                        run.font.all_caps = True
                    if small_caps_count > total_runs / 2:
                        run.font.small_caps = True
                except Exception as e:
                    logger.warning(f"Could not apply some run formatting: {e}")
            else:
                # No run formatting info, just add the text
                paragraph.add_run(para_text)

        doc.save(filename)
        logger.info(f"Created formatted DOCX with full formatting preservation: {filename}")
        return filename
    except Exception as e:
        logger.error(f"Error creating formatted DOCX: {str(e)}")
        return create_docx_with_text('\n\n'.join(translated_paragraphs), filename)


def create_docx_with_text(text: str, filename: str) -> str:
    """Create a DOCX file with the given text"""
    try:
        doc = Document()
        paragraphs = text.split('\n\n')
        for para_text in paragraphs:
            if para_text.strip():
                cleaned_text = para_text.replace('\n', ' ').strip()
                doc.add_paragraph(cleaned_text)
        doc.save(filename)
        return filename
    except Exception as e:
        logger.error(f"Error creating DOCX: {str(e)}")
        return None
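# @spaces.GPU is the Hugging Face ZeroGPU decorator: on Spaces hardware it
# attaches a GPU for the duration of each decorated call and releases it
# afterwards; outside Spaces it is effectively a no-op.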
@spaces.GPU
def translate_text_input(text: str, source_lang: str, target_lang: str, session_id: str = "") -> str:
    """Handle text input translation"""
    if not is_authenticated(session_id):
        return "❌ Please log in to use this feature."
    if not text.strip():
        return "Please enter some text to translate."
    if source_lang not in LANGUAGE_CODES or target_lang not in LANGUAGE_CODES:
        return "Invalid language selection."
    return translator.translate_text(text, source_lang, target_lang)


@spaces.GPU
def translate_document(file, source_lang: str, target_lang: str, session_id: str = "") -> Tuple[Optional[str], str]:
    """Handle document translation while preserving original formatting"""
    if not is_authenticated(session_id):
        return None, "❌ Please log in to use this feature."
    if file is None:
        return None, "Please upload a document."
    if source_lang not in LANGUAGE_CODES or target_lang not in LANGUAGE_CODES:
        return None, "Invalid language selection."

    start_time = time.time()
    try:
        file_extension = os.path.splitext(file.name)[1].lower()
        formatting_info = None
        logger.info(f"Starting document translation: {source_lang} → {target_lang}")

        if file_extension == '.pdf':
            text = extract_text_from_pdf(file.name)
        elif file_extension == '.docx':
            text, formatting_info = extract_text_from_docx(file.name)
        else:
            return None, "Unsupported file format. Please upload PDF or DOCX files only."

        if text.startswith("Error"):
            return None, text

        word_count = len(text.split())
        char_count = len(text)
        logger.info(f"Document stats: {word_count} words, {char_count} characters")

        # Translate the text
        translate_start = time.time()
        translated_text = translator.translate_text(text, source_lang, target_lang)
        translate_end = time.time()
        translate_duration = translate_end - translate_start
        logger.info(f"Core translation took: {translate_duration:.2f} seconds")

        # Create output file
        output_filename = f"translated_{os.path.splitext(os.path.basename(file.name))[0]}.docx"
        output_path = os.path.join(tempfile.gettempdir(), output_filename)

        # Create formatted output
        if formatting_info and file_extension == '.docx':
            translated_paragraphs = translated_text.split('\n\n')
            if len(translated_paragraphs) == len(formatting_info):
                create_formatted_docx(translated_paragraphs, formatting_info, output_path)
            else:
                logger.warning("Paragraph count mismatch, using fallback")
                create_docx_with_text(translated_text, output_path)
        else:
            create_docx_with_text(translated_text, output_path)

        # Calculate timing
        end_time = time.time()
        total_duration = end_time - start_time
        minutes = int(total_duration // 60)
        seconds = int(total_duration % 60)
        time_str = f"{minutes}m {seconds}s" if minutes > 0 else f"{seconds}s"

        # Calculate speed
        if word_count > 0 and total_duration > 0:
            words_per_minute = int((word_count / total_duration) * 60)
            speed_info = f" • Speed: {words_per_minute} words/min"
        else:
            speed_info = ""

        translation_type = "Same language processed" if source_lang == target_lang else "NLLB translation"
        status_message = (
            f"✅ Translation completed successfully!\n"
            f"⏱️ Time taken: {time_str}\n"
            f"📄 Document: {word_count} words, {char_count} characters\n"
            f"🔄 Type: {translation_type}{speed_info}\n"
            f"📁 Original formatting preserved in output file."
        )
        logger.info(f"Document translation completed in {total_duration:.2f} seconds")
        return output_path, status_message
    except Exception as e:
        end_time = time.time()
        total_duration = end_time - start_time
        minutes = int(total_duration // 60)
        seconds = int(total_duration % 60)
        time_str = f"{minutes}m {seconds}s" if minutes > 0 else f"{seconds}s"
        logger.error(f"Document translation error after {time_str}: {str(e)}")
        return None, f"❌ Error during document translation (after {time_str}): {str(e)}"
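# The model loads once at import time: startup pays the full download and
# initialization cost up front, and every later request reuses the warm model.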
# Initialize translator
print("Initializing NLLB Translator...")
translator = NLLBTranslator(model_size="600M")  # Use smaller model for stability

# Create the Gradio app
with gr.Blocks(title="NLLB Universal Translator", theme=gr.themes.Soft()) as demo:
    session_state = gr.State("")

    # Login interface
    with gr.Column(visible=True) as login_column:
        gr.Markdown("""
        # 🌍 NLLB Universal Translator - Authentication Required

        Translate between **200+ languages** using Meta's NLLB (No Language Left Behind) model.
        Please enter your credentials to access the translation tool.
        """)
        with gr.Row():
            with gr.Column(scale=1):
                pass
            with gr.Column(scale=2):
                with gr.Group():
                    gr.Markdown("### Login")
                    username_input = gr.Textbox(
                        label="Username",
                        placeholder="Enter username",
                        type="text"
                    )
                    password_input = gr.Textbox(
                        label="Password",
                        placeholder="Enter password",
                        type="password"
                    )
                    login_btn = gr.Button("Login", variant="primary", size="lg")
                    login_status = gr.Markdown("")
            with gr.Column(scale=1):
                pass
        gr.Markdown("""
        ---
        **Features:**
        - 🔒 Secure authentication system
        - 🌍 Support for **200+ languages** using Meta's NLLB model
        - 📄 Document translation with formatting preservation
        - 🚀 High-quality neural machine translation
        - 💾 Preserves original document formatting and styling
        - 🗺️ Includes indigenous, regional, and low-resource languages
        - 📚 Historical and classical languages support
        """)

    # Main translator interface
    with gr.Column(visible=False) as main_column:
        gr.Markdown("""
        # 🌍 NLLB Universal Translator

        Translate text and documents between **200+ languages** using Meta's NLLB model.
        Supports major world languages plus indigenous, regional, and low-resource languages.
        """)
        with gr.Tabs():
            # Text Translation Tab
            with gr.TabItem("📝 Text Translation"):
                with gr.Row():
                    with gr.Column():
                        text_input = gr.Textbox(
                            label="Input Text",
                            placeholder="Enter text to translate...",
                            lines=6
                        )
                        with gr.Row():
                            source_lang_text = gr.Dropdown(
                                choices=LANGUAGE_NAMES,
                                label="Source Language",
                                value="English",
                                filterable=True
                            )
                            target_lang_text = gr.Dropdown(
                                choices=LANGUAGE_NAMES,
                                label="Target Language",
                                value="Spanish",
                                filterable=True
                            )
                        translate_text_btn = gr.Button("🔄 Translate Text", variant="primary", size="lg")
                    with gr.Column():
                        text_output = gr.Textbox(
                            label="Translated Text",
                            lines=6,
                            interactive=False
                        )
                        gr.Markdown("""
                        **Supported Languages (200+):**
                        - 🇪🇺 **European**: English, Spanish, French, German, Italian, Russian, etc.
                        - 🇨🇳 **East Asian**: Chinese, Japanese, Korean, Mongolian
                        - 🇮🇳 **South Asian**: Hindi, Bengali, Tamil, Telugu, Urdu, Sanskrit, etc.
                        - 🇸🇦 **Middle Eastern**: Arabic, Persian, Hebrew, Turkish, Kurdish
                        - 🌍 **African**: Swahili, Yoruba, Hausa, Zulu, Amharic, Berber
                        - 🇻🇳 **Southeast Asian**: Vietnamese, Thai, Indonesian, Filipino, Burmese
                        - 🏝️ **Pacific**: Māori, Samoan, Hawaiian, Fijian, Tahitian
                        - 🏛️ **Historical**: Latin, Ancient Greek, Sanskrit, Aramaic
                        - 🗺️ **Indigenous**: Quechua, Guarani, Nahuatl, Maya, and many more
                        - 🔤 **Regional**: Welsh, Basque, Catalan, Breton, Faroese
                        """)

            # Document Translation Tab
            with gr.TabItem("📄 Document Translation"):
                with gr.Row():
                    with gr.Column():
                        file_input = gr.File(
                            label="📁 Upload Document",
                            file_types=[".pdf", ".docx"],
                            type="filepath"
                        )
                        with gr.Row():
                            source_lang_doc = gr.Dropdown(
                                choices=LANGUAGE_NAMES,
                                label="Source Language",
                                value="English",
                                filterable=True
                            )
                            target_lang_doc = gr.Dropdown(
                                choices=LANGUAGE_NAMES,
                                label="Target Language",
                                value="French",
                                filterable=True
                            )
                        translate_doc_btn = gr.Button("🔄 Translate Document", variant="primary", size="lg")
                        gr.Markdown("""
                        **Document Features:**
                        - 📝 Preserves original formatting
                        - 📋 Maintains paragraph structure
                        - 🎨 Keeps basic styling (bold, italic, underline)
                        - 📊 Supports PDF and DOCX files
                        - 💾 Outputs formatted DOCX file
                        """)
                    with gr.Column():
                        doc_status = gr.Textbox(
                            label="📊 Translation Status",
                            lines=6,
                            interactive=False
                        )
                        doc_output = gr.File(
                            label="📥 Download Translated Document"
                        )

        # Examples
        gr.Examples(
            examples=[
                ["Hello, how are you today?", "English", "Spanish"],
                ["Bonjour, comment allez-vous?", "French", "English"],
                ["你好,你今天好吗?", "Chinese (Simplified)", "English"],
                ["नमस्ते, आप कैसे हैं?", "Hindi", "English"],
                ["مرحبا، كيف حالك؟", "Arabic", "English"],
                ["Machine learning is transforming the world.", "English", "French"],
            ],
            inputs=[text_input, source_lang_text, target_lang_text],
            outputs=[text_output],
            fn=lambda text, src, tgt: translate_text_input(text, src, tgt, ""),
            cache_examples=False,
            label="Try these examples:"
        )

        # Logout functionality
        with gr.Row():
            logout_btn = gr.Button("🔓 Logout", variant="secondary", size="sm")

    def handle_login(username, password):
        success, session_id = authenticate(username, password)
        if success:
            return (
                gr.Markdown("✅ **Login successful!** Welcome to the NLLB Universal Translator."),
                gr.Column(visible=False),
                gr.Column(visible=True),
                session_id
            )
        else:
            return (
                gr.Markdown("❌ **Invalid credentials.** Please check your username and password."),
                gr.Column(visible=True),
                gr.Column(visible=False),
                ""
            )

    def handle_logout(session_id):
        if session_id:
            logout_session(session_id)
        return (
            gr.Column(visible=True),
            gr.Column(visible=False),
            "",
            gr.Textbox(value=""),
            gr.Textbox(value=""),
            gr.Markdown("🔓 **Logged out successfully.** Please login again to continue.")
        )

    # Event handlers
    login_btn.click(
        fn=handle_login,
        inputs=[username_input, password_input],
        outputs=[login_status, login_column, main_column, session_state]
    )
    logout_btn.click(
        fn=handle_logout,
        inputs=[session_state],
        outputs=[login_column, main_column, session_state, username_input, password_input, login_status]
    )
    translate_text_btn.click(
        fn=lambda text, src, tgt, session: translate_text_input(text, src, tgt, session),
        inputs=[text_input, source_lang_text, target_lang_text, session_state],
        outputs=[text_output]
    )
    translate_doc_btn.click(
        fn=lambda file, src, tgt, session: translate_document(file, src, tgt, session),
        inputs=[file_input, source_lang_doc, target_lang_doc, session_state],
        outputs=[doc_output, doc_status]
    )

print("NLLB Universal Translator initialized successfully!")

# Launch the app
if __name__ == "__main__":
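    # share=True also publishes a temporary public *.gradio.live URL;
    # set share=False when the app should only be reachable locally.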
    demo.launch(share=True)
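# Example launch with non-default credentials (shell invocation sketch;
# the file name "app.py" is assumed, not taken from this script):
#   USERNAME=translator PASSWORD='change-me' python app.py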