"""
English-to-Urdu Neural Machine Translation App
================================================
Model  : Helsinki-NLP/opus-mt-en-ur (MarianMT)
UI     : Gradio 4.x
Deploy : HuggingFace Spaces | Google Colab

DEPLOYMENT STEPS (HuggingFace Spaces)
--------------------------------------
1. Go to https://huggingface.co/new-space
2. Name your space, choose "Gradio" as the SDK
3. Upload: app.py, requirements.txt, README.md
4. Space auto-builds and launches — no extra config needed
5. Share the public URL from the "App" tab
"""

# ── Standard library ──────────────────────────────────────────────────────────
import concurrent.futures
import os
import re
import signal
import threading
import unicodedata
from pathlib import Path
from typing import Callable, Optional, TypeVar

# ── Third-party ───────────────────────────────────────────────────────────────
import gradio as gr
from transformers import MarianMTModel, MarianTokenizer, pipeline

# ── Constants ─────────────────────────────────────────────────────────────────
MODEL_NAME: str = "Helsinki-NLP/opus-mt-en-ur"
MAX_CHARS: int = 500
TRANSLATION_TIMEOUT: int = 30  # seconds
CACHE_DIR: Path = Path(os.getenv("HF_HOME", Path.home() / ".cache" / "huggingface"))

# Compiled once at import time; both patterns are used on every request.
_SPACE_RUN_RE = re.compile(r"[ \t]+")
_SENTENCE_BOUNDARY_RE = re.compile(r"(?<=[.?!])\s+|\n+")

_T = TypeVar("_T")

# ── Global model singleton ────────────────────────────────────────────────────
_translator: Optional[object] = None
# Serialises first-time loading so concurrent callers do not each load a copy.
_translator_lock = threading.Lock()


# ─────────────────────────────────────────────────────────────────────────────
# 1. MODEL LOADING
# ─────────────────────────────────────────────────────────────────────────────

def load_model() -> object:
    """
    Load the MarianMT translation pipeline (English → Urdu).

    Uses a global singleton so the model is loaded only once per process.
    Loading happens under a lock, so concurrent first calls share one load.
    CACHE_DIR is passed to ``from_pretrained`` as the download/cache location.

    Returns:
        HuggingFace translation pipeline object.

    Raises:
        RuntimeError: If the tokenizer, model or pipeline cannot be created.
                      No retry is attempted; the next call tries again.
    """
    global _translator

    # Fast path: already loaded, no locking needed.
    if _translator is not None:
        return _translator

    with _translator_lock:
        # Another thread may have finished loading while we waited.
        if _translator is not None:
            return _translator

        try:
            tokenizer = MarianTokenizer.from_pretrained(
                MODEL_NAME, cache_dir=str(CACHE_DIR)
            )
            model = MarianMTModel.from_pretrained(
                MODEL_NAME, cache_dir=str(CACHE_DIR)
            )
            _translator = pipeline(
                "translation",
                model=model,
                tokenizer=tokenizer,
                device=-1,  # CPU only — no CUDA dependency
            )
        except Exception as exc:
            raise RuntimeError(
                f"Failed to load translation model '{MODEL_NAME}': {exc}"
            ) from exc

        return _translator


# ─────────────────────────────────────────────────────────────────────────────
# 2. PREPROCESSING
# ─────────────────────────────────────────────────────────────────────────────

def preprocess(text: str) -> str:
    """
    Clean and normalise raw English input before sending to the model.

    Steps:
      - Normalise unicode to NFC (composed form)
      - Convert CRLF / CR line endings to "\\n" and tabs to spaces
      - Remove remaining characters in the Unicode "C" categories
        (control, format, ...), keeping "\\n"
      - Collapse runs of spaces into a single space
      - Strip each line and the text as a whole

    Args:
        text: Raw English string from the UI (may be empty or None).

    Returns:
        Cleaned, unicode-normalised string ("" for empty input).
    """
    if not text:
        return ""

    # Unicode normalisation (NFC — composed form)
    text = unicodedata.normalize("NFC", text)

    # Tabs and carriage returns are control characters (category "Cc").
    # Map them to their whitespace meaning *before* the control-character
    # filter below, otherwise "a\tb" or "a\rb" would be fused into "ab".
    text = text.replace("\r\n", "\n").replace("\r", "\n").replace("\t", " ")

    # Remove non-printable control chars (keep \n for sentence splitting)
    text = "".join(
        ch for ch in text
        if ch == "\n" or unicodedata.category(ch)[0] != "C"
    )

    # Collapse runs of spaces/tabs
    text = _SPACE_RUN_RE.sub(" ", text)

    # Trim each line
    lines = [line.strip() for line in text.splitlines()]
    return "\n".join(lines).strip()


# ─────────────────────────────────────────────────────────────────────────────
# 3. SENTENCE SPLITTING
# ─────────────────────────────────────────────────────────────────────────────

def split_into_sentences(text: str) -> list[str]:
    """
    Split a paragraph into individual sentences for batch translation.

    Splits on whitespace that follows '.', '?' or '!' (the punctuation stays
    attached to the preceding sentence) and on runs of newlines.

    Args:
        text: Preprocessed English paragraph.

    Returns:
        List of non-empty, stripped sentence strings.
    """
    parts = _SENTENCE_BOUNDARY_RE.split(text)
    return [s.strip() for s in parts if s.strip()]


# ─────────────────────────────────────────────────────────────────────────────
# 4. CORE TRANSLATION
# ─────────────────────────────────────────────────────────────────────────────

def _timeout_handler(signum: int, frame) -> None:
    """SIGALRM handler — raises TimeoutError when translation exceeds limit."""
    raise TimeoutError(f"Translation timed out after {TRANSLATION_TIMEOUT} seconds.")


def _run_with_timeout(func: Callable[[], _T]) -> _T:
    """
    Call ``func()`` and raise TimeoutError after TRANSLATION_TIMEOUT seconds.

    Two strategies are used:
      - On the main thread of a platform that has SIGALRM, an alarm is armed
        around the call and the previous handler is restored afterwards.
      - Otherwise ``func`` runs on a single worker thread and this function
        waits for its result with a timeout. ``signal.signal()`` cannot be
        used here: it raises ValueError outside the main thread.

    In the worker-thread case the call itself cannot be interrupted; after a
    timeout it keeps running in the background until it finishes.

    Args:
        func: Zero-argument callable to execute.

    Returns:
        Whatever ``func()`` returns.

    Raises:
        TimeoutError: If the call does not finish within TRANSLATION_TIMEOUT.
        Exception: Anything raised by ``func`` itself is propagated.
    """
    alarm_usable = (
        hasattr(signal, "SIGALRM")
        and threading.current_thread() is threading.main_thread()
    )

    if alarm_usable:
        previous_handler = signal.signal(signal.SIGALRM, _timeout_handler)
        signal.alarm(TRANSLATION_TIMEOUT)
        try:
            return func()
        finally:
            signal.alarm(0)  # Disarm alarm
            # signal.signal() returns None for handlers not installed from
            # Python; those cannot be re-installed through this API.
            if previous_handler is not None:
                signal.signal(signal.SIGALRM, previous_handler)

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(func)
        return future.result(timeout=TRANSLATION_TIMEOUT)
    except concurrent.futures.TimeoutError:
        raise TimeoutError(
            f"Translation timed out after {TRANSLATION_TIMEOUT} seconds."
        ) from None
    finally:
        # Do not block on a still-running call; that would defeat the timeout.
        executor.shutdown(wait=False)


def translate(text: str) -> str:
    """
    Translate preprocessed English text to Urdu using MarianMT.

    The text is split into sentences, the whole list is passed to the
    translation pipeline in one call, and the translated sentences are
    joined with single spaces.

    The pipeline call is guarded by a TRANSLATION_TIMEOUT-second timeout that
    works both on the main thread and when invoked from a worker thread
    (see ``_run_with_timeout``).

    Args:
        text: Preprocessed English string (output of preprocess()).

    Returns:
        Raw Urdu translation string (before postprocessing).

    Raises:
        ValueError:   If input text is empty.
        TimeoutError: If translation exceeds TRANSLATION_TIMEOUT seconds.
        RuntimeError: If the model cannot be loaded or inference fails.
    """
    if not text.strip():
        raise ValueError("Input text is empty. Please enter some English text.")

    translator = load_model()
    sentences = split_into_sentences(text)

    try:
        results = _run_with_timeout(
            lambda: translator(sentences, max_length=512)
        )
    except TimeoutError:
        raise
    except Exception as exc:
        raise RuntimeError(f"Model inference failed: {exc}") from exc

    return " ".join(r["translation_text"] for r in results)


# ─────────────────────────────────────────────────────────────────────────────
# 5. POSTPROCESSING
# ─────────────────────────────────────────────────────────────────────────────

def postprocess(urdu_text: str) -> str:
    """
    Format the raw Urdu translation for correct RTL display.

    Steps:
      - Strip leading/trailing whitespace
      - Prefix a Unicode RLM (Right-to-Left Mark, U+200F) unless the text
        already starts with one, to force RTL rendering in environments
        that don't auto-detect Urdu script

    Args:
        urdu_text: Raw Urdu string from the translation model.

    Returns:
        RTL-formatted Urdu string ready for the Gradio output box
        ("" for empty input).
    """
    if not urdu_text:
        return ""

    text = urdu_text.strip()

    # Insert RLM marker so RTL is enforced even in LTR containers
    rlm = "\u200F"
    if not text.startswith(rlm):
        text = rlm + text

    return text


# ─────────────────────────────────────────────────────────────────────────────
# 6. ORCHESTRATION — full pipeline
# ─────────────────────────────────────────────────────────────────────────────

def run_translation(input_text: str) -> tuple[str, str]:
    """
    Full end-to-end translation pipeline: preprocess → translate → postprocess.

    This is the function wired to the Gradio interface. It never raises;
    every failure is turned into a status message.

    Args:
        input_text: Raw English text from the UI textbox.

    Returns:
        Tuple of (urdu_output: str, status_message: str).
        On error, urdu_output is "" and status_message contains the error.
    """
    try:
        cleaned = preprocess(input_text)

        if not cleaned:
            return "", "⚠️ Please enter some English text before translating."

        # The limit applies to the cleaned text, not the raw textbox content.
        if len(cleaned) > MAX_CHARS:
            return "", (
                f"⚠️ Input exceeds {MAX_CHARS} characters "
                f"({len(cleaned)} chars). Please shorten your text."
            )

        raw_urdu = translate(cleaned)
        formatted_urdu = postprocess(raw_urdu)

        word_count_in = len(cleaned.split())
        word_count_out = len(formatted_urdu.split())
        status = (
            f"✅ Translation complete — "
            f"{word_count_in} English words → {word_count_out} Urdu words."
        )
        return formatted_urdu, status

    except ValueError as e:
        return "", f"⚠️ {e}"
    except TimeoutError as e:
        return "", f"⏱️ {e}"
    except RuntimeError as e:
        return "", f"❌ {e}"
    except Exception as e:
        return "", f"❌ Unexpected error: {e}"


def get_word_count(text: str, *, show_limit_warning: bool = True) -> str:
    """
    Return a live word-count string for a given text input.

    Args:
        text: Any string (English input or Urdu output).
        show_limit_warning: When True (default), append a warning if the text
            is longer than MAX_CHARS. Pass False for text the input limit
            does not apply to, such as the translated output.

    Returns:
        Human-readable word/char count label.
    """
    if not text:
        return "0 words · 0 chars"

    words = len(text.split())
    chars = len(text)
    over_limit = show_limit_warning and chars > MAX_CHARS
    warn = f"  ⚠️ limit is {MAX_CHARS}" if over_limit else ""
    return f"{words} words · {chars} chars{warn}"


# ─────────────────────────────────────────────────────────────────────────────
# 7. GRADIO UI
# ─────────────────────────────────────────────────────────────────────────────

EXAMPLES: list[list[str]] = [
    ["Artificial intelligence is transforming the world rapidly."],
    ["Pakistan is a beautiful country with rich culture and history."],
    ["The patient needs immediate medical attention and care."],
    ["Education is the most powerful weapon to change the world."],
    ["Good morning! How are you feeling today?"],
    [
        "Machine learning models require large datasets for training. "
        "The quality of data directly affects model performance."
    ],
]

# NOTE: `unicode-bidi: isolate` keeps the normal bidirectional algorithm
# active, so digits and Latin words embedded in the Urdu text keep their
# left-to-right order. `bidi-override` would force every character RTL.
CUSTOM_CSS: str = """
/* ── Urdu output — force RTL ── */
#urdu-output textarea {
    direction: rtl !important;
    text-align: right !important;
    font-family: 'Noto Nastaliq Urdu', 'Jameel Noori Nastaleeq',
                 'Urdu Typesetting', 'Segoe UI', sans-serif !important;
    font-size: 18px !important;
    line-height: 2.2 !important;
    unicode-bidi: isolate;
}

/* ── Status bar ── */
#status-bar {
    font-size: 13px;
    color: #555;
    padding: 6px 10px;
    border-radius: 6px;
    background: #f8f9fa;
    min-height: 34px;
}

/* ── Word count labels ── */
.count-label {
    font-size: 12px;
    color: #888;
    text-align: right;
    padding: 2px 4px;
}

/* ── Translate button accent ── */
#translate-btn {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    font-weight: 600 !important;
    border: none !important;
}
#translate-btn:hover {
    opacity: 0.92 !important;
    transform: translateY(-1px);
}
"""


def build_ui() -> gr.Blocks:
    """
    Construct and return the Gradio Blocks UI.

    Layout:
      - Header with app title and description
      - Two-column panel: English input (left) | Urdu output (right)
      - Live word/char counters below each panel
      - Action buttons: Translate · Clear · (Copy handled natively by Gradio)
      - Status bar showing result metadata or error messages
      - Example inputs at the bottom

    Returns:
        Configured gr.Blocks instance (not yet launched).
    """
    theme = gr.themes.Soft(
        primary_hue="violet",
        secondary_hue="purple",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "sans-serif"],
    )

    # Label shown by both counters while their textbox is empty.
    empty_count = get_word_count("")

    with gr.Blocks(
        theme=theme,
        css=CUSTOM_CSS,
        title="English → Urdu Translator",
    ) as demo:

        # ── Header ────────────────────────────────────────────────────────────
        gr.HTML("""
        <div>
            <h1>🌐 English → Urdu Translator</h1>
            <p>Neural Machine Translation · Helsinki-NLP/opus-mt-en-ur · MarianMT</p>
        </div>
        """)

        # ── Main panels ───────────────────────────────────────────────────────
        with gr.Row(equal_height=True):
            with gr.Column():
                gr.Markdown("#### English Input")
                input_box = gr.Textbox(
                    label="",
                    placeholder=(
                        "Type or paste English text here… "
                        f"(max {MAX_CHARS} characters)"
                    ),
                    lines=10,
                    max_lines=20,
                    show_copy_button=True,
                    elem_id="english-input",
                )
                input_count = gr.Markdown(
                    value=empty_count,
                    elem_classes=["count-label"],
                )

            with gr.Column():
                gr.Markdown("#### Urdu Output (اردو)")
                output_box = gr.Textbox(
                    label="",
                    placeholder="ترجمہ یہاں ظاہر ہوگا…",
                    lines=10,
                    max_lines=20,
                    interactive=False,
                    show_copy_button=True,
                    elem_id="urdu-output",
                )
                output_count = gr.Markdown(
                    value=empty_count,
                    elem_classes=["count-label"],
                )

        # ── Buttons ───────────────────────────────────────────────────────────
        with gr.Row():
            translate_btn = gr.Button(
                "🔄 Translate",
                variant="primary",
                scale=3,
                elem_id="translate-btn",
            )
            clear_btn = gr.ClearButton(
                components=[input_box, output_box],
                value="🗑 Clear",
                scale=1,
            )

        # ── Status bar ────────────────────────────────────────────────────────
        status_bar = gr.Markdown(
            value="",
            elem_id="status-bar",
        )

        # ── Examples ──────────────────────────────────────────────────────────
        gr.Examples(
            examples=EXAMPLES,
            inputs=input_box,
            label="📋 Example Inputs — click to load",
            examples_per_page=6,
        )

        # ── Footer ────────────────────────────────────────────────────────────
        gr.HTML("""
        <p>Powered by Helsinki-NLP/opus-mt-en-ur · Built with Gradio 4</p>
        """)

        # ── Wiring ────────────────────────────────────────────────────────────

        # Live word counter for input
        input_box.change(
            fn=get_word_count,
            inputs=input_box,
            outputs=input_count,
        )

        # Live word counter for output. MAX_CHARS limits the English input
        # only, so the limit warning is switched off here.
        output_box.change(
            fn=lambda text: get_word_count(text, show_limit_warning=False),
            inputs=output_box,
            outputs=output_count,
        )

        # Translate button
        translate_btn.click(
            fn=run_translation,
            inputs=input_box,
            outputs=[output_box, status_bar],
            api_name="translate",
        )

        # Also allow Enter-key submission (Shift+Enter for newline)
        input_box.submit(
            fn=run_translation,
            inputs=input_box,
            outputs=[output_box, status_bar],
        )

        # On Clear: blank the status bar and put both counters back to the
        # same zero label they start with.
        clear_btn.click(
            fn=lambda: ("", empty_count, empty_count),
            outputs=[status_bar, input_count, output_count],
        )

    return demo


# ─────────────────────────────────────────────────────────────────────────────
# 8. ENTRY POINT
# ─────────────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Launch the Gradio app when this file is executed as a script.
    #
    # - server_name="0.0.0.0" → listen on all network interfaces
    # - server_port=7860      → port the server binds to
    # - share=False           → set True in Colab (see colab_run.py)
    # - show_error=True       → surface handler errors in the UI
    demo = build_ui()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )