Spaces:
Runtime error
Runtime error
"""
English-to-Urdu Neural Machine Translation App
================================================
Model  : Helsinki-NLP/opus-mt-en-ur (MarianMT)
UI     : Gradio 4.x
Deploy : HuggingFace Spaces | Google Colab

DEPLOYMENT STEPS (HuggingFace Spaces)
--------------------------------------
1. Go to https://huggingface.co/new-space
2. Name your space, choose "Gradio" as the SDK
3. Upload: app.py, requirements.txt, README.md
4. Space auto-builds and launches — no extra config needed
5. Share the public URL from the "App" tab
"""
# ── Standard library ──────────────────────────────────────────────────────────
| import os | |
| import re | |
| import signal | |
| import unicodedata | |
| from pathlib import Path | |
| from typing import Optional | |
# ── Third-party ───────────────────────────────────────────────────────────────
| import gradio as gr | |
| from transformers import MarianMTModel, MarianTokenizer, pipeline | |
# ── Constants ─────────────────────────────────────────────────────────────────
# Hugging Face hub id of the English -> Urdu MarianMT checkpoint.
MODEL_NAME: str = "Helsinki-NLP/opus-mt-en-ur"

# Longest cleaned input (in characters) that run_translation() accepts.
MAX_CHARS: int = 500

# Seconds allowed for one translation before the SIGALRM guard fires.
TRANSLATION_TIMEOUT: int = 30  # seconds

# Directory passed as cache_dir to from_pretrained(): $HF_HOME when set,
# otherwise ~/.cache/huggingface.
_DEFAULT_CACHE_DIR: Path = Path.home() / ".cache" / "huggingface"
CACHE_DIR: Path = Path(os.environ.get("HF_HOME", _DEFAULT_CACHE_DIR))

# ── Global model singleton ────────────────────────────────────────────────────
# Filled in lazily by load_model(); None until the first successful load.
_translator = None
# ─────────────────────────────────────────────────────────────────────────────
# 1. MODEL LOADING
# ─────────────────────────────────────────────────────────────────────────────
def load_model() -> object:
    """
    Return the shared English -> Urdu translation pipeline, creating it on first use.

    The tokenizer and model for MODEL_NAME are fetched with ``from_pretrained``
    (using CACHE_DIR as ``cache_dir``) and wrapped in a ``translation``
    pipeline pinned to ``device=-1``. The pipeline is stored in the
    module-level ``_translator`` so subsequent calls return the same object
    without loading anything again.

    Returns:
        HuggingFace translation pipeline object.

    Raises:
        RuntimeError: If the tokenizer, model or pipeline cannot be created.
            No retry is attempted; the underlying exception is chained as
            the cause.
    """
    global _translator

    if _translator is None:
        try:
            cache_location = str(CACHE_DIR)
            marian_tokenizer = MarianTokenizer.from_pretrained(
                MODEL_NAME, cache_dir=cache_location
            )
            marian_model = MarianMTModel.from_pretrained(
                MODEL_NAME, cache_dir=cache_location
            )
            _translator = pipeline(
                "translation",
                model=marian_model,
                tokenizer=marian_tokenizer,
                device=-1,  # CPU only, no CUDA dependency
            )
        except Exception as exc:
            raise RuntimeError(
                f"Failed to load translation model '{MODEL_NAME}': {exc}"
            ) from exc

    return _translator
# ─────────────────────────────────────────────────────────────────────────────
# 2. PREPROCESSING
# ─────────────────────────────────────────────────────────────────────────────
def preprocess(text: str) -> str:
    """
    Clean and normalise raw English input before sending it to the model.

    Steps, in order:
        1. Normalise unicode to NFC (composed form).
        2. Convert "\\r\\n" and lone "\\r" line endings to "\\n".
        3. Replace whitespace control characters (tab, vertical tab, form
           feed, ...) with a space and drop every other character whose
           unicode category starts with "C"; "\\n" is always kept.
        4. Collapse runs of spaces into a single space.
        5. Strip each line, then strip the whole result.

    Whitespace controls are turned into spaces *before* the filter runs;
    simply deleting them (as a plain category-"C" filter would) glues the
    neighbouring words together, e.g. "hello\\tworld" -> "helloworld".

    Args:
        text: Raw English string from the UI. ``None`` or "" is accepted.

    Returns:
        Cleaned, unicode-normalised string ("" for empty input).
    """
    if not text:
        return ""

    text = unicodedata.normalize("NFC", text)

    # Unify line endings first so a bare "\r" still counts as a line break.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    kept = []
    for ch in text:
        if ch == "\n" or unicodedata.category(ch)[0] != "C":
            kept.append(ch)
        elif ch.isspace():
            # Tab and friends are category Cc but separate words: keep a gap.
            kept.append(" ")
        # Any other control/format/unassigned character is dropped.
    text = "".join(kept)

    # Collapse runs of spaces (tabs have already become spaces above).
    text = re.sub(r"[ \t]+", " ", text)

    # Trim each line, then the text as a whole.
    lines = [line.strip() for line in text.splitlines()]
    return "\n".join(lines).strip()
# ─────────────────────────────────────────────────────────────────────────────
# 3. SENTENCE SPLITTING
# ─────────────────────────────────────────────────────────────────────────────
def split_into_sentences(text: str) -> list[str]:
    """
    Break a paragraph into sentences for batch translation.

    A boundary is either whitespace that directly follows '.', '?' or '!'
    (the punctuation stays attached to the preceding sentence) or a run of
    one or more newlines. Because any whitespace after a full stop counts,
    abbreviations such as "e.g. " are split as well.

    Args:
        text: Preprocessed English paragraph.

    Returns:
        List of stripped, non-empty sentence strings (empty list when the
        text contains nothing but whitespace).
    """
    boundary = re.compile(r"(?<=[.?!])\s+|\n+")

    sentences = []
    for fragment in boundary.split(text):
        fragment = fragment.strip()
        if fragment:
            sentences.append(fragment)
    return sentences
# ─────────────────────────────────────────────────────────────────────────────
# 4. CORE TRANSLATION
# ─────────────────────────────────────────────────────────────────────────────
def _timeout_handler(signum: int, frame) -> None:
    """
    SIGALRM callback that aborts a translation by raising TimeoutError.

    Args:
        signum: Signal number supplied by the signal machinery (unused).
        frame: Interrupted stack frame (unused).

    Raises:
        TimeoutError: Always; the message names TRANSLATION_TIMEOUT.
    """
    message = f"Translation timed out after {TRANSLATION_TIMEOUT} seconds."
    raise TimeoutError(message)
def translate(text: str) -> str:
    """
    Translate preprocessed English text to Urdu using MarianMT.

    The text is split with split_into_sentences(), the resulting list is
    handed to the pipeline in a single call, and the translated sentences
    are joined with single spaces.

    Timeout guard: where possible a SIGALRM alarm of TRANSLATION_TIMEOUT
    seconds is armed around the model call. The guard is silently skipped
    when it cannot be installed:
        - AttributeError / OSError: the platform has no SIGALRM (e.g. Windows);
        - ValueError: signal.signal() was called outside the main thread.
    Whatever the outcome, the alarm is disarmed and the SIGALRM handler that
    was active before the call is put back.

    NOTE(review): Gradio appears to run event handlers in worker threads, in
    which case the guard is inactive there - confirm against the deployed
    Gradio version.

    Args:
        text: Preprocessed English string (output of preprocess()).

    Returns:
        Raw Urdu translation string (before postprocessing).

    Raises:
        ValueError: If input text is empty or whitespace only.
        TimeoutError: If the armed alarm fires before inference finishes.
        RuntimeError: If model loading or inference fails.
    """
    if not text.strip():
        raise ValueError("Input text is empty. Please enter some English text.")

    translator = load_model()
    sentences = split_into_sentences(text)

    # Arm the timeout. ValueError must be tolerated too: letting it escape
    # would be reported to the user as an input problem by the caller.
    previous_handler = None
    guard_armed = False
    try:
        previous_handler = signal.signal(signal.SIGALRM, _timeout_handler)
        guard_armed = True
        signal.alarm(TRANSLATION_TIMEOUT)
    except (AttributeError, OSError, ValueError):
        pass  # No usable SIGALRM here: run without a timeout guard.

    try:
        results = translator(sentences, max_length=512)
    except TimeoutError:
        # Raised by _timeout_handler; keep its type so callers can tell it
        # apart from an inference failure.
        raise
    except Exception as exc:
        raise RuntimeError(f"Model inference failed: {exc}") from exc
    finally:
        if guard_armed:
            signal.alarm(0)  # Disarm any pending alarm.
            # signal.signal() returns None when the old handler was not set
            # from Python; None cannot be re-installed, so leave it alone.
            if previous_handler is not None:
                signal.signal(signal.SIGALRM, previous_handler)

    translated_sentences = [r["translation_text"] for r in results]
    return " ".join(translated_sentences)
# ─────────────────────────────────────────────────────────────────────────────
# 5. POSTPROCESSING
# ─────────────────────────────────────────────────────────────────────────────
def postprocess(urdu_text: str) -> str:
    """
    Format the raw Urdu translation for right-to-left display.

    Steps:
        - Strip leading/trailing whitespace.
        - Prefix a Unicode RLM (Right-to-Left Mark, U+200F) unless the text
          already starts with one, so RTL rendering is enforced even in
          left-to-right containers.

    No trailing newline is appended.

    Args:
        urdu_text: Raw Urdu string from the translation model.

    Returns:
        RLM-prefixed Urdu string, or "" when the input is empty or contains
        only whitespace (so callers never receive a lone invisible mark).
    """
    if not urdu_text:
        return ""

    text = urdu_text.strip()
    if not text:
        # Whitespace-only input: nothing to display, so don't emit a bare RLM.
        return ""

    rlm = "\u200F"
    if not text.startswith(rlm):
        text = rlm + text
    return text
# ─────────────────────────────────────────────────────────────────────────────
# 6. ORCHESTRATION — full pipeline
# ─────────────────────────────────────────────────────────────────────────────
def run_translation(input_text: str) -> tuple[str, str]:
    """
    Run the whole pipeline for the UI: preprocess, translate, postprocess.

    This is the callback wired to the Translate button and to the input
    box's submit event. It never raises: every exception is converted into
    a status message.

    Args:
        input_text: Raw English text from the UI textbox.

    Returns:
        Tuple of (urdu_output, status_message). When the input is empty,
        longer than MAX_CHARS after cleaning, or any step fails,
        urdu_output is "" and status_message explains why.
    """
    try:
        english = preprocess(input_text)
        if not english:
            return "", "β οΈ Please enter some English text before translating."

        n_chars = len(english)
        if n_chars > MAX_CHARS:
            return "", (
                f"β οΈ Input exceeds {MAX_CHARS} characters "
                f"({n_chars} chars). Please shorten your text."
            )

        urdu = postprocess(translate(english))

        english_words = len(english.split())
        urdu_words = len(urdu.split())
        summary = (
            f"β Translation complete β "
            f"{english_words} English words β {urdu_words} Urdu words."
        )
        return urdu, summary
    except Exception as err:
        # Checked in the same order as the original handlers: validation,
        # timeout, model failure, then anything else.
        if isinstance(err, ValueError):
            return "", f"β οΈ {err}"
        if isinstance(err, TimeoutError):
            return "", f"β±οΈ {err}"
        if isinstance(err, RuntimeError):
            return "", f"β {err}"
        return "", f"β Unexpected error: {err}"
def get_word_count(text: str) -> str:
    """
    Build the live word/character counter label for a textbox value.

    Words are whitespace-separated tokens; characters are counted on the
    raw string. When the character count exceeds MAX_CHARS a warning that
    names the limit is appended.

    Args:
        text: Any string (English input or Urdu output); may be empty/None.

    Returns:
        Human-readable word/char count label.
    """
    if not text:
        return "0 words Β· 0 chars"

    n_words, n_chars = len(text.split()), len(text)
    label = f"{n_words} words Β· {n_chars} chars"
    if n_chars > MAX_CHARS:
        label += f" β οΈ limit is {MAX_CHARS}"
    return label
# ─────────────────────────────────────────────────────────────────────────────
# 7. GRADIO UI
# ─────────────────────────────────────────────────────────────────────────────
# Sample English inputs offered by the examples widget. Each row is a
# one-element list: one value for the single input component.
EXAMPLES: list[list[str]] = [
    [sentence]
    for sentence in (
        "Artificial intelligence is transforming the world rapidly.",
        "Pakistan is a beautiful country with rich culture and history.",
        "The patient needs immediate medical attention and care.",
        "Education is the most powerful weapon to change the world.",
        "Good morning! How are you feeling today?",
        "Machine learning models require large datasets for training. "
        "The quality of data directly affects model performance.",
    )
]
# Stylesheet passed to gr.Blocks(css=...).
# The Urdu textarea is forced right-to-left with `direction: rtl` plus
# `unicode-bidi: embed`. `bidi-override` must not be used here: it switches
# off the bidirectional algorithm, so digits and Latin words inside the Urdu
# text would be laid out back to front.
CUSTOM_CSS: str = """
/* ββ Urdu output β force RTL ββ */
#urdu-output textarea {
    direction: rtl !important;
    text-align: right !important;
    font-family: 'Noto Nastaliq Urdu', 'Jameel Noori Nastaleeq',
                 'Urdu Typesetting', 'Segoe UI', sans-serif !important;
    font-size: 18px !important;
    line-height: 2.2 !important;
    unicode-bidi: embed;
}
/* ββ Status bar ββ */
#status-bar {
    font-size: 13px;
    color: #555;
    padding: 6px 10px;
    border-radius: 6px;
    background: #f8f9fa;
    min-height: 34px;
}
/* ββ Word count labels ββ */
.count-label {
    font-size: 12px;
    color: #888;
    text-align: right;
    padding: 2px 4px;
}
/* ββ Translate button accent ββ */
#translate-btn {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    font-weight: 600 !important;
    border: none !important;
}
#translate-btn:hover {
    opacity: 0.92 !important;
    transform: translateY(-1px);
}
"""
def build_ui() -> gr.Blocks:
    """
    Assemble the Gradio Blocks interface and return it without launching.

    Layout, top to bottom:
        - HTML header with the app title and model name
        - Two columns: English input textbox (left) and read-only Urdu
          output textbox (right), each with a word/char counter beneath it
        - Button row: Translate, and a ClearButton bound to both textboxes
        - Markdown status bar for result metadata or error messages
        - Example inputs that load into the English textbox
        - HTML footer crediting the model and Gradio

    Event wiring:
        - A change in either textbox refreshes its counter via
          get_word_count().
        - Clicking Translate, or submitting the input box, calls
          run_translation() and fills the Urdu box and the status bar.
        - Clicking Clear additionally blanks the status bar and resets the
          Urdu counter to the same zero label it starts with.

    Returns:
        Configured gr.Blocks instance (not yet launched).
    """
    theme = gr.themes.Soft(
        primary_hue="violet",
        secondary_hue="purple",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "sans-serif"],
    )

    with gr.Blocks(
        theme=theme,
        css=CUSTOM_CSS,
        title="English β Urdu Translator",
    ) as demo:
        # ── Header ──────────────────────────────────────────────────────────
        gr.HTML("""
        <div style="text-align:center; padding: 24px 0 8px;">
        <h1 style="font-size:2rem; font-weight:700; margin:0;">
        π English β Urdu Translator
        </h1>
        <p style="color:#666; margin-top:8px; font-size:15px;">
        Neural Machine Translation Β· Helsinki-NLP/opus-mt-en-ur Β· MarianMT
        </p>
        </div>
        """)

        # ── Main panels ─────────────────────────────────────────────────────
        with gr.Row(equal_height=True):
            with gr.Column():
                gr.Markdown("#### English Input")
                input_box = gr.Textbox(
                    label="",
                    placeholder="Type or paste English text here⦠(max 500 characters)",
                    lines=10,
                    max_lines=20,
                    show_copy_button=True,
                    elem_id="english-input",
                )
                input_count = gr.Markdown(
                    value="0 words Β· 0 chars",
                    elem_classes=["count-label"],
                )
            with gr.Column():
                gr.Markdown("#### Urdu Output (Ψ§Ψ±Ψ―Ω)")
                output_box = gr.Textbox(
                    label="",
                    placeholder="ΨͺΨ±Ψ¬Ω Ϋ ΫΫΨ§ΪΊ ΨΈΨ§ΫΨ± ΫΩΪ―Ψ§β¦",
                    lines=10,
                    max_lines=20,
                    interactive=False,
                    show_copy_button=True,
                    elem_id="urdu-output",  # targeted by CUSTOM_CSS for RTL
                )
                output_count = gr.Markdown(
                    value="0 words Β· 0 chars",
                    elem_classes=["count-label"],
                )

        # ── Buttons ─────────────────────────────────────────────────────────
        with gr.Row():
            translate_btn = gr.Button(
                "π Translate",
                variant="primary",
                scale=3,
                elem_id="translate-btn",
            )
            clear_btn = gr.ClearButton(
                components=[input_box, output_box],
                value="π Clear",
                scale=1,
            )

        # ── Status bar ──────────────────────────────────────────────────────
        status_bar = gr.Markdown(
            value="",
            elem_id="status-bar",
        )

        # ── Examples ────────────────────────────────────────────────────────
        gr.Examples(
            examples=EXAMPLES,
            inputs=input_box,
            label="π Example Inputs β click to load",
            examples_per_page=6,
        )

        # ── Footer ──────────────────────────────────────────────────────────
        gr.HTML("""
        <div style="text-align:center; padding:16px 0 4px; color:#aaa; font-size:12px;">
        Powered by
        <a href="https://huggingface.co/Helsinki-NLP/opus-mt-en-ur"
        target="_blank" style="color:#764ba2;">Helsinki-NLP/opus-mt-en-ur</a>
        Β· Built with
        <a href="https://gradio.app" target="_blank" style="color:#764ba2;">Gradio 4</a>
        </div>
        """)

        # ── Wiring ──────────────────────────────────────────────────────────
        # Live word counter for input
        input_box.change(
            fn=get_word_count,
            inputs=input_box,
            outputs=input_count,
        )
        # Live word counter for output
        output_box.change(
            fn=get_word_count,
            inputs=output_box,
            outputs=output_count,
        )
        # Translate button
        translate_btn.click(
            fn=run_translation,
            inputs=input_box,
            outputs=[output_box, status_bar],
            api_name="translate",
        )
        # Also allow Enter-key submission (Shift+Enter for newline)
        input_box.submit(
            fn=run_translation,
            inputs=input_box,
            outputs=[output_box, status_bar],
        )
        # On Clear: blank the status bar and put the Urdu counter back to its
        # zero label (an empty string here would leave the counter blank,
        # unlike its initial state).
        clear_btn.click(
            fn=lambda: ("", get_word_count("")),
            outputs=[status_bar, output_count],
        )

    return demo
# ─────────────────────────────────────────────────────────────────────────────
# 8. ENTRY POINT
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Launch the Gradio app when this file is executed as a script.
    #   - server_name="0.0.0.0": listen on all interfaces, so the app is
    #     reachable from the local network
    #   - share=False: no public share link (set True in Colab; the original
    #     note points to colab_run.py)
    # NOTE(review): the original comment stated that HuggingFace Spaces calls
    # launch() itself and does not need this block - not verifiable from this
    # file; confirm against the Spaces runner.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo = build_ui()
    demo.launch(**launch_options)