Spaces:

ShahbazAhmad-Lab
/

transformer

Runtime error

App Files Files Community

ShahbazAhmad-Lab committed on Apr 17

Commit

7ac1dc0

verified ·

1 Parent(s): e4c02e8

Create app.py

Browse files

Files changed (1) hide show

app.py +529 -0

app.py ADDED Viewed

	@@ -0,0 +1,529 @@

+"""
+English-to-Urdu Neural Machine Translation App
+================================================
+Model  : Helsinki-NLP/opus-mt-en-ur (MarianMT)
+UI     : Gradio 4.x
+Deploy : HuggingFace Spaces  |  Google Colab
+DEPLOYMENT STEPS (HuggingFace Spaces)
+--------------------------------------
+1. Go to https://huggingface.co/new-space
+2. Name your space, choose "Gradio" as the SDK
+3. Upload: app.py, requirements.txt, README.md
+4. Space auto-builds and launches — no extra config needed
+5. Share the public URL from the "App" tab
+"""
+# ── Standard library ──────────────────────────────────────────────────────────
+import os
+import re
+import signal
+import unicodedata
+from pathlib import Path
+from typing import Optional
+# ── Third-party ───────────────────────────────────────────────────────────────
+import gradio as gr
+from transformers import MarianMTModel, MarianTokenizer, pipeline
+# ── Constants ─────────────────────────────────────────────────────────────────
# Hugging Face Hub identifier of the English → Urdu MarianMT checkpoint.
MODEL_NAME: str = "Helsinki-NLP/opus-mt-en-ur"
# Upper bound on the cleaned input length accepted for a single request.
MAX_CHARS: int = 500
# Wall-clock budget, in seconds, for one translation call.
TRANSLATION_TIMEOUT: int = 30
# Download/cache location: $HF_HOME when set, else ~/.cache/huggingface.
CACHE_DIR: Path = Path(
    os.environ.get("HF_HOME", Path.home() / ".cache" / "huggingface")
)
# ── Global model singleton ────────────────────────────────────────────────────
# Populated lazily on the first load_model() call.
_translator = None
+# ─────────────────────────────────────────────────────────────────────────────
+# 1. MODEL LOADING
+# ─────────────────────────────────────────────────────────────────────────────
def load_model() -> object:
    """
    Return the process-wide English → Urdu translation pipeline.

    The pipeline is built on the first call and stored in the module-level
    ``_translator``; later calls return that same object. Tokenizer and
    model weights are fetched with ``cache_dir=CACHE_DIR``.

    Returns:
        The ``transformers`` translation pipeline, pinned to CPU.
    Raises:
        RuntimeError: If the tokenizer, model, or pipeline cannot be
            created. The original exception is chained as the cause.
            No retry is attempted.
    """
    global _translator

    # Fast path: already initialised.
    if _translator is not None:
        return _translator

    try:
        cache_location = str(CACHE_DIR)
        marian_tokenizer = MarianTokenizer.from_pretrained(
            MODEL_NAME, cache_dir=cache_location
        )
        marian_model = MarianMTModel.from_pretrained(
            MODEL_NAME, cache_dir=cache_location
        )
        built_pipeline = pipeline(
            "translation",
            model=marian_model,
            tokenizer=marian_tokenizer,
            device=-1,  # CPU only — no CUDA dependency
        )
    except Exception as exc:
        raise RuntimeError(
            f"Failed to load translation model '{MODEL_NAME}': {exc}"
        ) from exc

    # Publish only after every step succeeded.
    _translator = built_pipeline
    return _translator
+# ─────────────────────────────────────────────────────────────────────────────
+# 2. PREPROCESSING
+# ─────────────────────────────────────────────────────────────────────────────
def preprocess(text: str) -> str:
    """
    Clean and normalise raw English input before sending to the model.

    Steps:
      - Normalise unicode to NFC (composed form)
      - Normalise ``\\r\\n`` and lone ``\\r`` line endings to ``\\n``
      - Remove characters in the Unicode "C" (control/format/unassigned)
        categories, except newline and tab
      - Collapse runs of spaces/tabs into a single space
      - Strip each line, then strip the whole text

    Args:
        text: Raw English string from the UI (may be empty or None).
    Returns:
        Cleaned, unicode-normalised string ("" for empty input).
    """
    if not text:
        return ""

    # Unicode normalisation (NFC — composed form)
    text = unicodedata.normalize("NFC", text)

    # Carriage returns are control characters; convert them to newlines
    # first so that old-style "\r" line breaks are not silently deleted
    # (which would glue two lines together).
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Drop control-category characters. Tab is category "Cc" too, so it must
    # be kept explicitly here — otherwise "a\tb" collapses to "ab" before the
    # whitespace step below ever sees the tab.
    text = "".join(
        ch for ch in text
        if ch in "\n\t" or unicodedata.category(ch)[0] != "C"
    )

    # Collapse runs of spaces/tabs into one space
    text = re.sub(r"[ \t]+", " ", text)

    # Trim each line, then the text as a whole
    return "\n".join(line.strip() for line in text.splitlines()).strip()
+# ─────────────────────────────────────────────────────────────────────────────
+# 3. SENTENCE SPLITTING
+# ─────────────────────────────────────────────────────────────────────────────
def split_into_sentences(text: str) -> list[str]:
    """
    Break a paragraph into sentence-sized chunks for batch translation.

    A boundary is any whitespace run that directly follows '.', '?' or '!'
    (the punctuation stays attached to the preceding chunk), or any run of
    newlines. Chunks that are empty after trimming are dropped.

    Args:
        text: Preprocessed English paragraph.
    Returns:
        List of trimmed, non-empty sentence strings, in input order.
    """
    sentences: list[str] = []
    for chunk in re.split(r"(?<=[.?!])\s+|\n+", text):
        trimmed = chunk.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
+# ─────────────────────────────────────────────────────────────────────────────
+# 4. CORE TRANSLATION
+# ─────────────────────────────────────────────────────────────────────────────
def _timeout_handler(signum: int, frame) -> None:
    """
    Signal handler that aborts the running call with TimeoutError.

    translate() registers this for SIGALRM. Both arguments are supplied by
    the signal machinery and are ignored.

    Raises:
        TimeoutError: Always; the message names TRANSLATION_TIMEOUT.
    """
    message = f"Translation timed out after {TRANSLATION_TIMEOUT} seconds."
    raise TimeoutError(message)
def translate(text: str) -> str:
    """
    Translate preprocessed English text to Urdu using MarianMT.

    The text is split into sentences, the whole list is passed to the
    pipeline in one call, and the translated sentences are rejoined with
    single spaces.

    A SIGALRM-based timeout guard aborts calls that exceed
    TRANSLATION_TIMEOUT seconds. The guard is best-effort: it is skipped
    when SIGALRM is unavailable (Windows) or when this function is called
    from a thread other than the main thread, because ``signal.signal()``
    raises ValueError there. In those cases the translation simply runs
    without a time limit.

    Args:
        text: Preprocessed English string (output of preprocess()).
    Returns:
        Raw Urdu translation string (before postprocessing).
    Raises:
        ValueError: If input text is empty.
        TimeoutError: If the armed alarm fires during translation.
        RuntimeError: If model loading or inference fails.
    """
    if not text.strip():
        raise ValueError("Input text is empty. Please enter some English text.")

    translator = load_model()
    sentences = split_into_sentences(text)

    # Arm the timeout. AttributeError/OSError: platform has no SIGALRM.
    # ValueError: not on the main thread — must be swallowed here, otherwise
    # it escapes as a bogus "invalid input" error for every request.
    previous_handler = None
    handler_installed = False
    try:
        previous_handler = signal.signal(signal.SIGALRM, _timeout_handler)
        handler_installed = True
        signal.alarm(TRANSLATION_TIMEOUT)
    except (AttributeError, OSError, ValueError):
        pass  # No timeout guard available in this context

    try:
        results = translator(sentences, max_length=512)
    except TimeoutError:
        raise
    except Exception as exc:
        raise RuntimeError(f"Model inference failed: {exc}") from exc
    finally:
        if handler_installed:
            signal.alarm(0)  # Disarm any pending alarm
            # Put back whatever handler was there before; None means the
            # previous handler was not installed from Python and cannot be
            # re-registered through signal.signal().
            if previous_handler is not None:
                signal.signal(signal.SIGALRM, previous_handler)

    return " ".join(r["translation_text"] for r in results)
+# ─────────────────────────────────────────────────────────────────────────────
+# 5. POSTPROCESSING
+# ─────────────────────────────────────────────────────────────────────────────
def postprocess(urdu_text: str) -> str:
    """
    Format the raw Urdu translation for correct RTL display.

    Steps:
      - Strip leading/trailing whitespace
      - Prefix a Unicode RLM (Right-to-Left Mark, U+200F) unless the text
        already starts with one, to force RTL rendering in environments
        that don't auto-detect Urdu script

    Args:
        urdu_text: Raw Urdu string from the translation model.
    Returns:
        RTL-formatted Urdu string ready for the Gradio output box, or ""
        when the input is empty or contains only whitespace.
    """
    if not urdu_text:
        return ""

    text = urdu_text.strip()
    if not text:
        # Whitespace-only input: return nothing rather than a lone,
        # invisible RLM character.
        return ""

    rlm = "\u200F"
    return text if text.startswith(rlm) else rlm + text
+# ─────────────────────────────────────────────────────────────────────────────
+# 6. ORCHESTRATION — full pipeline
+# ─────────────────────────────────────────────────────────────────────────────
def run_translation(input_text: str) -> tuple[str, str]:
    """
    Run the whole pipeline for one request: preprocess → translate →
    postprocess.

    This is the callback wired to the Gradio interface. It never raises;
    every failure is converted into a status message.

    Args:
        input_text: Raw English text from the UI textbox.
    Returns:
        ``(urdu_output, status_message)``. On any rejection or error
        ``urdu_output`` is "" and ``status_message`` explains why.
    """
    try:
        cleaned = preprocess(input_text)

        # Guard clauses: nothing to translate / too much to translate.
        if not cleaned:
            return "", "⚠️ Please enter some English text before translating."
        cleaned_length = len(cleaned)
        if cleaned_length > MAX_CHARS:
            too_long = (
                f"⚠️ Input exceeds {MAX_CHARS} characters "
                f"({cleaned_length} chars). Please shorten your text."
            )
            return "", too_long

        formatted_urdu = postprocess(translate(cleaned))

        english_words = len(cleaned.split())
        urdu_words = len(formatted_urdu.split())
        status = (
            f"✅ Translation complete — "
            f"{english_words} English words → {urdu_words} Urdu words."
        )
        return formatted_urdu, status
    except Exception as err:
        # Map the exception type to its status prefix, most specific first.
        if isinstance(err, ValueError):
            return "", f"⚠️ {err}"
        if isinstance(err, TimeoutError):
            return "", f"⏱️ {err}"
        if isinstance(err, RuntimeError):
            return "", f"❌ {err}"
        return "", f"❌ Unexpected error: {err}"
def get_word_count(text: str) -> str:
    """
    Build the live counter label shown under a textbox.

    Words are whitespace-separated tokens; chars is ``len(text)``. When
    the character count is above MAX_CHARS a warning suffix is appended.

    Args:
        text: Any string (English input or Urdu output); may be empty.
    Returns:
        Label of the form "<n> words · <m> chars", optionally followed by
        the limit warning.
    """
    if not text:
        return "0 words · 0 chars"

    char_total = len(text)
    word_total = len(text.split())
    label = f"{word_total} words · {char_total} chars"
    if char_total > MAX_CHARS:
        label += f" ⚠️ limit is {MAX_CHARS}"
    return label
+# ─────────────────────────────────────────────────────────────────────────────
+# 7. GRADIO UI
+# ─────────────────────────────────────────────────────────────────────────────
# Sample inputs offered in the UI. Each inner list is one example row with a
# single value (the English input text).
EXAMPLES: list[list[str]] = [
    [sentence]
    for sentence in (
        "Artificial intelligence is transforming the world rapidly.",
        "Pakistan is a beautiful country with rich culture and history.",
        "The patient needs immediate medical attention and care.",
        "Education is the most powerful weapon to change the world.",
        "Good morning! How are you feeling today?",
        (
            "Machine learning models require large datasets for training. "
            "The quality of data directly affects model performance."
        ),
    )
]
# Stylesheet injected into the Gradio page.
# NOTE: the Urdu textarea uses `unicode-bidi: isolate`, not `bidi-override`.
# `bidi-override` lays out *every* character right-to-left, which renders any
# digits or Latin words inside the translation reversed; `direction: rtl`
# already sets the base direction, and the bidi algorithm must stay active
# for mixed-script text.
CUSTOM_CSS: str = """
/* ── Urdu output — force RTL ── */
#urdu-output textarea {
    direction: rtl !important;
    text-align: right !important;
    font-family: 'Noto Nastaliq Urdu', 'Jameel Noori Nastaleeq',
                 'Urdu Typesetting', 'Segoe UI', sans-serif !important;
    font-size: 18px !important;
    line-height: 2.2 !important;
    unicode-bidi: isolate;
}
/* ── Status bar ── */
#status-bar {
    font-size: 13px;
    color: #555;
    padding: 6px 10px;
    border-radius: 6px;
    background: #f8f9fa;
    min-height: 34px;
}
/* ── Word count labels ── */
.count-label {
    font-size: 12px;
    color: #888;
    text-align: right;
    padding: 2px 4px;
}
/* ── Translate button accent ── */
#translate-btn {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    font-weight: 600 !important;
    border: none !important;
}
#translate-btn:hover {
    opacity: 0.92 !important;
    transform: translateY(-1px);
}
"""
def build_ui() -> gr.Blocks:
    """
    Construct and return the Gradio Blocks UI.

    Layout:
      - Header with app title and description
      - Two-column panel: English input (left) | Urdu output (right)
      - Live word/char counters below each panel
      - Action buttons: Translate · Clear (copy buttons are provided by the
        textboxes themselves via ``show_copy_button``)
      - Status bar showing result metadata or error messages
      - Example inputs at the bottom

    Wiring:
      - Editing either textbox refreshes its counter via get_word_count()
      - Translate button and textbox submit both call run_translation()
      - Clear empties both textboxes, the status bar, and resets both
        counters to the zero label

    Returns:
        Configured gr.Blocks instance (not yet launched).
    """
    theme = gr.themes.Soft(
        primary_hue="violet",
        secondary_hue="purple",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "sans-serif"],
    )
    # Label shown by both counters when their textbox is empty.
    empty_count = get_word_count("")

    with gr.Blocks(
        theme=theme,
        css=CUSTOM_CSS,
        title="English → Urdu Translator",
    ) as demo:
        # ── Header ────────────────────────────────────────────────────────────
        gr.HTML("""
        <div style="text-align:center; padding: 24px 0 8px;">
          <h1 style="font-size:2rem; font-weight:700; margin:0;">
            🌐 English → Urdu Translator
          </h1>
          <p style="color:#666; margin-top:8px; font-size:15px;">
            Neural Machine Translation · Helsinki-NLP/opus-mt-en-ur · MarianMT
          </p>
        </div>
        """)
        # ── Main panels ───────────────────────────────────────────────────────
        with gr.Row(equal_height=True):
            with gr.Column():
                gr.Markdown("#### English Input")
                input_box = gr.Textbox(
                    label="",
                    # Derived from MAX_CHARS so the hint cannot drift from
                    # the limit actually enforced by run_translation().
                    placeholder=(
                        "Type or paste English text here… "
                        f"(max {MAX_CHARS} characters)"
                    ),
                    lines=10,
                    max_lines=20,
                    show_copy_button=True,
                    elem_id="english-input",
                )
                input_count = gr.Markdown(
                    value=empty_count,
                    elem_classes=["count-label"],
                )
            with gr.Column():
                gr.Markdown("#### Urdu Output (اردو)")
                output_box = gr.Textbox(
                    label="",
                    placeholder="ترجمہ یہاں ظاہر ہوگا…",
                    lines=10,
                    max_lines=20,
                    interactive=False,
                    show_copy_button=True,
                    elem_id="urdu-output",
                )
                output_count = gr.Markdown(
                    value=empty_count,
                    elem_classes=["count-label"],
                )
        # ── Buttons ───────────────────────────────────────────────────────────
        with gr.Row():
            translate_btn = gr.Button(
                "🔄  Translate",
                variant="primary",
                scale=3,
                elem_id="translate-btn",
            )
            clear_btn = gr.ClearButton(
                components=[input_box, output_box],
                value="🗑  Clear",
                scale=1,
            )
        # ── Status bar ────────────────────────────────────────────────────────
        status_bar = gr.Markdown(
            value="",
            elem_id="status-bar",
        )
        # ── Examples ─────────────────────────────────────────────────────────
        gr.Examples(
            examples=EXAMPLES,
            inputs=input_box,
            label="📋 Example Inputs — click to load",
            examples_per_page=6,
        )
        # ── Footer ────────────────────────────────────────────────────────────
        gr.HTML("""
        <div style="text-align:center; padding:16px 0 4px; color:#aaa; font-size:12px;">
          Powered by
          <a href="https://huggingface.co/Helsinki-NLP/opus-mt-en-ur"
             target="_blank" style="color:#764ba2;">Helsinki-NLP/opus-mt-en-ur</a>
          · Built with
          <a href="https://gradio.app" target="_blank" style="color:#764ba2;">Gradio 4</a>
        </div>
        """)
        # ── Wiring ────────────────────────────────────────────────────────────
        # Live word counter for input
        input_box.change(
            fn=get_word_count,
            inputs=input_box,
            outputs=input_count,
        )
        # Live word counter for output
        output_box.change(
            fn=get_word_count,
            inputs=output_box,
            outputs=output_count,
        )
        # Translate button
        translate_btn.click(
            fn=run_translation,
            inputs=input_box,
            outputs=[output_box, status_bar],
            api_name="translate",
        )
        # Keyboard submission from the input box. NOTE(review): with a
        # multi-line textbox Gradio is expected to submit on Shift+Enter and
        # insert a newline on plain Enter — confirm against the installed
        # Gradio version.
        input_box.submit(
            fn=run_translation,
            inputs=input_box,
            outputs=[output_box, status_bar],
        )
        # Clear button: besides emptying the two textboxes (handled by
        # ClearButton itself), blank the status bar and put BOTH counters
        # back to the zero label instead of leaving one stale and one empty.
        clear_btn.click(
            fn=lambda: ("", empty_count, empty_count),
            outputs=[status_bar, input_count, output_count],
        )
    return demo
+# ─────────────────────────────────────────────────────────────────────────────
+# 8. ENTRY POINT
+# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Script entry point: build the UI and start the Gradio server.
    #   server_name="0.0.0.0"  → listen on all interfaces
    #   server_port=7860       → fixed port
    #   share=False            → no public share link (set True in Colab)
    #   show_error=True        → surface handler errors in the UI
    # NOTE(review): the previous note here said HuggingFace Spaces launches
    # the app on its own and needs no explicit launch(). Spaces normally runs
    # the app file as a script, in which case this guard is what starts the
    # server there too — confirm before relying on either assumption.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo = build_ui()
    demo.launch(**launch_options)