ShahbazAhmad-Lab commited on
Commit
7ac1dc0
·
verified ·
1 Parent(s): e4c02e8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +529 -0
app.py ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ English-to-Urdu Neural Machine Translation App
3
+ ================================================
4
+ Model : Helsinki-NLP/opus-mt-en-ur (MarianMT)
5
+ UI : Gradio 4.x
6
+ Deploy : HuggingFace Spaces | Google Colab
7
+
8
+ DEPLOYMENT STEPS (HuggingFace Spaces)
9
+ --------------------------------------
10
+ 1. Go to https://huggingface.co/new-space
11
+ 2. Name your space, choose "Gradio" as the SDK
12
+ 3. Upload: app.py, requirements.txt, README.md
13
+ 4. Space auto-builds and launches — no extra config needed
14
+ 5. Share the public URL from the "App" tab
15
+ """
16
+
17
+ # ── Standard library ──────────────────────────────────────────────────────────
18
+ import os
19
+ import re
20
+ import signal
21
+ import unicodedata
22
+ from pathlib import Path
23
+ from typing import Optional
24
+
25
+ # ── Third-party ───────────────────────────────────────────────────────────────
26
+ import gradio as gr
27
+ from transformers import MarianMTModel, MarianTokenizer, pipeline
28
+
29
+ # ── Constants ─────────────────────────────────────────────────────────────────
30
# Hub identifier of the MarianMT English -> Urdu checkpoint.
MODEL_NAME: str = "Helsinki-NLP/opus-mt-en-ur"

# Longest cleaned input (in characters) that run_translation() accepts.
MAX_CHARS: int = 500

# Upper bound, in seconds, handed to signal.alarm() around model inference.
TRANSLATION_TIMEOUT: int = 30

# Directory passed as `cache_dir` when loading the tokenizer and model:
# $HF_HOME when that variable is set, otherwise ~/.cache/huggingface.
CACHE_DIR: Path = Path(
    os.environ.get("HF_HOME", Path.home() / ".cache" / "huggingface")
)

# Process-wide translation pipeline; stays None until load_model() fills it.
_translator = None
37
+
38
+
39
+ # ─────────────────────────────────────────────────────────────────────────────
40
+ # 1. MODEL LOADING
41
+ # ─────────────────────────────────────────────────────────────────────────────
42
+
43
def load_model() -> object:
    """
    Return the shared English -> Urdu translation pipeline, building it on
    first use.

    The pipeline is stored in the module-level ``_translator`` so that the
    tokenizer and model are only instantiated once per process. Both are
    loaded with ``cache_dir=str(CACHE_DIR)`` and the pipeline is pinned to
    the CPU (``device=-1``).

    Returns:
        The HuggingFace ``translation`` pipeline object.

    Raises:
        RuntimeError: If the tokenizer, model, or pipeline cannot be
            created. The original exception is chained as the cause.
            No retry is attempted.
    """
    global _translator

    if _translator is None:
        try:
            cache_location = str(CACHE_DIR)
            marian_tokenizer = MarianTokenizer.from_pretrained(
                MODEL_NAME, cache_dir=cache_location
            )
            marian_model = MarianMTModel.from_pretrained(
                MODEL_NAME, cache_dir=cache_location
            )
            # device=-1 keeps inference on the CPU, so no CUDA dependency.
            _translator = pipeline(
                "translation",
                model=marian_model,
                tokenizer=marian_tokenizer,
                device=-1,
            )
        except Exception as exc:
            raise RuntimeError(
                f"Failed to load translation model '{MODEL_NAME}': {exc}"
            ) from exc

    return _translator
78
+
79
+
80
+ # ─────────────────────────────────────────────────────────────────────────────
81
+ # 2. PREPROCESSING
82
+ # ─────────────────────────────────────────────────────────────────────────────
83
+
84
def preprocess(text: str) -> str:
    """
    Clean and normalise raw English input before sending it to the model.

    Steps, in order:
        1. Normalise unicode to NFC (composed form).
        2. Normalise line endings: ``\\r\\n`` and bare ``\\r`` become ``\\n``.
        3. Replace tab, vertical tab and form feed with a space. These are
           control characters (category ``Cc``), so they must be converted
           *before* step 4 or adjacent words would be glued together.
        4. Drop every remaining character whose unicode category starts
           with ``C`` (control, format, surrogate, private use,
           unassigned), except ``\\n``.
        5. Collapse runs of spaces into a single space.
        6. Trim each line, then trim the whole text.

    Args:
        text: Raw English string from the UI. Falsy values (``""`` or
            ``None``) yield ``""``.

    Returns:
        Cleaned, unicode-normalised string, possibly empty.
    """
    if not text:
        return ""

    text = unicodedata.normalize("NFC", text)

    # Unify line endings first so that a lone '\r' still separates lines.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Whitespace-like control characters become a plain space instead of
    # being deleted by the control-character filter below.
    text = re.sub(r"[\t\v\f]", " ", text)

    # Remove non-printable characters (keep '\n' for sentence splitting).
    text = "".join(
        ch for ch in text if ch == "\n" or unicodedata.category(ch)[0] != "C"
    )

    # Collapse runs of spaces/tabs.
    text = re.sub(r"[ \t]+", " ", text)

    # Trim each line, then the text as a whole.
    lines = [line.strip() for line in text.splitlines()]
    return "\n".join(lines).strip()
117
+
118
+
119
+ # ─────────────────────────────────────────────────────────────────────────────
120
+ # 3. SENTENCE SPLITTING
121
+ # ─────────────────────────────────────────────────────────────────────────────
122
+
123
def split_into_sentences(text: str) -> list[str]:
    """
    Break a paragraph into sentences so they can be translated as a batch.

    A boundary is either whitespace that directly follows '.', '?' or '!'
    (the punctuation stays attached to the sentence before it) or a run of
    one or more newlines.

    Args:
        text: Preprocessed English paragraph.

    Returns:
        The sentences in their original order, each stripped of surrounding
        whitespace; empty pieces are discarded.
    """
    # The look-behind keeps the closing punctuation with its sentence.
    boundary = r"(?<=[.?!])\s+|\n+"

    sentences: list[str] = []
    for piece in re.split(boundary, text):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
139
+
140
+
141
+ # ─────────────────────────────────────────────────────────────────────────────
142
+ # 4. CORE TRANSLATION
143
+ # ─────────────────────────────────────────────────────────────────────────────
144
+
145
def _timeout_handler(signum: int, frame) -> None:
    """SIGALRM handler: raise TimeoutError when translation exceeds the limit."""
    raise TimeoutError(f"Translation timed out after {TRANSLATION_TIMEOUT} seconds.")


def translate(text: str) -> str:
    """
    Translate preprocessed English text to Urdu using MarianMT.

    The text is split into sentences, the whole list is passed to the
    translation pipeline in one call, and the translated sentences are
    rejoined with single spaces.

    A SIGALRM-based timeout guard aborts inference that runs longer than
    TRANSLATION_TIMEOUT seconds. The guard is best-effort: it is silently
    skipped where SIGALRM does not exist (Windows) and when this function
    is called from a thread other than the main thread, because
    ``signal.signal`` raises ``ValueError`` there. When the guard is
    installed, the previous SIGALRM handler is restored afterwards.

    Args:
        text: Preprocessed English string (output of preprocess()).

    Returns:
        Raw Urdu translation string (before postprocessing).

    Raises:
        ValueError: If input text is empty or whitespace-only.
        TimeoutError: If translation exceeds TRANSLATION_TIMEOUT seconds.
        RuntimeError: If loading the model or running inference fails.
    """
    if not text.strip():
        raise ValueError("Input text is empty. Please enter some English text.")

    translator = load_model()
    sentences = split_into_sentences(text)

    # Arm the timeout. AttributeError: platform has no SIGALRM (Windows).
    # ValueError: not running in the main thread. OSError: the OS refused.
    previous_handler = None
    guard_installed = False
    try:
        previous_handler = signal.signal(signal.SIGALRM, _timeout_handler)
        guard_installed = True
        signal.alarm(TRANSLATION_TIMEOUT)
    except (AttributeError, OSError, ValueError):
        pass  # No timeout guard available here; translate without one.

    try:
        results = translator(sentences, max_length=512)
    except TimeoutError:
        # Raised by _timeout_handler; must not be wrapped as RuntimeError.
        raise
    except Exception as exc:
        raise RuntimeError(f"Model inference failed: {exc}") from exc
    finally:
        if guard_installed:
            signal.alarm(0)  # Disarm any pending alarm.
            # signal.signal() returns None when the earlier handler was not
            # installed from Python; fall back to the default action then.
            signal.signal(
                signal.SIGALRM,
                previous_handler if previous_handler is not None else signal.SIG_DFL,
            )

    translated_sentences = [r["translation_text"] for r in results]
    return " ".join(translated_sentences)
197
+
198
+
199
+ # ─────────────────────────────────────────────────────────────────────────────
200
+ # 5. POSTPROCESSING
201
+ # ─────────────────────────────────────────────────────────────────────────────
202
+
203
def postprocess(urdu_text: str) -> str:
    """
    Format the raw Urdu translation for correct RTL display.

    Steps:
        - Strip surrounding whitespace.
        - Prefix a Unicode RLM (Right-to-Left Mark, U+200F) unless the text
          already starts with one, so RTL rendering is enforced even in
          LTR containers.

    Args:
        urdu_text: Raw Urdu string from the translation model.

    Returns:
        RTL-formatted Urdu string ready for the Gradio output box, or ""
        when the input is empty or contains only whitespace.
    """
    if not urdu_text:
        return ""

    text = urdu_text.strip()
    if not text:
        # Whitespace-only input: return an empty string rather than a lone,
        # invisible RLM that callers would treat as non-empty output.
        return ""

    rlm = "\u200F"
    return text if text.startswith(rlm) else rlm + text
230
+
231
+
232
+ # ─────────────────────────────────────────────────────────────────────────────
233
+ # 6. ORCHESTRATION β€” full pipeline
234
+ # ─────────────────────────────────────────────────────────────────────────────
235
+
236
def run_translation(input_text: str) -> tuple[str, str]:
    """
    Full end-to-end translation pipeline: preprocess → translate → postprocess.

    This is the function wired to the Gradio interface. It never raises:
    every failure is converted into a status message.

    Args:
        input_text: Raw English text from the UI textbox.

    Returns:
        Tuple of (urdu_output, status_message).
        On success, status_message reports the English and Urdu word
        counts. On error (empty input, input longer than MAX_CHARS after
        cleaning, timeout, model failure), urdu_output is "" and
        status_message describes the problem.
    """
    try:
        cleaned = preprocess(input_text)
        if not cleaned:
            return "", "⚠️ Please enter some English text before translating."

        # The limit applies to the cleaned text, not the raw textbox value.
        if len(cleaned) > MAX_CHARS:
            return "", (
                f"⚠️ Input exceeds {MAX_CHARS} characters "
                f"({len(cleaned)} chars). Please shorten your text."
            )

        raw_urdu = translate(cleaned)
        formatted_urdu = postprocess(raw_urdu)
        word_count_in = len(cleaned.split())
        word_count_out = len(formatted_urdu.split())
        status = (
            f"✅ Translation complete — "
            f"{word_count_in} English words → {word_count_out} Urdu words."
        )
        return formatted_urdu, status

    except ValueError as e:
        return "", f"⚠️ {e}"
    except TimeoutError as e:
        return "", f"⏱️ {e}"
    except RuntimeError as e:
        return "", f"❌ {e}"
    except Exception as e:
        # Last-resort boundary so the UI always receives a status message.
        return "", f"❌ Unexpected error: {e}"
278
+
279
+
280
def get_word_count(text: str) -> str:
    """
    Return a live word/character count label for a given text.

    Words are whitespace-separated tokens; characters are counted on the
    text exactly as given (no cleaning). When the character count exceeds
    MAX_CHARS, a warning naming the limit is appended.

    Args:
        text: Any string (English input or Urdu output). Falsy values
            yield the zero label.

    Returns:
        Human-readable label such as "3 words · 17 chars".
    """
    if not text:
        return "0 words · 0 chars"
    words = len(text.split())
    chars = len(text)
    warn = f" ⚠️ limit is {MAX_CHARS}" if chars > MAX_CHARS else ""
    return f"{words} words · {chars} chars{warn}"
296
+
297
+
298
+ # ─────────────────────────────────────────────────────────────────────────────
299
+ # 7. GRADIO UI
300
+ # ─────────────────────────────────────────────────────────────────────────────
301
+
302
# Sample inputs offered by the gr.Examples widget; each row holds the single
# value that is loaded into the English input box.
EXAMPLES: list[list[str]] = [
    ["Artificial intelligence is transforming the world rapidly."],
    ["Pakistan is a beautiful country with rich culture and history."],
    ["The patient needs immediate medical attention and care."],
    ["Education is the most powerful weapon to change the world."],
    ["Good morning! How are you feeling today?"],
    [
        "Machine learning models require large datasets for training. "
        "The quality of data directly affects model performance."
    ],
]

# Stylesheet handed to gr.Blocks(css=...). Selectors match the elem_id /
# elem_classes values assigned in build_ui().
CUSTOM_CSS: str = """
/* ── Urdu output — force RTL ── */
#urdu-output textarea {
    direction: rtl !important;
    text-align: right !important;
    font-family: 'Noto Nastaliq Urdu', 'Jameel Noori Nastaleeq',
                 'Urdu Typesetting', 'Segoe UI', sans-serif !important;
    font-size: 18px !important;
    line-height: 2.2 !important;
    /* 'embed' keeps the bidi algorithm active, so digits and Latin words
       inside the Urdu text keep their own left-to-right order.
       'bidi-override' would lay every character out right-to-left and
       render e.g. "2024" as "4202". */
    unicode-bidi: embed;
}

/* ── Status bar ── */
#status-bar {
    font-size: 13px;
    color: #555;
    padding: 6px 10px;
    border-radius: 6px;
    background: #f8f9fa;
    min-height: 34px;
}

/* ── Word count labels ── */
.count-label {
    font-size: 12px;
    color: #888;
    text-align: right;
    padding: 2px 4px;
}

/* ── Translate button accent ── */
#translate-btn {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    font-weight: 600 !important;
    border: none !important;
}
#translate-btn:hover {
    opacity: 0.92 !important;
    transform: translateY(-1px);
}
"""
356
+
357
+
358
def build_ui() -> gr.Blocks:
    """
    Construct and return the Gradio Blocks UI.

    Layout:
        - Header with app title and description
        - Two-column panel: English input (left) | Urdu output (right)
        - Live word/char counters below each panel
        - Action buttons: Translate · Clear (copy buttons are provided by
          the textboxes via show_copy_button)
        - Status bar showing result metadata or error messages
        - Example inputs at the bottom

    Returns:
        Configured gr.Blocks instance (not yet launched).
    """
    theme = gr.themes.Soft(
        primary_hue="violet",
        secondary_hue="purple",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "sans-serif"],
    )

    # Use the same label get_word_count() produces for empty text, so the
    # initial state, the live counters and the cleared state always match.
    empty_count_label = get_word_count("")

    with gr.Blocks(
        theme=theme,
        css=CUSTOM_CSS,
        title="English → Urdu Translator",
    ) as demo:

        # ── Header ───────────────────────────────────────────────────────
        gr.HTML("""
        <div style="text-align:center; padding: 24px 0 8px;">
            <h1 style="font-size:2rem; font-weight:700; margin:0;">
                🌐 English → Urdu Translator
            </h1>
            <p style="color:#666; margin-top:8px; font-size:15px;">
                Neural Machine Translation · Helsinki-NLP/opus-mt-en-ur · MarianMT
            </p>
        </div>
        """)

        # ── Main panels ──────────────────────────────────────────────────
        with gr.Row(equal_height=True):
            with gr.Column():
                gr.Markdown("#### English Input")
                input_box = gr.Textbox(
                    label="",
                    # Derive the advertised limit from MAX_CHARS so the hint
                    # cannot drift from the limit run_translation() enforces.
                    placeholder=(
                        "Type or paste English text here… "
                        f"(max {MAX_CHARS} characters)"
                    ),
                    lines=10,
                    max_lines=20,
                    show_copy_button=True,
                    elem_id="english-input",
                )
                input_count = gr.Markdown(
                    value=empty_count_label,
                    elem_classes=["count-label"],
                )

            with gr.Column():
                gr.Markdown("#### Urdu Output (اردو)")
                output_box = gr.Textbox(
                    label="",
                    # Urdu for "The translation will appear here…"
                    placeholder="ترجمہ یہاں ظاہر ہوگا…",
                    lines=10,
                    max_lines=20,
                    interactive=False,
                    show_copy_button=True,
                    elem_id="urdu-output",
                )
                output_count = gr.Markdown(
                    value=empty_count_label,
                    elem_classes=["count-label"],
                )

        # ── Buttons ──────────────────────────────────────────────────────
        with gr.Row():
            translate_btn = gr.Button(
                "🔄 Translate",
                variant="primary",
                scale=3,
                elem_id="translate-btn",
            )
            clear_btn = gr.ClearButton(
                components=[input_box, output_box],
                value="🗑 Clear",
                scale=1,
            )

        # ── Status bar ───────────────────────────────────────────────────
        status_bar = gr.Markdown(
            value="",
            elem_id="status-bar",
        )

        # ── Examples ─────────────────────────────────────────────────────
        gr.Examples(
            examples=EXAMPLES,
            inputs=input_box,
            label="📋 Example Inputs — click to load",
            examples_per_page=6,
        )

        # ── Footer ───────────────────────────────────────────────────────
        gr.HTML("""
        <div style="text-align:center; padding:16px 0 4px; color:#aaa; font-size:12px;">
            Powered by
            <a href="https://huggingface.co/Helsinki-NLP/opus-mt-en-ur"
               target="_blank" style="color:#764ba2;">Helsinki-NLP/opus-mt-en-ur</a>
            · Built with
            <a href="https://gradio.app" target="_blank" style="color:#764ba2;">Gradio 4</a>
        </div>
        """)

        # ── Wiring ───────────────────────────────────────────────────────

        # Live word counter for input
        input_box.change(
            fn=get_word_count,
            inputs=input_box,
            outputs=input_count,
        )

        # Live word counter for output
        output_box.change(
            fn=get_word_count,
            inputs=output_box,
            outputs=output_count,
        )

        # Translate button
        translate_btn.click(
            fn=run_translation,
            inputs=input_box,
            outputs=[output_box, status_bar],
            api_name="translate",
        )

        # Keyboard submission. NOTE(review): for a multi-line Textbox,
        # Gradio documents Shift+Enter as the submit shortcut and plain
        # Enter as a newline; confirm against the deployed version.
        input_box.submit(
            fn=run_translation,
            inputs=input_box,
            outputs=[output_box, status_bar],
        )

        # On Clear: blank the status bar and put both counters back to the
        # zero label instead of leaving an empty counter.
        clear_btn.click(
            fn=lambda: ("", empty_count_label, empty_count_label),
            outputs=[status_bar, input_count, output_count],
        )

    return demo
508
+
509
+
510
+ # ─────────────────────────────────────────────────────────────────────────────
511
+ # 8. ENTRY POINT
512
+ # ─────────────────────────────────────────────────────────────────────────────
513
+
514
if __name__ == "__main__":
    # Script entry point: build the interface and start the Gradio server.
    #
    #   server_name="0.0.0.0"  listen on all network interfaces, so the app
    #                          is reachable from the local network
    #   server_port=7860       fixed port the server listens on
    #   share=False            no public share link; the original notes say
    #                          to set this to True in Colab (colab_run.py)
    #   show_error=True        passed through to Gradio's launch()
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo = build_ui()
    demo.launch(**launch_options)