Spaces:

ShahbazAhmad-Lab
/

transformer

Runtime error

App Files Files Community

transformer / app.py

ShahbazAhmad-Lab

Create app.py

7ac1dc0 verified about 2 months ago

raw

history blame contribute delete

20.7 kB

	"""
	English-to-Urdu Neural Machine Translation App
	================================================
	Model : Helsinki-NLP/opus-mt-en-ur (MarianMT)
	UI : Gradio 4.x
	Deploy : HuggingFace Spaces | Google Colab

	DEPLOYMENT STEPS (HuggingFace Spaces)
	--------------------------------------
	1. Go to https://huggingface.co/new-space
	2. Name your space, choose "Gradio" as the SDK
	3. Upload: app.py, requirements.txt, README.md
	4. Space auto-builds and launches — no extra config needed
	5. Share the public URL from the "App" tab
	"""

	# ── Standard library ──────────────────────────────────────────────────────────
	import os
	import re
	import signal
	import unicodedata
	from pathlib import Path
	from typing import Optional

	# ── Third-party ───────────────────────────────────────────────────────────────
	import gradio as gr
	from transformers import MarianMTModel, MarianTokenizer, pipeline

	# ── Constants ─────────────────────────────────────────────────────────────────
	# Hugging Face Hub id of the checkpoint handed to from_pretrained() in load_model().
	MODEL_NAME: str = "Helsinki-NLP/opus-mt-en-ur"
	# Upper bound on the cleaned input length; run_translation() rejects longer text.
	MAX_CHARS: int = 500
	# Value passed to signal.alarm() by translate() before inference starts.
	TRANSLATION_TIMEOUT: int = 30 # seconds
	# Model cache location: $HF_HOME when set, otherwise ~/.cache/huggingface.
	CACHE_DIR: Path = Path(os.getenv("HF_HOME", Path.home() / ".cache" / "huggingface"))

	# ── Global model singleton ────────────────────────────────────────────────────
	# Filled in by load_model() on its first successful call and reused afterwards.
	_translator = None


	# ─────────────────────────────────────────────────────────────────────────────
	# 1. MODEL LOADING
	# ─────────────────────────────────────────────────────────────────────────────

	def load_model() -> object:
	    """
	    Return the process-wide English → Urdu translation pipeline.

	    The pipeline is built at most once. The first call loads the MarianMT
	    tokenizer and weights for MODEL_NAME (using CACHE_DIR as the cache
	    directory), wraps them in a HuggingFace "translation" pipeline pinned
	    to the CPU, and stores the result in the module-level ``_translator``.
	    Every later call returns that stored object.

	    Returns:
	        HuggingFace translation pipeline object.

	    Raises:
	        RuntimeError: If the tokenizer, the weights or the pipeline cannot
	            be created. The original exception is chained as the cause.
	    """
	    global _translator

	    if _translator is None:
	        try:
	            cache_location = str(CACHE_DIR)
	            marian_tokenizer = MarianTokenizer.from_pretrained(
	                MODEL_NAME, cache_dir=cache_location
	            )
	            marian_model = MarianMTModel.from_pretrained(
	                MODEL_NAME, cache_dir=cache_location
	            )
	            _translator = pipeline(
	                "translation",
	                model=marian_model,
	                tokenizer=marian_tokenizer,
	                device=-1,  # CPU only — no CUDA dependency
	            )
	        except Exception as exc:
	            raise RuntimeError(
	                f"Failed to load translation model '{MODEL_NAME}': {exc}"
	            ) from exc

	    return _translator


	# ─────────────────────────────────────────────────────────────────────────────
	# 2. PREPROCESSING
	# ─────────────────────────────────────────────────────────────────────────────

	def preprocess(text: str) -> str:
	    """
	    Clean and normalise raw English input before sending to the model.

	    Steps:
	        - Normalise unicode to NFC (composed form)
	        - Convert CRLF / lone CR line endings to a newline
	        - Remove characters in the Unicode "C" (control/format/...)
	          categories, except newline and tab
	        - Collapse runs of spaces/tabs into a single space
	        - Strip every line, then the text as a whole

	    Args:
	        text: Raw English string from the UI. ``None`` or an empty string
	            is accepted and yields an empty result.

	    Returns:
	        Cleaned, unicode-normalised string.
	    """
	    if not text:
	        return ""

	    # Unicode normalisation (NFC — composed form)
	    text = unicodedata.normalize("NFC", text)

	    # Unify line endings first: a carriage return is itself a control
	    # character, so a lone CR would otherwise be deleted below and the two
	    # lines it separated would be glued together.
	    text = text.replace("\r\n", "\n").replace("\r", "\n")

	    # Drop control characters but keep newline (sentence splitting) and tab.
	    # Tabs are category "Cc" too; removing them here would merge the words
	    # they separate before the collapse step below can turn them into a space.
	    text = "".join(
	        ch
	        for ch in text
	        if ch in "\n\t" or unicodedata.category(ch)[0] != "C"
	    )

	    # Collapse runs of spaces/tabs
	    text = re.sub(r"[ \t]+", " ", text)

	    # Trim each line, then the whole text
	    return "\n".join(line.strip() for line in text.splitlines()).strip()


	# ─────────────────────────────────────────────────────────────────────────────
	# 3. SENTENCE SPLITTING
	# ─────────────────────────────────────────────────────────────────────────────

	def split_into_sentences(text: str) -> list[str]:
	    """
	    Split a paragraph into individual sentences for batch translation.

	    A split happens at whitespace that directly follows '.', '?' or '!'
	    (the punctuation stays attached to the sentence it ends) and at any
	    run of newlines. The rule is purely textual, so an abbreviation such
	    as "Dr. Smith" is also split after the full stop.

	    Args:
	        text: Preprocessed English paragraph.

	    Returns:
	        List of non-empty, stripped sentence strings (empty for empty input).
	    """
	    # Two alternatives joined by a regex "|":
	    #   (?<=[.?!])\s+  whitespace preceded by sentence-ending punctuation
	    #   \n+            one or more newlines
	    # The pipe must be unescaped; "\|" would match a literal pipe character
	    # and the pattern would never split ordinary text.
	    pieces = re.split(r"(?<=[.?!])\s+|\n+", text)
	    return [piece for piece in map(str.strip, pieces) if piece]


	# ─────────────────────────────────────────────────────────────────────────────
	# 4. CORE TRANSLATION
	# ─────────────────────────────────────────────────────────────────────────────

	def _timeout_handler(signum: int, frame) -> None:
	    """
	    Signal callback that translate() installs for SIGALRM.

	    Args:
	        signum: Signal number supplied by the signal machinery (unused).
	        frame: Interrupted stack frame supplied by the signal machinery (unused).

	    Raises:
	        TimeoutError: Always; the message names TRANSLATION_TIMEOUT.
	    """
	    message = f"Translation timed out after {TRANSLATION_TIMEOUT} seconds."
	    raise TimeoutError(message)


	def translate(text: str) -> str:
	    """
	    Translate preprocessed English text to Urdu using MarianMT.

	    The text is split into sentences, the whole list is passed to the
	    translation pipeline in one call, and the translated sentences are
	    rejoined with single spaces.

	    A SIGALRM-based timeout guard aborts calls that exceed
	    TRANSLATION_TIMEOUT seconds. The guard is best-effort: it is skipped
	    when SIGALRM does not exist (Windows) and when the caller is not the
	    main thread, because signal.signal() refuses to install handlers
	    anywhere else.

	    Args:
	        text: Preprocessed English string (output of preprocess()).

	    Returns:
	        Raw Urdu translation string (before postprocessing).

	    Raises:
	        ValueError: If input text is empty.
	        TimeoutError: If translation exceeds TRANSLATION_TIMEOUT seconds.
	        RuntimeError: If model inference fails.
	    """
	    if not text.strip():
	        raise ValueError("Input text is empty. Please enter some English text.")

	    translator = load_model()
	    sentences = split_into_sentences(text)

	    # Arm the timeout where possible.
	    #   AttributeError → platform has no SIGALRM / alarm() (Windows)
	    #   ValueError     → not running in the main thread (e.g. a UI worker
	    #                    thread); must be caught here, otherwise it would
	    #                    surface to the caller as an "empty input" style error
	    #   OSError        → the OS rejected the request
	    previous_handler = None
	    timeout_armed = False
	    try:
	        previous_handler = signal.signal(signal.SIGALRM, _timeout_handler)
	        signal.alarm(TRANSLATION_TIMEOUT)
	        timeout_armed = True
	    except (AttributeError, ValueError, OSError):
	        pass  # No timeout guard available in this context

	    try:
	        results = translator(sentences, max_length=512)
	    except TimeoutError:
	        raise
	    except Exception as exc:
	        raise RuntimeError(f"Model inference failed: {exc}") from exc
	    finally:
	        if timeout_armed:
	            signal.alarm(0)  # Disarm alarm
	            # Put back whatever handler was installed before this call.
	            # signal.signal() returns None for handlers not set from Python,
	            # which cannot be re-installed, so that case is left alone.
	            if previous_handler is not None:
	                signal.signal(signal.SIGALRM, previous_handler)

	    translated_sentences = [r["translation_text"] for r in results]
	    return " ".join(translated_sentences)


	# ─────────────────────────────────────────────────────────────────────────────
	# 5. POSTPROCESSING
	# ─────────────────────────────────────────────────────────────────────────────

	def postprocess(urdu_text: str) -> str:
	    """
	    Format the raw Urdu translation for correct RTL display.

	    Steps:
	        - Strip leading/trailing whitespace
	        - Prefix a Unicode RLM (Right-to-Left Mark, U+200F) unless one is
	          already present, so RTL rendering is forced in environments
	          that don't auto-detect Urdu script

	    Args:
	        urdu_text: Raw Urdu string from the translation model.

	    Returns:
	        RTL-formatted Urdu string ready for the Gradio output box, or an
	        empty string when the input is empty, ``None`` or whitespace only.
	    """
	    stripped = urdu_text.strip() if urdu_text else ""
	    if not stripped:
	        # Nothing visible to show: return "" rather than a lone, invisible
	        # RLM character that would look empty but count as content.
	        return ""

	    # Insert RLM marker so RTL is enforced even in LTR containers
	    rlm = "\u200F"
	    return stripped if stripped.startswith(rlm) else rlm + stripped


	# ─────────────────────────────────────────────────────────────────────────────
	# 6. ORCHESTRATION — full pipeline
	# ─────────────────────────────────────────────────────────────────────────────

	def run_translation(input_text: str) -> tuple[str, str]:
	    """
	    Run the whole pipeline for one UI request: preprocess → translate → postprocess.

	    This is the callback wired to the Gradio interface. It never raises;
	    every failure is converted into a status message.

	    Args:
	        input_text: Raw English text from the UI textbox.

	    Returns:
	        Tuple of (urdu_output, status_message). On success the status
	        reports the English and Urdu word counts. On empty input, input
	        longer than MAX_CHARS, or any error, urdu_output is "" and the
	        status message explains why.
	    """
	    try:
	        english = preprocess(input_text)
	        if not english:
	            return "", "⚠️ Please enter some English text before translating."

	        if len(english) > MAX_CHARS:
	            too_long = (
	                f"⚠️ Input exceeds {MAX_CHARS} characters "
	                f"({len(english)} chars). Please shorten your text."
	            )
	            return "", too_long

	        urdu = postprocess(translate(english))
	        english_words = len(english.split())
	        urdu_words = len(urdu.split())
	        return urdu, (
	            f"✅ Translation complete — "
	            f"{english_words} English words → {urdu_words} Urdu words."
	        )
	    except Exception as err:
	        # Checked in the same order as separate except clauses would be:
	        # validation problems, then timeouts, then runtime failures, and
	        # finally anything unforeseen.
	        if isinstance(err, ValueError):
	            prefix = "⚠️ "
	        elif isinstance(err, TimeoutError):
	            prefix = "⏱️ "
	        elif isinstance(err, RuntimeError):
	            prefix = "❌ "
	        else:
	            prefix = "❌ Unexpected error: "
	        return "", f"{prefix}{err}"


	def get_word_count(text: str) -> str:
	    """
	    Build the word/character counter label shown under a text box.

	    Words are whitespace-separated tokens; characters are counted with
	    ``len``. When the character count exceeds MAX_CHARS a warning naming
	    the limit is appended.

	    Args:
	        text: Any string (English input or Urdu output); empty or ``None``
	            gives the zero label.

	    Returns:
	        Human-readable word/char count label.
	    """
	    if text:
	        word_total = len(text.split())
	        char_total = len(text)
	        suffix = ""
	        if char_total > MAX_CHARS:
	            suffix = f" ⚠️ limit is {MAX_CHARS}"
	        return f"{word_total} words · {char_total} chars{suffix}"

	    return "0 words · 0 chars"


	# ─────────────────────────────────────────────────────────────────────────────
	# 7. GRADIO UI
	# ─────────────────────────────────────────────────────────────────────────────

	# Sample English inputs offered by the examples widget. The widget feeds a
	# single input component, so every row is a one-element list.
	_EXAMPLE_SENTENCES = (
	    "Artificial intelligence is transforming the world rapidly.",
	    "Pakistan is a beautiful country with rich culture and history.",
	    "The patient needs immediate medical attention and care.",
	    "Education is the most powerful weapon to change the world.",
	    "Good morning! How are you feeling today?",
	    (
	        "Machine learning models require large datasets for training. "
	        "The quality of data directly affects model performance."
	    ),
	)
	EXAMPLES: list[list[str]] = [[sentence] for sentence in _EXAMPLE_SENTENCES]

	# Stylesheet handed to gr.Blocks(css=...). The selectors match the elem_id /
	# elem_classes values assigned to components in build_ui().
	#
	# The Urdu box sets the base direction to RTL but leaves the bidi algorithm
	# on ("embed"). "bidi-override" would lay out every character strictly
	# right-to-left, so digits and Latin words in a translation would appear
	# reversed.
	CUSTOM_CSS: str = """
	/* ── Urdu output — force RTL ── */
	#urdu-output textarea {
	direction: rtl !important;
	text-align: right !important;
	font-family: 'Noto Nastaliq Urdu', 'Jameel Noori Nastaleeq',
	'Urdu Typesetting', 'Segoe UI', sans-serif !important;
	font-size: 18px !important;
	line-height: 2.2 !important;
	unicode-bidi: embed;
	}

	/* ── Status bar ── */
	#status-bar {
	font-size: 13px;
	color: #555;
	padding: 6px 10px;
	border-radius: 6px;
	background: #f8f9fa;
	min-height: 34px;
	}

	/* ── Word count labels ── */
	.count-label {
	font-size: 12px;
	color: #888;
	text-align: right;
	padding: 2px 4px;
	}

	/* ── Translate button accent ── */
	#translate-btn {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
	color: white !important;
	font-weight: 600 !important;
	border: none !important;
	}
	#translate-btn:hover {
	opacity: 0.92 !important;
	transform: translateY(-1px);
	}
	"""


	def build_ui() -> gr.Blocks:
	    """
	    Construct and return the Gradio Blocks UI.

	    Layout:
	        - Header with app title and description
	        - Two-column panel: English input (left) | Urdu output (right)
	        - Live word/char counters below each panel
	        - Action buttons: Translate · Clear · (Copy handled natively by Gradio)
	        - Status bar showing result metadata or error messages
	        - Example inputs at the bottom

	    Wiring:
	        - Both text boxes refresh their counter through get_word_count()
	        - The Translate button and the input box's submit event both call
	          run_translation()
	        - Clear empties both text boxes and resets the status bar and
	          both counters to their initial state

	    Returns:
	        Configured gr.Blocks instance (not yet launched).
	    """
	    # Label shown by a counter when its text box is empty. Taken from
	    # get_word_count() so the initial and the cleared state always agree.
	    empty_count = get_word_count("")

	    theme = gr.themes.Soft(
	        primary_hue="violet",
	        secondary_hue="purple",
	        neutral_hue="slate",
	        font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "sans-serif"],
	    )

	    with gr.Blocks(
	        theme=theme,
	        css=CUSTOM_CSS,
	        title="English → Urdu Translator",
	    ) as demo:

	        # ── Header ────────────────────────────────────────────────────────────
	        gr.HTML("""
	<div style="text-align:center; padding: 24px 0 8px;">
	<h1 style="font-size:2rem; font-weight:700; margin:0;">
	🌐 English → Urdu Translator
	</h1>
	<p style="color:#666; margin-top:8px; font-size:15px;">
	Neural Machine Translation · Helsinki-NLP/opus-mt-en-ur · MarianMT
	</p>
	</div>
	""")

	        # ── Main panels ───────────────────────────────────────────────────────
	        with gr.Row(equal_height=True):
	            with gr.Column():
	                gr.Markdown("#### English Input")
	                input_box = gr.Textbox(
	                    label="",
	                    # Derived from MAX_CHARS so the hint cannot drift from
	                    # the limit that run_translation() enforces.
	                    placeholder=(
	                        "Type or paste English text here… "
	                        f"(max {MAX_CHARS} characters)"
	                    ),
	                    lines=10,
	                    max_lines=20,
	                    show_copy_button=True,
	                    elem_id="english-input",
	                )
	                input_count = gr.Markdown(
	                    value=empty_count,
	                    elem_classes=["count-label"],
	                )

	            with gr.Column():
	                gr.Markdown("#### Urdu Output (اردو)")
	                output_box = gr.Textbox(
	                    label="",
	                    placeholder="ترجمہ یہاں ظاہر ہوگا…",
	                    lines=10,
	                    max_lines=20,
	                    interactive=False,
	                    show_copy_button=True,
	                    elem_id="urdu-output",
	                )
	                output_count = gr.Markdown(
	                    value=empty_count,
	                    elem_classes=["count-label"],
	                )

	        # ── Buttons ───────────────────────────────────────────────────────────
	        with gr.Row():
	            translate_btn = gr.Button(
	                "🔄 Translate",
	                variant="primary",
	                scale=3,
	                elem_id="translate-btn",
	            )
	            clear_btn = gr.ClearButton(
	                components=[input_box, output_box],
	                value="🗑 Clear",
	                scale=1,
	            )

	        # ── Status bar ────────────────────────────────────────────────────────
	        status_bar = gr.Markdown(
	            value="",
	            elem_id="status-bar",
	        )

	        # ── Examples ─────────────────────────────────────────────────────────
	        gr.Examples(
	            examples=EXAMPLES,
	            inputs=input_box,
	            label="📋 Example Inputs — click to load",
	            examples_per_page=6,
	        )

	        # ── Footer ────────────────────────────────────────────────────────────
	        gr.HTML("""
	<div style="text-align:center; padding:16px 0 4px; color:#aaa; font-size:12px;">
	Powered by
	<a href="https://huggingface.co/Helsinki-NLP/opus-mt-en-ur"
	target="_blank" style="color:#764ba2;">Helsinki-NLP/opus-mt-en-ur</a>
	· Built with
	<a href="https://gradio.app" target="_blank" style="color:#764ba2;">Gradio 4</a>
	</div>
	""")

	        # ── Wiring ────────────────────────────────────────────────────────────

	        # Live word counter for input
	        input_box.change(
	            fn=get_word_count,
	            inputs=input_box,
	            outputs=input_count,
	        )

	        # Live word counter for output
	        output_box.change(
	            fn=get_word_count,
	            inputs=output_box,
	            outputs=output_count,
	        )

	        # Translate button
	        translate_btn.click(
	            fn=run_translation,
	            inputs=input_box,
	            outputs=[output_box, status_bar],
	            api_name="translate",
	        )

	        # Keyboard submission. NOTE(review): for a multi-line Textbox like
	        # this one Gradio is understood to submit on Shift+Enter, with plain
	        # Enter inserting a newline — confirm against the installed version.
	        input_box.submit(
	            fn=run_translation,
	            inputs=input_box,
	            outputs=[output_box, status_bar],
	        )

	        # Clear: the ClearButton is configured with the two text boxes only,
	        # so the status bar and both counters are reset here. The counters
	        # go back to the "empty" label rather than to a blank string.
	        clear_btn.click(
	            fn=lambda: ("", empty_count, empty_count),
	            outputs=[status_bar, input_count, output_count],
	        )

	    return demo


	# ─────────────────────────────────────────────────────────────────────────────
	# 8. ENTRY POINT
	# ─────────────────────────────────────────────────────────────────────────────

	if __name__ == "__main__":
	    # Script entry point: build the interface and start the Gradio server.
	    #
	    # - server_name="0.0.0.0" → accessible on local network
	    # - server_port=7860      → fixed port the server binds to
	    # - share=False           → set True in Colab (see colab_run.py)
	    # - show_error=True       → passed through to Gradio's launch()
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	        "show_error": True,
	    }
	    demo = build_ui()
	    demo.launch(**launch_options)