Spaces:

don0726
/

Trans

Sleeping

App Files Files Community

Trans / app.py

don0726

Update app.py

7b2e3f4 verified about 2 months ago

Raw

History Blame Contribute Delete

2.8 kB

	import gradio as gr
	import re
	from transformers import MarianMTModel, MarianTokenizer

	# -------------------------
	# Fast Model
	# -------------------------
	# Identifier of the pretrained checkpoint passed to both `from_pretrained`
	# calls below, so the tokenizer and the model always come from the same one.
	# NOTE(review): the "en-hi" suffix and the UI title suggest English -> Hindi.
	MODEL_NAME = "Helsinki-NLP/opus-mt-en-hi"

	# Loaded once at import time and shared by every request as a module global.
	tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)

	# Loaded once at import time; used by `batch_translate` via `model.generate`.
	model = MarianMTModel.from_pretrained(MODEL_NAME)

	# -------------------------
	# Parse SRT
	# -------------------------
	def parse_srt(srt_text):
	    """Split SRT subtitle text into a list of cue dictionaries.

	    Args:
	        srt_text: Full contents of an ``.srt`` file as a string. Both
	            ``\\n`` and ``\\r\\n`` line endings are accepted.

	    Returns:
	        A list of dicts, one per cue, in file order, with the keys:
	        ``"index"`` (the cue number, as a string), ``"timestamp"`` (the
	        ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` line, verbatim) and ``"text"``
	        (the cue text with internal line breaks collapsed to spaces).
	        Returns an empty list when no cue is found.
	    """
	    # Normalise Windows/old-Mac line endings so the pattern only has to
	    # reason about "\n".
	    normalized = srt_text.replace("\r\n", "\n").replace("\r", "\n").strip()

	    timestamp = r"\d{2}:\d{2}:\d{2},\d{3}"
	    pattern = re.compile(
	        r"(\d+)[ \t]*\n"
	        rf"({timestamp}\s-->\s{timestamp})[ \t]*\n"
	        # A cue's text runs until the next "index line + timestamp line"
	        # pair, or the end of input. Requiring the timestamp after the
	        # index keeps a subtitle line that is just a number (e.g. "42")
	        # from being mistaken for the start of the next cue.
	        rf"(.*?)(?=\n\s*\d+[ \t]*\n{timestamp}\s-->|\Z)",
	        re.DOTALL,
	    )

	    return [
	        {
	            "index": idx,
	            "timestamp": stamp,
	            "text": text.replace("\n", " ").strip(),
	        }
	        for idx, stamp, text in pattern.findall(normalized)
	    ]


	# -------------------------
	# Shorten Hindi
	# -------------------------
	def shorten_text(text, max_len):
	    """Trim trailing words from ``text`` until it fits in ``max_len`` characters.

	    Args:
	        text: The string to shorten.
	        max_len: Maximum allowed length, in characters (``len(text)``).

	    Returns:
	        ``text`` unchanged if it already fits. Otherwise the longest prefix
	        of whole words, joined by single spaces, whose length does not
	        exceed ``max_len``. The first word is always kept, even when it
	        alone is longer than ``max_len``, so the result is never emptied.
	    """
	    if len(text) <= max_len:
	        return text

	    words = text.split()
	    if not words:
	        # Whitespace-only input longer than max_len: nothing to trim by word.
	        return text

	    # Single pass with a running length instead of re-joining the whole
	    # list after every dropped word.
	    kept = [words[0]]
	    length = len(words[0])
	    for word in words[1:]:
	        length += 1 + len(word)  # +1 for the joining space
	        if length > max_len:
	            break
	        kept.append(word)

	    return " ".join(kept)


	# -------------------------
	# Batch Translate
	# -------------------------
	def batch_translate(texts, batch_size=32):
	    """Translate a sequence of strings with the module-level model.

	    Args:
	        texts: Sequence of source-language strings.
	        batch_size: Maximum number of strings sent to the model in one
	            ``generate`` call. Chunking bounds the size of the padded
	            tensor, so a long subtitle file is not padded and decoded as
	            one huge batch.

	    Returns:
	        A list of translated strings, in the same order as ``texts``.
	        Returns ``[]`` for empty input without touching the model.

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.
	    """
	    texts = list(texts)
	    if not texts:
	        # Nothing to translate; skip the tokenizer/model round trip.
	        return []
	    if batch_size < 1:
	        raise ValueError("batch_size must be at least 1")

	    outputs = []
	    for start in range(0, len(texts), batch_size):
	        chunk = texts[start:start + batch_size]

	        inputs = tokenizer(
	            chunk,
	            return_tensors="pt",
	            padding=True,
	            truncation=True,
	        )

	        translated = model.generate(
	            **inputs,
	            max_new_tokens=64,
	        )

	        outputs.extend(
	            tokenizer.batch_decode(
	                translated,
	                skip_special_tokens=True,
	            )
	        )

	    return outputs


	# -------------------------
	# Main Function
	# -------------------------
	def translate_srt(srt_text):
	    """Translate the text of every cue in an SRT document, keeping its timing.

	    Args:
	        srt_text: Contents of an SRT file as a string.

	    Returns:
	        An SRT-formatted string with the original cue indices and
	        timestamps and the translated text. Each translation is shortened
	        by ``shorten_text`` to at most 130% of the source text's character
	        count. Returns an empty string when the input contains no
	        parseable cue.
	    """
	    subtitles = parse_srt(srt_text or "")
	    if not subtitles:
	        # Empty or unparseable input: do not invoke the model on nothing.
	        return ""

	    english_texts = [sub["text"] for sub in subtitles]

	    # One call for all cues; per-cue calls would be much slower.
	    hindi_texts = batch_translate(english_texts)

	    output = []
	    for sub, hindi in zip(subtitles, hindi_texts):
	        # 130% rule: cap the translation at 1.3x the source length.
	        max_hindi_len = int(len(sub["text"]) * 1.3)
	        hindi = shorten_text(hindi.strip(), max_hindi_len)

	        output.append(
	            f'{sub["index"]}\n'
	            f'{sub["timestamp"]}\n'
	            f'{hindi}\n'
	        )

	    # Each block already ends in "\n"; joining with "\n" yields the blank
	    # separator line between cues.
	    return "\n".join(output)


	# -------------------------
	# UI
	# -------------------------
	# Single-function Gradio UI: one multi-line textbox in, one out,
	# both wired to `translate_srt`.
	demo = gr.Interface(
	    title="Fast English → Hindi SRT Translator",
	    description="Ultra-fast subtitle translation with timestamp preservation and subtitle length control.",
	    fn=translate_srt,
	    inputs=gr.Textbox(label="English SRT", lines=20),
	    outputs=gr.Textbox(label="Hindi SRT", lines=20),
	)

	# Start the web server when the module is executed.
	demo.launch()