# Trans / app.py
# don0726's picture
# Update app.py
# 7b2e3f4 verified
# Raw
# History Blame Contribute Delete
# 2.8 kB
import gradio as gr
import re
from transformers import MarianMTModel, MarianTokenizer
# -------------------------
# Fast Model
# -------------------------
# Checkpoint identifier handed to both from_pretrained() calls below.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-hi"
# The tokenizer and model are created once, at import time, and are used as
# module-level globals by batch_translate().
# NOTE(review): from_pretrained presumably downloads/caches the weights on the
# first run, so importing this module can be slow -- confirm for deployment.
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)
# -------------------------
# Parse SRT
# -------------------------
# One SRT timing line, e.g. "00:00:01,000 --> 00:00:02,500".
_SRT_TIMESTAMP = r"\d{2}:\d{2}:\d{2},\d{3}\s-->\s\d{2}:\d{2}:\d{2},\d{3}"

# One cue: index line, timing line, then text up to the next cue header.
# The lookahead requires an index line *followed by a timing line*, so a
# subtitle line that merely consists of digits does not end the cue early.
_SRT_CUE = re.compile(
    r"^[ \t]*(\d+)[ \t]*\n"
    r"[ \t]*(" + _SRT_TIMESTAMP + r")[ \t]*(?:\n|\Z)"
    r"(.*?)"
    r"(?=\n[ \t]*\d+[ \t]*\n[ \t]*" + _SRT_TIMESTAMP + r"[ \t]*(?:\n|\Z)|\Z)",
    re.DOTALL | re.MULTILINE,
)


def parse_srt(srt_text):
    """Split SRT subtitle text into a list of cue dictionaries.

    Args:
        srt_text: Contents of an .srt file. ``None`` or an empty string is
            accepted and yields an empty list. LF, CRLF and CR line endings
            are all understood, and a leading byte-order mark is ignored.

    Returns:
        A list of dicts, one per cue in document order, with the keys
        ``"index"`` (the cue number as a string), ``"timestamp"`` (the timing
        line, unchanged) and ``"text"`` (the cue text with its line breaks
        replaced by single spaces and surrounding whitespace removed).
        Input that contains no recognisable cue yields an empty list.
    """
    if not srt_text:
        return []

    # Normalise line endings first: the cue pattern only reasons about "\n",
    # and files saved on Windows use "\r\n".
    normalized = srt_text.replace("\r\n", "\n").replace("\r", "\n")
    # str.strip() does not remove a BOM, which would otherwise hide the
    # first cue's index line from the "^" anchor.
    normalized = normalized.strip().lstrip("\ufeff").strip()

    return [
        {
            "index": match.group(1),
            "timestamp": match.group(2),
            "text": match.group(3).replace("\n", " ").strip(),
        }
        for match in _SRT_CUE.finditer(normalized)
    ]
# -------------------------
# Shorten Hindi
# -------------------------
def shorten_text(text, max_len):
    """Trim ``text`` to at most ``max_len`` characters on a word boundary.

    Text that already fits is returned untouched. Otherwise the text is split
    on whitespace and the longest leading run of words whose single-space
    join fits within ``max_len`` is returned. Words are never cut in the
    middle, and the first word is always kept, so the result can still exceed
    ``max_len`` when that word alone is too long.

    Args:
        text: The string to shorten.
        max_len: Maximum allowed length, counted with ``len()``.

    Returns:
        The original string if it fits or contains no words; otherwise the
        shortened, whitespace-normalised string.
    """
    if len(text) <= max_len:
        return text

    words = text.split()
    if not words:
        # Whitespace-only input: there is no word boundary to cut on.
        return text

    # Grow the result from the front while tracking its joined length, which
    # avoids re-joining the whole word list after every removed word.
    kept = [words[0]]
    joined_len = len(words[0])
    for word in words[1:]:
        joined_len += 1 + len(word)  # +1 for the separating space
        if joined_len > max_len:
            break
        kept.append(word)
    return " ".join(kept)
# -------------------------
# Batch Translate
# -------------------------
def batch_translate(texts, batch_size=16):
    """Translate a sequence of strings with the module-level Marian model.

    Args:
        texts: Iterable of source strings. Empty or whitespace-only entries
            are not sent to the model; their translation is ``""``.
        batch_size: Maximum number of strings tokenized and generated
            together. Splitting the work keeps the padded batch bounded
            instead of growing with the number of subtitles in the file.

    Returns:
        A list of translated strings with the same length and order as
        ``texts``. An empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(texts)
    results = [""] * len(texts)

    # Remember the original position of every entry that needs translating
    # so the outputs can be written back in order.
    pending = [
        (position, text)
        for position, text in enumerate(texts)
        if text and text.strip()
    ]

    for start in range(0, len(pending), batch_size):
        chunk = pending[start:start + batch_size]
        inputs = tokenizer(
            [text for _, text in chunk],
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        translated = model.generate(
            **inputs,
            max_new_tokens=64,
        )
        decoded = tokenizer.batch_decode(
            translated,
            skip_special_tokens=True,
        )
        for (position, _), output in zip(chunk, decoded):
            results[position] = output

    return results
# -------------------------
# Main Function
# -------------------------
def translate_srt(srt_text):
    """Translate the cue text of an SRT document, keeping indices and timings.

    The input is parsed with ``parse_srt``, all cue texts are translated in
    one ``batch_translate`` call, and each translation is limited with
    ``shorten_text`` to 130% of the character length of its source text.

    Args:
        srt_text: The SRT document as a single string.

    Returns:
        The translated SRT document: one "index / timestamp / text" block per
        cue, with blocks separated by a blank line. Returns ``""`` when the
        input is empty or contains no parseable cue.
    """
    if not srt_text or not srt_text.strip():
        return ""

    subtitles = parse_srt(srt_text)
    if not subtitles:
        # Nothing to translate; do not hand the translator an empty batch.
        return ""

    # Single call so the translator can batch the cues together.
    hindi_texts = batch_translate([sub["text"] for sub in subtitles])

    blocks = []
    for sub, hindi in zip(subtitles, hindi_texts):
        # 130% rule: the translation may be at most 1.3x the source length.
        max_hindi_len = int(len(sub["text"]) * 1.3)
        hindi = shorten_text(hindi.strip(), max_hindi_len)
        blocks.append(
            f'{sub["index"]}\n'
            f'{sub["timestamp"]}\n'
            f'{hindi}\n'
        )

    return "\n".join(blocks)
# -------------------------
# UI
# -------------------------
# Gradio interface: one multi-line text box in, one multi-line text box out,
# wired to translate_srt().
demo = gr.Interface(
fn=translate_srt,
# Input: the SRT document is pasted as plain text.
inputs=gr.Textbox(
lines=20,
label="English SRT"
),
# Output: the string returned by translate_srt().
outputs=gr.Textbox(
lines=20,
label="Hindi SRT"
),
title="Fast English → Hindi SRT Translator",
description="Ultra-fast subtitle translation with timestamp preservation and subtitle length control."
)
# Runs unconditionally at module level, so importing this file also starts
# the app.
demo.launch()