| import gradio as gr |
| import re |
| from transformers import MarianMTModel, MarianTokenizer |
|
|
| |
| |
| |
# Hugging Face checkpoint identifier shared by the model and tokenizer below.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-hi"

# Both objects are created once at import time and reused by every
# translation request handled by this module.
model = MarianMTModel.from_pretrained(MODEL_NAME)
tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
|
|
| |
| |
| |
def parse_srt(srt_text):
    """Parse SRT subtitle text into a list of cue dictionaries.

    A cue starts at a line holding only a number, followed by a
    ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timestamp line. Everything between
    that header and the next header (or the end of the input) is the cue
    text.

    Args:
        srt_text: Full contents of an SRT file. ``None`` or an empty string
            yields an empty list.

    Returns:
        A list of dicts in file order, each with the keys ``"index"`` (the
        cue number as a string), ``"timestamp"`` (the timestamp line,
        verbatim) and ``"text"`` (the cue text with line breaks replaced by
        spaces and surrounding whitespace stripped).
    """
    if not srt_text:
        return []

    # Normalise CRLF / CR line endings and drop a leading byte-order mark so
    # files saved on Windows parse the same way as LF-only input.
    cleaned = srt_text.replace("\r\n", "\n").replace("\r", "\n")
    cleaned = cleaned.lstrip("\ufeff")

    # A header is a number line *followed by a timestamp line*. Requiring the
    # timestamp keeps a purely numeric text line (e.g. "2020") from being
    # mistaken for the start of the next cue.
    header = re.compile(
        r"^[ \t]*(\d+)\s*\n"
        r"[ \t]*(\d{2}:\d{2}:\d{2},\d{3}\s-->\s\d{2}:\d{2}:\d{2},\d{3})"
        r"[ \t]*$",
        re.MULTILINE,
    )
    headers = list(header.finditer(cleaned))

    subtitles = []
    for position, current in enumerate(headers):
        # The cue text runs up to the next header, or to the end of input
        # for the last cue.
        if position + 1 < len(headers):
            text_end = headers[position + 1].start()
        else:
            text_end = len(cleaned)
        text = cleaned[current.end():text_end]

        subtitles.append({
            "index": current.group(1),
            "timestamp": current.group(2),
            "text": text.replace("\n", " ").strip(),
        })

    return subtitles
|
|
|
|
| |
| |
| |
def shorten_text(text, max_len):
    """Trim trailing words from *text* until it fits within *max_len* characters.

    Text that already fits is returned untouched. Otherwise whole words are
    removed from the end, one at a time, and the remaining words are re-joined
    with single spaces. At least one word is always kept, so the result may
    still exceed *max_len* when the first word alone is too long.
    """
    if len(text) <= max_len:
        return text

    tokens = text.split()
    kept = len(tokens)
    candidate = text

    # Drop one trailing word per pass; stop once it fits or one word remains.
    while kept > 1 and len(candidate) > max_len:
        kept -= 1
        candidate = " ".join(tokens[:kept])

    return candidate
|
|
|
|
| |
| |
| |
def batch_translate(texts, batch_size=16):
    """Translate a sequence of strings with the module-level model.

    The inputs are processed in chunks of ``batch_size`` so that a long
    subtitle file is never tokenized and generated as one huge padded batch.

    Args:
        texts: Sequence of source strings to translate.
        batch_size: Maximum number of strings sent to the model per call.
            Must be at least 1.

    Returns:
        A list of decoded translations, one per input string and in the same
        order. An empty input yields an empty list without touching the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(texts)
    if not texts:
        # Nothing to translate; skip tokenization and generation entirely.
        return []

    outputs = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]

        # Pad to the longest item in this chunk and truncate over-long inputs.
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

        # Each translation is capped at 64 newly generated tokens.
        translated = model.generate(
            **inputs,
            max_new_tokens=64,
        )

        outputs.extend(
            tokenizer.batch_decode(
                translated,
                skip_special_tokens=True,
            )
        )

    return outputs
|
|
|
|
| |
| |
| |
def translate_srt(srt_text):
    """Translate the text of every cue in an SRT document.

    The input is parsed with ``parse_srt``, all cue texts are translated with
    ``batch_translate``, and each translation is trimmed with ``shorten_text``
    to at most 1.3 times the character length of its source text. Cue numbers
    and timestamp lines are copied through unchanged.

    Args:
        srt_text: Contents of the source SRT file.

    Returns:
        The translated SRT document as a string, with cues separated by a
        blank line. Returns an empty string when the input is empty, blank,
        or contains no parseable cue.
    """
    # Blank input: nothing to parse, and nothing worth sending to the model.
    if not srt_text or not srt_text.strip():
        return ""

    subtitles = parse_srt(srt_text)
    if not subtitles:
        # No recognisable cue; avoid calling the model with an empty batch.
        return ""

    hindi_texts = batch_translate([sub["text"] for sub in subtitles])

    # Allowed growth of a translated line relative to its source, in characters.
    length_ratio = 1.3

    output = []
    for sub, hindi in zip(subtitles, hindi_texts):
        max_hindi_len = int(len(sub["text"]) * length_ratio)
        hindi = shorten_text(hindi.strip(), max_hindi_len)

        output.append(
            f'{sub["index"]}\n'
            f'{sub["timestamp"]}\n'
            f'{hindi}\n'
        )

    # Every block already ends with a newline, so joining with "\n" leaves
    # one blank line between consecutive cues.
    return "\n".join(output)
|
|
|
|
| |
| |
| |
# Gradio UI: one multi-line text box for the source SRT and one for the
# translated result, wired to translate_srt.
demo = gr.Interface(
    fn=translate_srt,
    inputs=gr.Textbox(
        lines=20,
        label="English SRT",
    ),
    outputs=gr.Textbox(
        lines=20,
        label="Hindi SRT",
    ),
    title="Fast English → Hindi SRT Translator",
    description="Ultra-fast subtitle translation with timestamp preservation and subtitle length control.",
)

# Start the server only when run as a script, so importing this module
# (e.g. to reuse translate_srt) does not launch a blocking web server.
if __name__ == "__main__":
    demo.launch()