xTHExBEASTx committed on
Commit
87aabc7
·
verified ·
1 Parent(s): 0da9887

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -0
app.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import torch
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 1. Setup Model and Device
# Prefer the GPU when one is visible; the distilled 600M NLLB model also runs on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "facebook/nllb-200-distilled-600M"

print(f"Loading {model_name} on {device}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Language Mapping (Expandable): UI label -> FLORES-200 code expected by NLLB.
LANG_MAP = {
    "English": "eng_Latn",
    "French": "fra_Latn",
    "Spanish": "spa_Latn",
    "German": "deu_Latn",
    "Chinese (Simplified)": "zho_Hans",
    "Japanese": "jpn_Jpan",
    # BUG FIX: "ary_Arab" is Moroccan Arabic in FLORES-200; the generic
    # "Arabic" label should map to Modern Standard Arabic, "arb_Arab".
    "Arabic": "arb_Arab",
    "Russian": "rus_Cyrl",
}
25
+
26
def translate_text(text, target_lang_name):
    """Translate *text* into the language labelled *target_lang_name*.

    Blank/whitespace-only input is returned unchanged; unknown language
    labels fall back to English ("eng_Latn").
    """
    if not text.strip():
        return text
    target_code = LANG_MAP.get(target_lang_name, "eng_Latn")

    inputs = tokenizer(text, return_tensors="pt").to(device)
    # BUG FIX: `tokenizer.lang_code_to_id` was removed from the NLLB
    # tokenizers in transformers >= 4.34 (the language codes are ordinary
    # special tokens now), so look the id up via the vocabulary instead.
    target_token_id = tokenizer.convert_tokens_to_ids(target_code)
    with torch.no_grad():  # pure inference — skip autograd bookkeeping
        translated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=target_token_id,
            max_length=256,
        )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
37
+
38
def process_srt(file, target_lang):
    """Translate the dialogue lines of an uploaded SRT file.

    Cue indices, timestamps, and the blank-line separators between cues are
    preserved byte-for-byte; only the text lines are translated. Returns the
    path of the written file, or None when no file was supplied.
    """
    if file is None:
        return None

    # Gradio may hand over either a plain path string (newer versions) or a
    # tempfile-like object exposing `.name` — accept both.
    path = file if isinstance(file, str) else file.name
    try:
        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()
    except UnicodeDecodeError:
        # Narrowed from a bare `except:`: only a decode failure should
        # trigger the latin-1 fallback, not e.g. FileNotFoundError.
        with open(path, 'r', encoding='latin-1') as f:
            content = f.read()

    # Split by SRT blocks; the capturing group makes re.split KEEP the
    # blank-line separators as list elements, preserving original spacing.
    blocks = re.split(r'(\n\s*\n)', content)
    translated_blocks = []

    for block in blocks:
        # Separators and chunks with no letters (bare indices/timestamps)
        # pass through untouched.
        if not block.strip() or not any(c.isalpha() for c in block):
            translated_blocks.append(block)
            continue

        lines = block.strip().splitlines()
        if len(lines) >= 3:
            index, timestamp = lines[0], lines[1]
            text_to_translate = " ".join(lines[2:])
            translated_text = translate_text(text_to_translate, target_lang)
            translated_blocks.append(f"{index}\n{timestamp}\n{translated_text}")
        else:
            translated_blocks.append(block)

    output_path = "translated_subtitles.srt"
    with open(output_path, "w", encoding="utf-8") as f:
        # BUG FIX: the separators are already elements of translated_blocks
        # (captured by re.split above), so joining with "\n\n" doubled every
        # blank line in the output. Plain concatenation restores the
        # original spacing exactly.
        f.write("".join(translated_blocks))
    return output_path
70
+
71
# 3. Gradio Interface
# Declarative wiring: one SRT file in, one translated SRT file out, plus a
# dropdown selecting the target language from the supported set.
srt_input = gr.File(label="Upload SRT File")
language_picker = gr.Dropdown(
    choices=list(LANG_MAP.keys()),
    value="English",
    label="Target Language",
)

demo = gr.Interface(
    fn=process_srt,
    inputs=[srt_input, language_picker],
    outputs=gr.File(label="Download Translated SRT"),
    title="SRT Subtitle Translator",
    description="Translates SRT files using NLLB-200. Optimized for Hugging Face Spaces.",
    show_api=False  # Prevents the API-doc crash on newer Gradio versions
)

if __name__ == "__main__":
    demo.launch()
86
+