benkamin commited on
Commit
8f0ff8b
·
verified ·
1 Parent(s): 182ab39

Upload 5 files

Browse files
Files changed (5) hide show
  1. LICENSE +21 -0
  2. app.py +247 -0
  3. prompts.py +50 -0
  4. requirements.txt +5 -0
  5. srt_utils.py +55 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# --- App setup: env loading, deterministic language detection, UI defaults ---

import os, re, time, tempfile
import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI
from langdetect import detect, DetectorFactory

from srt_utils import (
    parse_srt, blocks_to_srt, split_batches,
    validate_srt_batch, last_end_time_ms
)
from prompts import build_prompt, RTL_LANGS

load_dotenv()
DetectorFactory.seed = 42  # make langdetect deterministic-ish

# Default glossary shown in the UI: "english term - Hebrew rendering", one per line.
DEFAULT_GLOSSARY = """agency - יכולת פעולה עצמאית
attachment - היקשרות
awakening - התעוררות
alaya - אלאיה
ayatana - אייטנה (בסיס החושים)"""

# UI language label -> ISO 639-1 code (as produced by langdetect).
# "Auto-detect" is a sentinel resolved via autodetect_source_lang().
LANG_NAME_TO_CODE = {
    "English": "en",
    "Hebrew": "he",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Arabic": "ar",
    "Auto-detect": "auto",
}
32
+
def simple_token_estimate(text: str) -> int:
    """Rough token estimate for cost projection.

    Only subtitle text lines are counted — the index and timecode lines
    of each block are skipped — and the whitespace-separated word count
    is scaled by ~1.33 tokens per word.
    """
    subtitle_lines = [line for block in parse_srt(text) for line in block[2:]]
    word_count = sum(len(re.findall(r"\S+", line)) for line in subtitle_lines)
    return int(word_count * 1.33)
45
+
def estimate_cost(total_in_tokens: int,
                  total_out_tokens: int,
                  price_in_per_million: float,
                  price_out_per_million: float) -> float:
    """Return the estimated dollar cost, rounded to 4 decimal places.

    Each token count is billed at its per-million-token rate.
    """
    priced = [
        (total_in_tokens, price_in_per_million),
        (total_out_tokens, price_out_per_million),
    ]
    total = sum((tokens / 1_000_000.0) * rate for tokens, rate in priced)
    return round(total, 4)
53
+
def autodetect_source_lang(srt_text: str) -> str:
    """Guess the source language and return its UI label.

    Samples text lines from up to the first 50 blocks (capped at ~1000
    characters) and runs langdetect on them. Falls back to "English"
    when the sample is empty, detection fails, or the detected code has
    no matching UI label.
    """
    fallback = "English"
    sample_lines = []
    for blk in parse_srt(srt_text)[:50]:
        sample_lines.extend(blk[2:])
    sample = " ".join(sample_lines)[:1000].strip()
    if not sample:
        return fallback
    try:
        detected = detect(sample)
    except Exception:
        return fallback
    # Reverse-map the ISO code back to the UI dropdown label.
    for label, code in LANG_NAME_TO_CODE.items():
        if code == detected:
            return label
    return fallback
71
+
def call_gpt(client: OpenAI, model: str, prompt: str) -> str:
    """Send a single-turn prompt and return the SRT payload.

    Extracts the text between <<<SRT>>> and <<<END>>> markers; if the
    markers are absent, returns the whole (stripped) reply.

    Raises: whatever the OpenAI SDK raises on API failure (handled by
    the caller's retry logic).
    """
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        top_p=1.0,
        extra_body={"verbosity": "low"}  # GPT-5 knob if supported
    )
    # BUG FIX: message.content may be None (e.g. refusal / empty reply);
    # re.search(None) would raise TypeError. Treat None as empty string.
    text = resp.choices[0].message.content or ""
    m = re.search(r'<<<SRT>>>\s*(.*?)\s*<<<END>>>', text, re.DOTALL)
    return (m.group(1).strip() if m else text.strip())
83
+
def prepare_download_file(content: str, suffix: str) -> str:
    """Write `content` to a new temp file and return its path.

    The file is kept (delete=False) so Gradio can serve it for download;
    cleanup is left to the OS temp-dir policy.

    BUG FIX: the original created a NamedTemporaryFile, left its handle
    open, and opened the same path a second time — leaking a descriptor
    and failing on Windows, where an open temp file cannot be reopened.
    Open the file exactly once, in text mode with UTF-8.
    """
    with tempfile.NamedTemporaryFile("w", delete=False, suffix=suffix,
                                     encoding="utf-8") as f:
        f.write(content)
        return f.name
89
+
def compute_estimates(file_bytes, approx_blocks, use_prev_ctx,
                      price_in_per_million, price_out_per_million):
    """Estimate token usage and dollar cost for a translation run.

    Args mirror the UI widgets: raw uploaded bytes, blocks per batch,
    whether previous-batch context is sent, and the two $/M-token rates.

    Returns (status_message, dl_srt_update, dl_log_update) — the two
    gr.update values toggle the download slots' visibility.
    """
    if file_bytes is None:
        return "Upload an SRT to estimate.", gr.update(visible=False), gr.update(visible=False)
    raw = file_bytes.decode("utf-8", errors="replace")
    # Basic token estimate from text-only content.
    base_tokens = simple_token_estimate(raw)

    # BUG FIX: Gradio number widgets can deliver floats; a float in
    # `(len + approx_blocks - 1) // approx_blocks` yields a float
    # batch_count. Force integer batch math and guard against 0.
    per_batch = max(1, int(approx_blocks))
    blocks = parse_srt(raw)
    batch_count = max(1, (len(blocks) + per_batch - 1) // per_batch)

    # Overheads (rough):
    # - Style/glossary prefix per batch ~ 300 tokens (adjustable).
    prefix_tokens_per_batch = 300
    # - Previous-context overhead: ~50% of one batch's tokens for every
    #   batch after the first (conservative guess).
    context_overhead = 0
    if use_prev_ctx and batch_count > 1:
        context_overhead = int((base_tokens / batch_count) * 0.5) * (batch_count - 1)

    in_tokens = base_tokens + batch_count * prefix_tokens_per_batch + context_overhead
    out_tokens = base_tokens  # translation length ~ same scale

    total_cost = estimate_cost(in_tokens, out_tokens, price_in_per_million, price_out_per_million)
    msg = (
        f"Estimated tokens — input: ~{in_tokens:,}, output: ~{out_tokens:,}\n"
        f"Estimated total cost: ~${total_cost:.4f} (rates: in ${price_in_per_million}/M, out ${price_out_per_million}/M)\n"
        f"Assumptions: words→tokens≈1.33, per-batch prefix≈{prefix_tokens_per_batch}, "
        f"{'with' if use_prev_ctx else 'no'} previous-batch context."
    )
    return msg, gr.update(visible=True), gr.update(visible=True)
120
+
def pipeline(file_bytes, user_api_key, source_lang, target_lang, glossary, extra,
             model, approx_blocks, use_prev_ctx):
    """Translate an uploaded SRT batch-by-batch, streaming progress.

    This function is a generator (it yields partial results for live
    progress in the UI).

    BUG FIX: Python discards a generator's `return value` during normal
    iteration, and Gradio iterates generator handlers — so the original's
    `return "", "Please paste your OpenAI API key..."` error paths and
    the final `return final_srt, ...` (with the download file paths)
    never reached the UI. Every exit path now *yields* its outputs and
    then returns bare.

    Yields: (srt_preview, log_text, srt_file_path, log_file_path, direction)
    """
    direction = "rtl" if target_lang.lower()[:2] in RTL_LANGS else "ltr"

    # Resolve API key: user-supplied takes precedence; fallback to env var.
    api_key = (user_api_key or "").strip() or os.getenv("OPENAI_API_KEY", "").strip()
    if not api_key:
        yield "", "Please paste your OpenAI API key or configure OPENAI_API_KEY.", None, None, "ltr"
        return
    client = OpenAI(api_key=api_key)

    if file_bytes is None:
        yield "", "Please upload an SRT file.", None, None, "ltr"
        return
    raw = file_bytes.decode("utf-8", errors="replace")
    in_blocks = parse_srt(raw)

    # Source auto-detect
    if source_lang == "Auto-detect":
        source_lang = autodetect_source_lang(raw)

    # Input sanity: each block needs index, timecode, and at least one text line.
    for b in in_blocks:
        if len(b) < 3 or not b[0].strip().isdigit() or "-->" not in b[1]:
            yield "", "Input SRT failed basic validation (numbers/timecodes).", None, None, "ltr"
            return

    out_blocks_all, logs = [], []
    prev_source, prev_target = None, None

    for i, batch in enumerate(split_batches(in_blocks, approx_blocks), start=1):
        batch_srt_in = blocks_to_srt(batch)
        prompt = build_prompt(
            source_lang=source_lang, target_lang=target_lang,
            batch_srt=batch_srt_in, glossary_text=glossary, extra_instructions=extra,
            prev_source=prev_source if use_prev_ctx else None,
            prev_target=prev_target if use_prev_ctx else None
        )
        try:
            translated = call_gpt(client, model, prompt)
        except Exception as e:
            logs.append(f"[ERROR] API call failed in batch {i}: {e}")
            # Produce partial outputs for debugging.
            srt_path = prepare_download_file(blocks_to_srt(out_blocks_all), ".srt")
            log_path = prepare_download_file("\n".join(logs), ".log.txt")
            yield blocks_to_srt(out_blocks_all), "\n".join(logs), srt_path, log_path, direction
            return

        out_batch = parse_srt(translated)
        prev_end = last_end_time_ms(out_blocks_all)
        ok, rep = validate_srt_batch(batch, out_batch, prev_last_end=prev_end)
        logs.append(f"Batch {i}: {'OK' if ok else 'ISSUES'}")
        logs += rep

        if not ok:
            # Hard retry with stricter wording.
            prompt_strict = prompt + "\n\n(HARD MODE) Repeat EXACT numbers/timecodes/line counts. Output SRT only."
            try:
                translated2 = call_gpt(client, model, prompt_strict)
                out_batch2 = parse_srt(translated2)
                ok2, rep2 = validate_srt_batch(batch, out_batch2, prev_last_end=prev_end)
                logs.append(f"Batch {i} (retry): {'OK' if ok2 else 'ISSUES'}")
                logs += rep2
                if ok2:
                    out_batch = out_batch2
                    ok = True
            except Exception as e:
                logs.append(f"[ERROR] Retry failed in batch {i}: {e}")

        out_blocks_all.extend(out_batch)
        prev_source, prev_target = batch_srt_in, blocks_to_srt(out_batch)

        # Live progress (download files are produced only at the end).
        yield blocks_to_srt(out_blocks_all), "\n".join(logs), None, None, direction
        time.sleep(0.05)

    final_srt = blocks_to_srt(out_blocks_all)
    srt_path = prepare_download_file(final_srt, ".srt")
    log_path = prepare_download_file("\n".join(logs) if logs else "Done.", ".log.txt")
    yield final_srt, "\n".join(logs) if logs else "Done.", srt_path, log_path, direction
195
+
# ---------------------------------------------------------------------------
# Gradio UI: declare the widgets top-to-bottom, then wire the two buttons.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Open Subtitle Translator (GPT-5)") as demo:
    gr.Markdown("## Open Subtitle Translator — GPT-5\nPaste your API key, upload an SRT, pick languages, and translate with strict SRT validation.\n\n"
                "**Tip:** Public Spaces should NOT include owner API keys. Users paste their own keys here.")

    # Credentials, model choice, and batching controls.
    with gr.Row():
        key = gr.Textbox(label="OpenAI API key", type="password", placeholder="sk-...", info="Used only for this session; not stored.")
        model = gr.Dropdown(choices=["gpt-5", "gpt-5-mini"], value="gpt-5", label="Model")
        approx_blocks = gr.Slider(5, 20, value=10, step=1, label="Approx. SRT blocks per batch")
        use_prev = gr.Checkbox(value=True, label="Use previous-batch target as context")

    # Language selection ("Auto-detect" only offered for the source).
    with gr.Row():
        src = gr.Dropdown(choices=["Auto-detect", "English", "Hebrew", "Spanish", "French", "German", "Arabic"], value="English", label="Source language")
        tgt = gr.Dropdown(choices=["Hebrew", "English", "Spanish", "French", "German", "Arabic"], value="Hebrew", label="Target language")

    # Pricing knobs — used only by compute_estimates, not by translation.
    with gr.Row():
        price_in = gr.Number(value=1.25, precision=2, label="Price — input $/M tokens (configurable)")
        price_out = gr.Number(value=10.0, precision=2, label="Price — output $/M tokens (configurable)")

    glossary = gr.Textbox(label="Glossary / Policy", value=DEFAULT_GLOSSARY, lines=6)
    extra = gr.Textbox(label="Extra instructions (optional)", lines=4, placeholder="Tone, domain hints, speaker info…")

    srt_in = gr.File(label="Upload SRT", file_types=[".srt"])

    with gr.Row():
        estimate_btn = gr.Button("Estimate Cost")
        run_btn = gr.Button("Translate")

    # Output panes: streamed translation preview and the validation log.
    srt_preview = gr.Textbox(label="Translated SRT (preview)", lines=18)
    log = gr.Textbox(label="Validation / Log", lines=18)

    # Download slots; hidden until compute_estimates returns visible=True updates.
    with gr.Row():
        dl_srt = gr.File(label="Download Translated SRT", visible=False)
        dl_log = gr.File(label="Download Log", visible=False)

    # Receives the "ltr"/"rtl" direction value emitted by pipeline.
    dir_state = gr.State("ltr")

    estimate_btn.click(
        fn=compute_estimates,
        inputs=[srt_in, approx_blocks, use_prev, price_in, price_out],
        outputs=[log, dl_srt, dl_log],
        api_name="estimate"
    )

    run_btn.click(
        fn=pipeline,
        inputs=[srt_in, key, src, tgt, glossary, extra, model, approx_blocks, use_prev],
        outputs=[srt_preview, log, dl_srt, dl_log, dir_state],
        api_name="translate"
    )

if __name__ == "__main__":
    demo.launch()
prompts.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Prompt builder utilities for the Open Subtitle Translator
2
+
3
+ STYLE_PREFIX = """You are a professional subtitle translator.
4
+
5
+ HARD CONSTRAINTS:
6
+ - Output SRT only between <<<SRT>>> and <<<END>>>.
7
+ - Do NOT change block numbers or timecodes.
8
+ - Do NOT add/remove lines within a block; preserve exact line breaks.
9
+ - Keep any tags/speaker labels (e.g., <i>, ♪) exactly as-is.
10
+ - No commentary or explanations outside the SRT.
11
+ """
12
+
13
+ RTL_LANGS = {"he", "ar", "fa", "ur"}
14
+
15
+ def build_prompt(source_lang: str,
16
+ target_lang: str,
17
+ batch_srt: str,
18
+ glossary_text: str | None,
19
+ extra_instructions: str | None,
20
+ prev_source: str | None,
21
+ prev_target: str | None) -> str:
22
+ """
23
+ Compose a cache-friendly prompt. Keep the prefix byte-identical across calls
24
+ to leverage provider-side prompt caching where available.
25
+ """
26
+ prefix = STYLE_PREFIX + f"""
27
+
28
+ TASK:
29
+ - Translate from {source_lang} to {target_lang}.
30
+ - Input format is SRT blocks.
31
+
32
+ STYLE & GLOSSARY (project-provided; must follow if applicable):
33
+ """
34
+ if glossary_text and glossary_text.strip():
35
+ prefix += glossary_text.strip() + "\n"
36
+ if extra_instructions and extra_instructions.strip():
37
+ prefix += "\nEXTRA INSTRUCTIONS (apply carefully):\n" + extra_instructions.strip() + "\n"
38
+
39
+ context = "\nCONTEXT (do not modify):\n"
40
+ if prev_source:
41
+ context += "[Previous batch — Source]\n" + prev_source.strip() + "\n"
42
+ if prev_target:
43
+ context += "[Previous batch — Target]\n" + prev_target.strip() + "\n"
44
+
45
+ task = (
46
+ "\nCURRENT BATCH TO TRANSLATE:\n"
47
+ + batch_srt
48
+ + "\nReturn only:\n<<<SRT>>>\n[Translated SRT blocks]\n<<<END>>>"
49
+ )
50
+ return prefix + context + task
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.44.0
2
+ openai>=1.40.0
3
+ python-dotenv>=1.0.1
4
+
5
+ langdetect>=1.0.9
srt_utils.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import re
from typing import List

# Strict SRT timecode line: "HH:MM:SS,mmm --> HH:MM:SS,mmm" (comma before
# milliseconds, single space around the arrow, nothing else on the line).
TIME_RE = re.compile(r'^\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}$')
5
+
def parse_srt(text: str) -> List[list]:
    """Split raw SRT text into blocks, each a list of its lines.

    BUG FIX: the original split only on a single LF blank line, so SRT
    files with Windows (CRLF) or old-Mac (CR) line endings parsed as one
    giant block, and extra blank lines between cues produced empty-ish
    splits. Line endings are now normalized to LF and blocks are split
    on runs of two or more newlines.
    """
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    blocks = []
    for raw in re.split(r"\n{2,}", normalized.strip()):
        lines = raw.splitlines()
        if lines:
            blocks.append(lines)
    return blocks
13
+
def blocks_to_srt(blocks: List[list]) -> str:
    """Serialize parsed blocks back to SRT text (with trailing newline).

    Inverse of parse_srt: blocks are joined by a blank line, lines
    within a block by single newlines.
    """
    rendered = ["\n".join(block) for block in blocks]
    return "\n\n".join(rendered) + "\n"
16
+
17
+ def last_end_time_ms(blocks: List[list]) -> int | None:
18
+ if not blocks:
19
+ return None
20
+ end = blocks[-1][1].split(" --> ")[1]
21
+ hh, mm, ss_ms = end.split(":")
22
+ ss, ms = ss_ms.split(",")
23
+ return (int(hh)*3600 + int(mm)*60 + int(ss))*1000 + int(ms)
24
+
def validate_srt_batch(in_blocks: List[list], out_blocks: List[list], prev_last_end: int | None = None):
    """Validate a translated batch against its source batch.

    Checks: matching block count, preserved index and timecode lines,
    valid timecode format, unchanged per-block line counts, and — when
    prev_last_end is given — that the first output block does not start
    before the previous batch ended.

    Returns (ok: bool, report_lines: list[str]).
    """
    report, ok = [], True
    if len(in_blocks) != len(out_blocks):
        ok = False
        report.append(f"[STRUCT] Block count mismatch: in={len(in_blocks)} out={len(out_blocks)}")
    for i, (ib, ob) in enumerate(zip(in_blocks, out_blocks)):
        if len(ob) < 3:
            ok = False; report.append(f"[STRUCT] Output block too short @{i}")
            continue
        if ib[0].strip() != ob[0].strip():
            ok = False; report.append(f"[INDEX] Changed @{i}: {ib[0]} → {ob[0]}")
        if ib[1].strip() != ob[1].strip():
            ok = False; report.append(f"[TIMECODE] Changed @{i}: {ib[1]} ≠ {ob[1]}")
        if not TIME_RE.match(ob[1].strip()):
            ok = False; report.append(f"[TIMECODE] Invalid format @{i}: {ob[1]}")
        if len(ib) != len(ob):
            ok = False; report.append(f"[LINES] Line-count changed @{i}: in={len(ib)} out={len(ob)}")
    # BUG FIX: the original guard `if prev_last_end` silently skipped the
    # overlap check when the previous batch ended at exactly 0 ms (falsy
    # int). Compare against None explicitly.
    if prev_last_end is not None and out_blocks:
        def to_ms(tc: str) -> int:
            hh, mm, ss_ms = tc.split(":"); ss, ms = ss_ms.split(",")
            return (int(hh)*3600 + int(mm)*60 + int(ss))*1000 + int(ms)
        first = out_blocks[0]
        # Robustness: only attempt the overlap check when the first block
        # actually has a well-formed timecode line — a malformed one was
        # already reported above, and to_ms() would crash on it.
        if len(first) >= 2 and TIME_RE.match(first[1].strip()):
            if to_ms(first[1].split(" --> ")[0]) < prev_last_end:
                ok = False; report.append("[OVERLAP] First block overlaps previous batch end time.")
    return ok, report
52
+
def split_batches(blocks: List[list], approx_blocks: int = 10):
    """Yield consecutive chunks of at most ``approx_blocks`` blocks each.

    The final chunk may be shorter; an empty input yields nothing.
    """
    starts = range(0, len(blocks), approx_blocks)
    return (blocks[s:s + approx_blocks] for s in starts)