Spaces:
Running
Running
Upload 5 files
Browse files- LICENSE +21 -0
- app.py +247 -0
- prompts.py +50 -0
- requirements.txt +5 -0
- srt_utils.py +55 -0
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
app.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os, re, time, tempfile
|
| 3 |
+
import gradio as gr
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
from openai import OpenAI
|
| 6 |
+
from langdetect import detect, DetectorFactory
|
| 7 |
+
|
| 8 |
+
from srt_utils import (
|
| 9 |
+
parse_srt, blocks_to_srt, split_batches,
|
| 10 |
+
validate_srt_batch, last_end_time_ms
|
| 11 |
+
)
|
| 12 |
+
from prompts import build_prompt, RTL_LANGS
|
| 13 |
+
|
| 14 |
+
load_dotenv()  # pull OPENAI_API_KEY (and friends) from a local .env when present
DetectorFactory.seed = 42  # make langdetect deterministic-ish

# Starter glossary shown in the UI ("english term - Hebrew rendering" per line);
# the user can edit it freely before translating.
DEFAULT_GLOSSARY = """agency - יכולת פעולה עצמאית
attachment - היקשרות
awakening - התעוררות
alaya - אלאיה
ayatana - אייטנה (בסיס החושים)"""

# UI language labels → ISO 639-1 codes; "auto" means detect from the file.
LANG_NAME_TO_CODE = {
    "English": "en",
    "Hebrew": "he",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Arabic": "ar",
    "Auto-detect": "auto",
}
|
| 32 |
+
|
| 33 |
+
def simple_token_estimate(text: str) -> int:
    """Rough token count for the translatable text of an SRT document.

    Drops each block's index and timecode lines (the first two lines) and
    approximates tokens as whitespace-separated words * 1.33.
    """
    subtitle_lines = [line for block in parse_srt(text) for line in block[2:]]
    word_count = len(re.findall(r"\S+", " ".join(subtitle_lines)))
    return int(word_count * 1.33)
|
| 45 |
+
|
| 46 |
+
def estimate_cost(total_in_tokens: int,
                  total_out_tokens: int,
                  price_in_per_million: float,
                  price_out_per_million: float) -> float:
    """Dollar cost for a call, given token counts and $/1M-token rates.

    Returns the summed input+output cost rounded to 4 decimal places.
    """
    def _side(tokens: int, rate_per_million: float) -> float:
        return (tokens / 1_000_000.0) * rate_per_million

    return round(
        _side(total_in_tokens, price_in_per_million)
        + _side(total_out_tokens, price_out_per_million),
        4,
    )
|
| 53 |
+
|
| 54 |
+
def autodetect_source_lang(srt_text: str) -> str:
    """Guess the source-language UI label from the subtitle text.

    Samples up to the first 50 blocks (~1000 chars of text-only content) and
    runs langdetect over it. Falls back to "English" when there is no text,
    detection raises, or the detected code has no matching UI label.
    """
    text_lines = []
    for block in parse_srt(srt_text)[:50]:
        text_lines.extend(block[2:])
    sample = " ".join(text_lines)[:1000].strip()
    if not sample:
        return "English"  # default fallback
    try:
        detected_code = detect(sample)
    except Exception:
        return "English"
    # Map the ISO code back to a UI label; unknown codes fall back too.
    matches = [name for name, code in LANG_NAME_TO_CODE.items() if code == detected_code]
    return matches[0] if matches else "English"
|
| 71 |
+
|
| 72 |
+
def call_gpt(client: OpenAI, model: str, prompt: str) -> str:
    """Send one user message to the chat-completions API and return the SRT payload.

    The prompt instructs the model to wrap its answer in <<<SRT>>> ... <<<END>>>;
    when those sentinels are present only the payload between them is returned,
    otherwise the whole stripped response is returned.

    Raises whatever the OpenAI client raises on transport/API errors.
    """
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        top_p=1.0,
        extra_body={"verbosity": "low"}  # GPT-5 knob if supported
    )
    # message.content can legitimately be None (refusals, tool-only replies);
    # treat that as an empty translation instead of crashing in re.search.
    text = resp.choices[0].message.content or ""
    m = re.search(r'<<<SRT>>>\s*(.*?)\s*<<<END>>>', text, re.DOTALL)
    return (m.group(1).strip() if m else text.strip())
|
| 83 |
+
|
| 84 |
+
def prepare_download_file(content: str, suffix: str) -> str:
    """Write *content* to a fresh temp file and return its path.

    delete=False keeps the file on disk so Gradio can serve it after this
    function returns. Writing through the NamedTemporaryFile handle itself
    (instead of re-opening the path while the handle is still open) avoids
    the Windows restriction on double-opening a temp file and closes the
    handle deterministically.
    """
    with tempfile.NamedTemporaryFile(
        mode="w", encoding="utf-8", suffix=suffix, delete=False
    ) as tmp:
        tmp.write(content)
        return tmp.name
|
| 89 |
+
|
| 90 |
+
def compute_estimates(file_bytes, approx_blocks, use_prev_ctx,
                      price_in_per_million, price_out_per_million):
    """Estimate token usage and dollar cost for translating the uploaded SRT.

    Returns a tuple for the Gradio outputs: (message_for_log_box,
    dl_srt_visibility_update, dl_log_visibility_update).
    """
    if file_bytes is None:
        return "Upload an SRT to estimate.", gr.update(visible=False), gr.update(visible=False)
    # gr.File hands over raw bytes with type="binary" but a filepath string by
    # default — accept both so the UI wiring can't silently break this handler.
    if isinstance(file_bytes, (bytes, bytearray)):
        raw = bytes(file_bytes).decode("utf-8", errors="replace")
    else:
        with open(file_bytes, "rb") as f:
            raw = f.read().decode("utf-8", errors="replace")

    # Basic token estimate over the text-only content.
    base_tokens = simple_token_estimate(raw)
    # Rough batch count; slider values can arrive as floats, so coerce.
    blocks = parse_srt(raw)
    approx_blocks = max(1, int(approx_blocks))
    batch_count = max(1, (len(blocks) + approx_blocks - 1) // approx_blocks)

    # Overheads (rough):
    # - Style/glossary prefix per batch ~ 300 tokens (adjustable)
    prefix_tokens_per_batch = 300
    # - Previous-context overhead per batch ~ 50% of one batch's tokens, for
    #   every batch after the first.
    context_overhead = 0
    if use_prev_ctx and batch_count > 1:
        context_overhead = int((base_tokens / batch_count) * 0.5) * (batch_count - 1)

    in_tokens = base_tokens + batch_count * prefix_tokens_per_batch + context_overhead
    out_tokens = base_tokens  # translation length ~ same scale

    total_cost = estimate_cost(in_tokens, out_tokens, price_in_per_million, price_out_per_million)
    msg = (
        f"Estimated tokens — input: ~{in_tokens:,}, output: ~{out_tokens:,}\n"
        f"Estimated total cost: ~${total_cost:.4f} (rates: in ${price_in_per_million}/M, out ${price_out_per_million}/M)\n"
        f"Assumptions: words→tokens≈1.33, per-batch prefix≈{prefix_tokens_per_batch}, "
        f"{'with' if use_prev_ctx else 'no'} previous-batch context."
    )
    return msg, gr.update(visible=True), gr.update(visible=True)
|
| 120 |
+
|
| 121 |
+
def pipeline(file_bytes, user_api_key, source_lang, target_lang, glossary, extra, model, approx_blocks, use_prev_ctx):
    """Translate an uploaded SRT batch-by-batch, streaming progress to the UI.

    This is a generator: Gradio renders each `yield`. A `return <value>`
    inside a generator only ends up in StopIteration and is never shown to
    the user, so every terminal state (errors included) is yielded first and
    followed by a bare `return`.

    Yields tuples of (srt_preview, log_text, srt_file_path, log_file_path,
    text_direction) matching the five bound output components.
    """
    def _direction() -> str:
        # RTL targets (e.g. Hebrew/Arabic) get "rtl" so the preview renders right-to-left.
        return "rtl" if target_lang.lower()[:2] in RTL_LANGS else "ltr"

    # Resolve API key: user-supplied takes precedence; fallback to env var.
    api_key = (user_api_key or "").strip() or os.getenv("OPENAI_API_KEY", "").strip()
    if not api_key:
        yield "", "Please paste your OpenAI API key or configure OPENAI_API_KEY.", None, None, "ltr"
        return
    client = OpenAI(api_key=api_key)

    if file_bytes is None:
        yield "", "Please upload an SRT file.", None, None, "ltr"
        return
    # gr.File hands over raw bytes with type="binary" but a filepath string by
    # default — accept both so the UI wiring can't silently break this handler.
    if isinstance(file_bytes, (bytes, bytearray)):
        raw = bytes(file_bytes).decode("utf-8", errors="replace")
    else:
        with open(file_bytes, "rb") as f:
            raw = f.read().decode("utf-8", errors="replace")
    in_blocks = parse_srt(raw)

    # Source auto-detect.
    if source_lang == "Auto-detect":
        source_lang = autodetect_source_lang(raw)

    # Input sanity: every block needs an integer index, a timecode, and text.
    for b in in_blocks:
        if len(b) < 3 or not b[0].strip().isdigit() or "-->" not in b[1]:
            yield "", "Input SRT failed basic validation (numbers/timecodes).", None, None, "ltr"
            return

    out_blocks_all, logs = [], []
    prev_source, prev_target = None, None

    for i, batch in enumerate(split_batches(in_blocks, approx_blocks), start=1):
        batch_srt_in = blocks_to_srt(batch)
        prompt = build_prompt(
            source_lang=source_lang, target_lang=target_lang,
            batch_srt=batch_srt_in, glossary_text=glossary, extra_instructions=extra,
            prev_source=prev_source if use_prev_ctx else None,
            prev_target=prev_target if use_prev_ctx else None
        )
        try:
            translated = call_gpt(client, model, prompt)
        except Exception as e:
            logs.append(f"[ERROR] API call failed in batch {i}: {e}")
            # Produce partial outputs for debugging, then stop.
            srt_path = prepare_download_file(blocks_to_srt(out_blocks_all), ".srt")
            log_path = prepare_download_file("\n".join(logs), ".log.txt")
            yield blocks_to_srt(out_blocks_all), "\n".join(logs), srt_path, log_path, _direction()
            return

        out_batch = parse_srt(translated)
        prev_end = last_end_time_ms(out_blocks_all)
        ok, rep = validate_srt_batch(batch, out_batch, prev_last_end=prev_end)
        logs.append(f"Batch {i}: {'OK' if ok else 'ISSUES'}")
        logs += rep

        if not ok:
            # Hard retry with stricter wording.
            prompt_strict = prompt + "\n\n(HARD MODE) Repeat EXACT numbers/timecodes/line counts. Output SRT only."
            try:
                translated2 = call_gpt(client, model, prompt_strict)
                out_batch2 = parse_srt(translated2)
                ok2, rep2 = validate_srt_batch(batch, out_batch2, prev_last_end=prev_end)
                logs.append(f"Batch {i} (retry): {'OK' if ok2 else 'ISSUES'}")
                logs += rep2
                if ok2:
                    out_batch = out_batch2
                    ok = True
            except Exception as e:
                logs.append(f"[ERROR] Retry failed in batch {i}: {e}")

        # Keep the (possibly imperfect) batch so the user gets a full file.
        out_blocks_all.extend(out_batch)
        prev_source, prev_target = batch_srt_in, blocks_to_srt(out_batch)

        # Live progress.
        yield blocks_to_srt(out_blocks_all), "\n".join(logs), None, None, _direction()
        time.sleep(0.05)

    final_srt = blocks_to_srt(out_blocks_all)
    srt_path = prepare_download_file(final_srt, ".srt")
    log_path = prepare_download_file("\n".join(logs) if logs else "Done.", ".log.txt")
    yield final_srt, "\n".join(logs) if logs else "Done.", srt_path, log_path, _direction()
|
| 195 |
+
|
| 196 |
+
with gr.Blocks(title="Open Subtitle Translator (GPT-5)") as demo:
    gr.Markdown("## Open Subtitle Translator — GPT-5\nPaste your API key, upload an SRT, pick languages, and translate with strict SRT validation.\n\n"
                "**Tip:** Public Spaces should NOT include owner API keys. Users paste their own keys here.")

    with gr.Row():
        key = gr.Textbox(label="OpenAI API key", type="password", placeholder="sk-...", info="Used only for this session; not stored.")
        model = gr.Dropdown(choices=["gpt-5", "gpt-5-mini"], value="gpt-5", label="Model")
        approx_blocks = gr.Slider(5, 20, value=10, step=1, label="Approx. SRT blocks per batch")
        use_prev = gr.Checkbox(value=True, label="Use previous-batch target as context")

    with gr.Row():
        src = gr.Dropdown(choices=["Auto-detect", "English", "Hebrew", "Spanish", "French", "German", "Arabic"], value="English", label="Source language")
        tgt = gr.Dropdown(choices=["Hebrew", "English", "Spanish", "French", "German", "Arabic"], value="Hebrew", label="Target language")

    with gr.Row():
        price_in = gr.Number(value=1.25, precision=2, label="Price — input $/M tokens (configurable)")
        price_out = gr.Number(value=10.0, precision=2, label="Price — output $/M tokens (configurable)")

    glossary = gr.Textbox(label="Glossary / Policy", value=DEFAULT_GLOSSARY, lines=6)
    extra = gr.Textbox(label="Extra instructions (optional)", lines=4, placeholder="Tone, domain hints, speaker info…")

    # type="binary" so the handlers receive raw bytes; the gr.File default
    # (type="filepath") hands them a path string, which .decode() cannot handle.
    srt_in = gr.File(label="Upload SRT", file_types=[".srt"], type="binary")

    with gr.Row():
        estimate_btn = gr.Button("Estimate Cost")
        run_btn = gr.Button("Translate")

    srt_preview = gr.Textbox(label="Translated SRT (preview)", lines=18)
    log = gr.Textbox(label="Validation / Log", lines=18)

    with gr.Row():
        # Hidden until an estimate/translation produces something downloadable.
        dl_srt = gr.File(label="Download Translated SRT", visible=False)
        dl_log = gr.File(label="Download Log", visible=False)

    # Carries the text direction ("ltr"/"rtl") computed by the pipeline.
    dir_state = gr.State("ltr")

    estimate_btn.click(
        fn=compute_estimates,
        inputs=[srt_in, approx_blocks, use_prev, price_in, price_out],
        outputs=[log, dl_srt, dl_log],
        api_name="estimate"
    )

    run_btn.click(
        fn=pipeline,
        inputs=[srt_in, key, src, tgt, glossary, extra, model, approx_blocks, use_prev],
        outputs=[srt_preview, log, dl_srt, dl_log, dir_state],
        api_name="translate"
    )

if __name__ == "__main__":
    demo.launch()
|
prompts.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Prompt builder utilities for the Open Subtitle Translator
|
| 2 |
+
|
| 3 |
+
STYLE_PREFIX = """You are a professional subtitle translator.
|
| 4 |
+
|
| 5 |
+
HARD CONSTRAINTS:
|
| 6 |
+
- Output SRT only between <<<SRT>>> and <<<END>>>.
|
| 7 |
+
- Do NOT change block numbers or timecodes.
|
| 8 |
+
- Do NOT add/remove lines within a block; preserve exact line breaks.
|
| 9 |
+
- Keep any tags/speaker labels (e.g., <i>, ♪) exactly as-is.
|
| 10 |
+
- No commentary or explanations outside the SRT.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
RTL_LANGS = {"he", "ar", "fa", "ur"}
|
| 14 |
+
|
| 15 |
+
def build_prompt(source_lang: str,
|
| 16 |
+
target_lang: str,
|
| 17 |
+
batch_srt: str,
|
| 18 |
+
glossary_text: str | None,
|
| 19 |
+
extra_instructions: str | None,
|
| 20 |
+
prev_source: str | None,
|
| 21 |
+
prev_target: str | None) -> str:
|
| 22 |
+
"""
|
| 23 |
+
Compose a cache-friendly prompt. Keep the prefix byte-identical across calls
|
| 24 |
+
to leverage provider-side prompt caching where available.
|
| 25 |
+
"""
|
| 26 |
+
prefix = STYLE_PREFIX + f"""
|
| 27 |
+
|
| 28 |
+
TASK:
|
| 29 |
+
- Translate from {source_lang} to {target_lang}.
|
| 30 |
+
- Input format is SRT blocks.
|
| 31 |
+
|
| 32 |
+
STYLE & GLOSSARY (project-provided; must follow if applicable):
|
| 33 |
+
"""
|
| 34 |
+
if glossary_text and glossary_text.strip():
|
| 35 |
+
prefix += glossary_text.strip() + "\n"
|
| 36 |
+
if extra_instructions and extra_instructions.strip():
|
| 37 |
+
prefix += "\nEXTRA INSTRUCTIONS (apply carefully):\n" + extra_instructions.strip() + "\n"
|
| 38 |
+
|
| 39 |
+
context = "\nCONTEXT (do not modify):\n"
|
| 40 |
+
if prev_source:
|
| 41 |
+
context += "[Previous batch — Source]\n" + prev_source.strip() + "\n"
|
| 42 |
+
if prev_target:
|
| 43 |
+
context += "[Previous batch — Target]\n" + prev_target.strip() + "\n"
|
| 44 |
+
|
| 45 |
+
task = (
|
| 46 |
+
"\nCURRENT BATCH TO TRANSLATE:\n"
|
| 47 |
+
+ batch_srt
|
| 48 |
+
+ "\nReturn only:\n<<<SRT>>>\n[Translated SRT blocks]\n<<<END>>>"
|
| 49 |
+
)
|
| 50 |
+
return prefix + context + task
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.44.0
|
| 2 |
+
openai>=1.40.0
|
| 3 |
+
python-dotenv>=1.0.1
|
| 4 |
+
|
| 5 |
+
langdetect>=1.0.9
|
srt_utils.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
from typing import List

# Strict SRT timecode line: "HH:MM:SS,mmm --> HH:MM:SS,mmm" — comma before the
# milliseconds, exactly one space on each side of the arrow, nothing else.
TIME_RE = re.compile(r'^\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}$')
|
| 5 |
+
|
| 6 |
+
def parse_srt(text: str) -> List[list]:
    """Split raw SRT text into blocks, each a list of lines.

    A block is [index, timecode, *text_lines]; blocks are separated by blank
    lines. Line endings are normalized first so CRLF/CR files — the common
    encoding for downloaded .srt files — split correctly (the old
    split("\\n\\n") never matched "\\r\\n\\r\\n"), and a UTF-8 BOM is
    stripped so the first block's index still reads as a digit.
    """
    normalized = text.lstrip("\ufeff").replace("\r\n", "\n").replace("\r", "\n")
    blocks = []
    for raw in normalized.strip().split("\n\n"):
        lines = raw.splitlines()
        if lines:
            blocks.append(lines)
    return blocks
|
| 13 |
+
|
| 14 |
+
def blocks_to_srt(blocks: List[list]) -> str:
    """Serialize parsed blocks back into SRT text with a trailing newline.

    Returns "" for an empty block list (the old version emitted a stray
    "\\n", which downstream code then wrote into 'empty' partial files).
    """
    if not blocks:
        return ""
    return "\n\n".join("\n".join(b) for b in blocks) + "\n"
|
| 16 |
+
|
| 17 |
+
def last_end_time_ms(blocks: List[list]) -> int | None:
|
| 18 |
+
if not blocks:
|
| 19 |
+
return None
|
| 20 |
+
end = blocks[-1][1].split(" --> ")[1]
|
| 21 |
+
hh, mm, ss_ms = end.split(":")
|
| 22 |
+
ss, ms = ss_ms.split(",")
|
| 23 |
+
return (int(hh)*3600 + int(mm)*60 + int(ss))*1000 + int(ms)
|
| 24 |
+
|
| 25 |
+
def validate_srt_batch(in_blocks: List[list], out_blocks: List[list], prev_last_end: int | None = None):
    """
    Structurally compare a translated batch against its source batch.

    Checks: block count, per-block index/timecode preservation, timecode
    format, per-block line count, and (when prev_last_end is given) that the
    first output block does not start before the previous batch ended.

    Returns (ok: bool, report_lines: list[str]).
    """
    report, ok = [], True
    if len(in_blocks) != len(out_blocks):
        ok = False
        report.append(f"[STRUCT] Block count mismatch: in={len(in_blocks)} out={len(out_blocks)}")
    for i, (ib, ob) in enumerate(zip(in_blocks, out_blocks)):
        if len(ob) < 3:
            ok = False; report.append(f"[STRUCT] Output block too short @{i}")
            continue
        if ib[0].strip() != ob[0].strip():
            ok = False; report.append(f"[INDEX] Changed @{i}: {ib[0]} → {ob[0]}")
        if ib[1].strip() != ob[1].strip():
            ok = False; report.append(f"[TIMECODE] Changed @{i}: {ib[1]} ≠ {ob[1]}")
        if not TIME_RE.match(ob[1].strip()):
            ok = False; report.append(f"[TIMECODE] Invalid format @{i}: {ob[1]}")
        if len(ib) != len(ob):
            ok = False; report.append(f"[LINES] Line-count changed @{i}: in={len(ib)} out={len(ob)}")
    # `is not None`: a previous batch that legitimately ends at 0 ms must
    # still trigger the overlap check (the old truthiness test skipped it).
    if prev_last_end is not None and out_blocks:
        def to_ms(tc: str) -> int:
            hh, mm, ss_ms = tc.split(":"); ss, ms = ss_ms.split(",")
            return (int(hh)*3600 + int(mm)*60 + int(ss))*1000 + int(ms)
        if to_ms(out_blocks[0][1].split(" --> ")[0]) < prev_last_end:
            ok = False; report.append("[OVERLAP] First block overlaps previous batch end time.")
    return ok, report
|
| 52 |
+
|
| 53 |
+
def split_batches(blocks: List[list], approx_blocks: int = 10):
|
| 54 |
+
for i in range(0, len(blocks), approx_blocks):
|
| 55 |
+
yield blocks[i:i+approx_blocks]
|