benkamin commited on
Commit
8f0ff8b
·
verified ·
1 Parent(s): 182ab39

Upload 5 files

Browse files
Files changed (5) hide show
  1. LICENSE +21 -0
  2. app.py +247 -0
  3. prompts.py +50 -0
  4. requirements.txt +5 -0
  5. srt_utils.py +55 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# --- App setup: env loading, deterministic language detection, UI defaults ---

import os, re, time, tempfile
import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI
from langdetect import detect, DetectorFactory

from srt_utils import (
    parse_srt, blocks_to_srt, split_batches,
    validate_srt_batch, last_end_time_ms
)
from prompts import build_prompt, RTL_LANGS

load_dotenv()
DetectorFactory.seed = 42  # make langdetect deterministic-ish

# Default glossary shown in the UI: "english term - Hebrew rendering", one per line.
DEFAULT_GLOSSARY = """agency - יכולת פעולה עצמאית
attachment - היקשרות
awakening - התעוררות
alaya - אלאיה
ayatana - אייטנה (בסיס החושים)"""

# UI language label -> ISO 639-1 code (as produced by langdetect).
# "Auto-detect" is a sentinel resolved via autodetect_source_lang().
LANG_NAME_TO_CODE = {
    "English": "en",
    "Hebrew": "he",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Arabic": "ar",
    "Auto-detect": "auto",
}
32
+
def simple_token_estimate(text: str) -> int:
    """Rough token estimate for cost projection.

    Only subtitle text lines are counted — the index and timecode lines
    of each block are skipped — and the whitespace-separated word count
    is scaled by ~1.33 tokens per word.
    """
    subtitle_lines = [line for block in parse_srt(text) for line in block[2:]]
    word_count = sum(len(re.findall(r"\S+", line)) for line in subtitle_lines)
    return int(word_count * 1.33)
45
+
def estimate_cost(total_in_tokens: int,
                  total_out_tokens: int,
                  price_in_per_million: float,
                  price_out_per_million: float) -> float:
    """Return the estimated dollar cost, rounded to 4 decimal places.

    Each token count is billed at its per-million-token rate.
    """
    priced = [
        (total_in_tokens, price_in_per_million),
        (total_out_tokens, price_out_per_million),
    ]
    total = sum((tokens / 1_000_000.0) * rate for tokens, rate in priced)
    return round(total, 4)
53
+
def autodetect_source_lang(srt_text: str) -> str:
    """Guess the source language and return its UI label.

    Samples text lines from up to the first 50 blocks (capped at ~1000
    characters) and runs langdetect on them. Falls back to "English"
    when the sample is empty, detection fails, or the detected code has
    no matching UI label.
    """
    fallback = "English"
    sample_lines = []
    for blk in parse_srt(srt_text)[:50]:
        sample_lines.extend(blk[2:])
    sample = " ".join(sample_lines)[:1000].strip()
    if not sample:
        return fallback
    try:
        detected = detect(sample)
    except Exception:
        return fallback
    # Reverse-map the ISO code back to the UI dropdown label.
    for label, code in LANG_NAME_TO_CODE.items():
        if code == detected:
            return label
    return fallback
71
+
def call_gpt(client: OpenAI, model: str, prompt: str) -> str:
    """Send a single-turn prompt and return the SRT payload.

    Extracts the text between <<<SRT>>> and <<<END>>> markers; if the
    markers are absent, returns the whole (stripped) reply.

    Raises: whatever the OpenAI SDK raises on API failure (handled by
    the caller's retry logic).
    """
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        top_p=1.0,
        extra_body={"verbosity": "low"}  # GPT-5 knob if supported
    )
    # BUG FIX: message.content may be None (e.g. refusal / empty reply);
    # re.search(None) would raise TypeError. Treat None as empty string.
    text = resp.choices[0].message.content or ""
    m = re.search(r'<<<SRT>>>\s*(.*?)\s*<<<END>>>', text, re.DOTALL)
    return (m.group(1).strip() if m else text.strip())
83
+
def prepare_download_file(content: str, suffix: str) -> str:
    """Write `content` to a new temp file and return its path.

    The file is kept (delete=False) so Gradio can serve it for download;
    cleanup is left to the OS temp-dir policy.

    BUG FIX: the original created a NamedTemporaryFile, left its handle
    open, and opened the same path a second time — leaking a descriptor
    and failing on Windows, where an open temp file cannot be reopened.
    Open the file exactly once, in text mode with UTF-8.
    """
    with tempfile.NamedTemporaryFile("w", delete=False, suffix=suffix,
                                     encoding="utf-8") as f:
        f.write(content)
        return f.name
89
+
def compute_estimates(file_bytes, approx_blocks, use_prev_ctx,
                      price_in_per_million, price_out_per_million):
    """Estimate token usage and dollar cost for a translation run.

    Args mirror the UI widgets: raw uploaded bytes, blocks per batch,
    whether previous-batch context is sent, and the two $/M-token rates.

    Returns (status_message, dl_srt_update, dl_log_update) — the two
    gr.update values toggle the download slots' visibility.
    """
    if file_bytes is None:
        return "Upload an SRT to estimate.", gr.update(visible=False), gr.update(visible=False)
    raw = file_bytes.decode("utf-8", errors="replace")
    # Basic token estimate from text-only content.
    base_tokens = simple_token_estimate(raw)

    # BUG FIX: Gradio number widgets can deliver floats; a float in
    # `(len + approx_blocks - 1) // approx_blocks` yields a float
    # batch_count. Force integer batch math and guard against 0.
    per_batch = max(1, int(approx_blocks))
    blocks = parse_srt(raw)
    batch_count = max(1, (len(blocks) + per_batch - 1) // per_batch)

    # Overheads (rough):
    # - Style/glossary prefix per batch ~ 300 tokens (adjustable).
    prefix_tokens_per_batch = 300
    # - Previous-context overhead: ~50% of one batch's tokens for every
    #   batch after the first (conservative guess).
    context_overhead = 0
    if use_prev_ctx and batch_count > 1:
        context_overhead = int((base_tokens / batch_count) * 0.5) * (batch_count - 1)

    in_tokens = base_tokens + batch_count * prefix_tokens_per_batch + context_overhead
    out_tokens = base_tokens  # translation length ~ same scale

    total_cost = estimate_cost(in_tokens, out_tokens, price_in_per_million, price_out_per_million)
    msg = (
        f"Estimated tokens — input: ~{in_tokens:,}, output: ~{out_tokens:,}\n"
        f"Estimated total cost: ~${total_cost:.4f} (rates: in ${price_in_per_million}/M, out ${price_out_per_million}/M)\n"
        f"Assumptions: words→tokens≈1.33, per-batch prefix≈{prefix_tokens_per_batch}, "
        f"{'with' if use_prev_ctx else 'no'} previous-batch context."
    )
    return msg, gr.update(visible=True), gr.update(visible=True)
120
+
def pipeline(file_bytes, user_api_key, source_lang, target_lang, glossary, extra,
             model, approx_blocks, use_prev_ctx):
    """Translate an uploaded SRT batch-by-batch, streaming progress.

    This function is a generator (it yields partial results for live
    progress in the UI).

    BUG FIX: Python discards a generator's `return value` during normal
    iteration, and Gradio iterates generator handlers — so the original's
    `return "", "Please paste your OpenAI API key..."` error paths and
    the final `return final_srt, ...` (with the download file paths)
    never reached the UI. Every exit path now *yields* its outputs and
    then returns bare.

    Yields: (srt_preview, log_text, srt_file_path, log_file_path, direction)
    """
    direction = "rtl" if target_lang.lower()[:2] in RTL_LANGS else "ltr"

    # Resolve API key: user-supplied takes precedence; fallback to env var.
    api_key = (user_api_key or "").strip() or os.getenv("OPENAI_API_KEY", "").strip()
    if not api_key:
        yield "", "Please paste your OpenAI API key or configure OPENAI_API_KEY.", None, None, "ltr"
        return
    client = OpenAI(api_key=api_key)

    if file_bytes is None:
        yield "", "Please upload an SRT file.", None, None, "ltr"
        return
    raw = file_bytes.decode("utf-8", errors="replace")
    in_blocks = parse_srt(raw)

    # Source auto-detect
    if source_lang == "Auto-detect":
        source_lang = autodetect_source_lang(raw)

    # Input sanity: each block needs index, timecode, and at least one text line.
    for b in in_blocks:
        if len(b) < 3 or not b[0].strip().isdigit() or "-->" not in b[1]:
            yield "", "Input SRT failed basic validation (numbers/timecodes).", None, None, "ltr"
            return

    out_blocks_all, logs = [], []
    prev_source, prev_target = None, None

    for i, batch in enumerate(split_batches(in_blocks, approx_blocks), start=1):
        batch_srt_in = blocks_to_srt(batch)
        prompt = build_prompt(
            source_lang=source_lang, target_lang=target_lang,
            batch_srt=batch_srt_in, glossary_text=glossary, extra_instructions=extra,
            prev_source=prev_source if use_prev_ctx else None,
            prev_target=prev_target if use_prev_ctx else None
        )
        try:
            translated = call_gpt(client, model, prompt)
        except Exception as e:
            logs.append(f"[ERROR] API call failed in batch {i}: {e}")
            # Produce partial outputs for debugging.
            srt_path = prepare_download_file(blocks_to_srt(out_blocks_all), ".srt")
            log_path = prepare_download_file("\n".join(logs), ".log.txt")
            yield blocks_to_srt(out_blocks_all), "\n".join(logs), srt_path, log_path, direction
            return

        out_batch = parse_srt(translated)
        prev_end = last_end_time_ms(out_blocks_all)
        ok, rep = validate_srt_batch(batch, out_batch, prev_last_end=prev_end)
        logs.append(f"Batch {i}: {'OK' if ok else 'ISSUES'}")
        logs += rep

        if not ok:
            # Hard retry with stricter wording.
            prompt_strict = prompt + "\n\n(HARD MODE) Repeat EXACT numbers/timecodes/line counts. Output SRT only."
            try:
                translated2 = call_gpt(client, model, prompt_strict)
                out_batch2 = parse_srt(translated2)
                ok2, rep2 = validate_srt_batch(batch, out_batch2, prev_last_end=prev_end)
                logs.append(f"Batch {i} (retry): {'OK' if ok2 else 'ISSUES'}")
                logs += rep2
                if ok2:
                    out_batch = out_batch2
                    ok = True
            except Exception as e:
                logs.append(f"[ERROR] Retry failed in batch {i}: {e}")

        out_blocks_all.extend(out_batch)
        prev_source, prev_target = batch_srt_in, blocks_to_srt(out_batch)

        # Live progress (download files are produced only at the end).
        yield blocks_to_srt(out_blocks_all), "\n".join(logs), None, None, direction
        time.sleep(0.05)

    final_srt = blocks_to_srt(out_blocks_all)
    srt_path = prepare_download_file(final_srt, ".srt")
    log_path = prepare_download_file("\n".join(logs) if logs else "Done.", ".log.txt")
    yield final_srt, "\n".join(logs) if logs else "Done.", srt_path, log_path, direction
195
+
# ---------------------------------------------------------------------------
# Gradio UI: declare the widgets top-to-bottom, then wire the two buttons.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Open Subtitle Translator (GPT-5)") as demo:
    gr.Markdown("## Open Subtitle Translator — GPT-5\nPaste your API key, upload an SRT, pick languages, and translate with strict SRT validation.\n\n"
                "**Tip:** Public Spaces should NOT include owner API keys. Users paste their own keys here.")

    # Credentials, model choice, and batching controls.
    with gr.Row():
        key = gr.Textbox(label="OpenAI API key", type="password", placeholder="sk-...", info="Used only for this session; not stored.")
        model = gr.Dropdown(choices=["gpt-5", "gpt-5-mini"], value="gpt-5", label="Model")
        approx_blocks = gr.Slider(5, 20, value=10, step=1, label="Approx. SRT blocks per batch")
        use_prev = gr.Checkbox(value=True, label="Use previous-batch target as context")

    # Language selection ("Auto-detect" only offered for the source).
    with gr.Row():
        src = gr.Dropdown(choices=["Auto-detect", "English", "Hebrew", "Spanish", "French", "German", "Arabic"], value="English", label="Source language")
        tgt = gr.Dropdown(choices=["Hebrew", "English", "Spanish", "French", "German", "Arabic"], value="Hebrew", label="Target language")

    # Pricing knobs — used only by compute_estimates, not by translation.
    with gr.Row():
        price_in = gr.Number(value=1.25, precision=2, label="Price — input $/M tokens (configurable)")
        price_out = gr.Number(value=10.0, precision=2, label="Price — output $/M tokens (configurable)")

    glossary = gr.Textbox(label="Glossary / Policy", value=DEFAULT_GLOSSARY, lines=6)
    extra = gr.Textbox(label="Extra instructions (optional)", lines=4, placeholder="Tone, domain hints, speaker info…")

    srt_in = gr.File(label="Upload SRT", file_types=[".srt"])

    with gr.Row():
        estimate_btn = gr.Button("Estimate Cost")
        run_btn = gr.Button("Translate")

    # Output panes: streamed translation preview and the validation log.
    srt_preview = gr.Textbox(label="Translated SRT (preview)", lines=18)
    log = gr.Textbox(label="Validation / Log", lines=18)

    # Download slots; hidden until compute_estimates returns visible=True updates.
    with gr.Row():
        dl_srt = gr.File(label="Download Translated SRT", visible=False)
        dl_log = gr.File(label="Download Log", visible=False)

    # Receives the "ltr"/"rtl" direction value emitted by pipeline.
    dir_state = gr.State("ltr")

    estimate_btn.click(
        fn=compute_estimates,
        inputs=[srt_in, approx_blocks, use_prev, price_in, price_out],
        outputs=[log, dl_srt, dl_log],
        api_name="estimate"
    )

    run_btn.click(
        fn=pipeline,
        inputs=[srt_in, key, src, tgt, glossary, extra, model, approx_blocks, use_prev],
        outputs=[srt_preview, log, dl_srt, dl_log, dir_state],
        api_name="translate"
    )

if __name__ == "__main__":
    demo.launch()
prompts.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Prompt builder utilities for the Open Subtitle Translator
2
+
3
+ STYLE_PREFIX = """You are a professional subtitle translator.
4
+
5
+ HARD CONSTRAINTS:
6
+ - Output SRT only between <<<SRT>>> and <<<END>>>.
7
+ - Do NOT change block numbers or timecodes.
8
+ - Do NOT add/remove lines within a block; preserve exact line breaks.
9
+ - Keep any tags/speaker labels (e.g., <i>, ♪) exactly as-is.
10
+ - No commentary or explanations outside the SRT.
11
+ """
12
+
13
+ RTL_LANGS = {"he", "ar", "fa", "ur"}
14
+
15
+ def build_prompt(source_lang: str,
16
+ target_lang: str,
17
+ batch_srt: str,
18
+ glossary_text: str | None,
19
+ extra_instructions: str | None,
20
+ prev_source: str | None,
21
+ prev_target: str | None) -> str:
22
+ """
23
+ Compose a cache-friendly prompt. Keep the prefix byte-identical across calls
24
+ to leverage provider-side prompt caching where available.
25
+ """
26
+ prefix = STYLE_PREFIX + f"""
27
+
28
+ TASK:
29
+ - Translate from {source_lang} to {target_lang}.
30
+ - Input format is SRT blocks.
31
+
32
+ STYLE & GLOSSARY (project-provided; must follow if applicable):
33
+ """
34
+ if glossary_text and glossary_text.strip():
35
+ prefix += glossary_text.strip() + "\n"
36
+ if extra_instructions and extra_instructions.strip():
37
+ prefix += "\nEXTRA INSTRUCTIONS (apply carefully):\n" + extra_instructions.strip() + "\n"
38
+
39
+ context = "\nCONTEXT (do not modify):\n"
40
+ if prev_source:
41
+ context += "[Previous batch — Source]\n" + prev_source.strip() + "\n"
42
+ if prev_target:
43
+ context += "[Previous batch — Target]\n" + prev_target.strip() + "\n"
44
+
45
+ task = (
46
+ "\nCURRENT BATCH TO TRANSLATE:\n"
47
+ + batch_srt
48
+ + "\nReturn only:\n<<<SRT>>>\n[Translated SRT blocks]\n<<<END>>>"
49
+ )
50
+ return prefix + context + task
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.44.0
2
+ openai>=1.40.0
3
+ python-dotenv>=1.0.1
4
+
5
+ langdetect>=1.0.9
srt_utils.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import re
from typing import List

# Strict SRT timecode line: "HH:MM:SS,mmm --> HH:MM:SS,mmm" (comma before
# milliseconds, single space around the arrow, nothing else on the line).
TIME_RE = re.compile(r'^\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}$')
5
+
def parse_srt(text: str) -> List[list]:
    """Split raw SRT text into blocks, each a list of its lines.

    BUG FIX: the original split only on a single LF blank line, so SRT
    files with Windows (CRLF) or old-Mac (CR) line endings parsed as one
    giant block, and extra blank lines between cues produced empty-ish
    splits. Line endings are now normalized to LF and blocks are split
    on runs of two or more newlines.
    """
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    blocks = []
    for raw in re.split(r"\n{2,}", normalized.strip()):
        lines = raw.splitlines()
        if lines:
            blocks.append(lines)
    return blocks
13
+
def blocks_to_srt(blocks: List[list]) -> str:
    """Serialize parsed blocks back to SRT text (with trailing newline).

    Inverse of parse_srt: blocks are joined by a blank line, lines
    within a block by single newlines.
    """
    rendered = ["\n".join(block) for block in blocks]
    return "\n\n".join(rendered) + "\n"
16
+
17
+ def last_end_time_ms(blocks: List[list]) -> int | None:
18
+ if not blocks:
19
+ return None
20
+ end = blocks[-1][1].split(" --> ")[1]
21
+ hh, mm, ss_ms = end.split(":")
22
+ ss, ms = ss_ms.split(",")
23
+ return (int(hh)*3600 + int(mm)*60 + int(ss))*1000 + int(ms)
24
+
def validate_srt_batch(in_blocks: List[list], out_blocks: List[list], prev_last_end: int | None = None):
    """Validate a translated batch against its source batch.

    Checks: matching block count, preserved index and timecode lines,
    valid timecode format, unchanged per-block line counts, and — when
    prev_last_end is given — that the first output block does not start
    before the previous batch ended.

    Returns (ok: bool, report_lines: list[str]).
    """
    report, ok = [], True
    if len(in_blocks) != len(out_blocks):
        ok = False
        report.append(f"[STRUCT] Block count mismatch: in={len(in_blocks)} out={len(out_blocks)}")
    for i, (ib, ob) in enumerate(zip(in_blocks, out_blocks)):
        if len(ob) < 3:
            ok = False; report.append(f"[STRUCT] Output block too short @{i}")
            continue
        if ib[0].strip() != ob[0].strip():
            ok = False; report.append(f"[INDEX] Changed @{i}: {ib[0]} → {ob[0]}")
        if ib[1].strip() != ob[1].strip():
            ok = False; report.append(f"[TIMECODE] Changed @{i}: {ib[1]} ≠ {ob[1]}")
        if not TIME_RE.match(ob[1].strip()):
            ok = False; report.append(f"[TIMECODE] Invalid format @{i}: {ob[1]}")
        if len(ib) != len(ob):
            ok = False; report.append(f"[LINES] Line-count changed @{i}: in={len(ib)} out={len(ob)}")
    # BUG FIX: the original guard `if prev_last_end` silently skipped the
    # overlap check when the previous batch ended at exactly 0 ms (falsy
    # int). Compare against None explicitly.
    if prev_last_end is not None and out_blocks:
        def to_ms(tc: str) -> int:
            hh, mm, ss_ms = tc.split(":"); ss, ms = ss_ms.split(",")
            return (int(hh)*3600 + int(mm)*60 + int(ss))*1000 + int(ms)
        first = out_blocks[0]
        # Robustness: only attempt the overlap check when the first block
        # actually has a well-formed timecode line — a malformed one was
        # already reported above, and to_ms() would crash on it.
        if len(first) >= 2 and TIME_RE.match(first[1].strip()):
            if to_ms(first[1].split(" --> ")[0]) < prev_last_end:
                ok = False; report.append("[OVERLAP] First block overlaps previous batch end time.")
    return ok, report
52
+
def split_batches(blocks: List[list], approx_blocks: int = 10):
    """Yield consecutive chunks of at most ``approx_blocks`` blocks each.

    The final chunk may be shorter; an empty input yields nothing.
    """
    starts = range(0, len(blocks), approx_blocks)
    return (blocks[s:s + approx_blocks] for s in starts)