Spaces:

armyneo
/

srtconvert

Running

App Files Files Community

armyneo committed on Nov 28, 2025

Commit

420e1ab

verified ·

1 Parent(s): a0c3ad0

app.py instal

Browse files

Files changed (1) hide show

app.py +116 -65

app.py CHANGED Viewed

@@ -2,19 +2,67 @@ import re
 import io
 import zipfile
 from pathlib import Path
 import gradio as gr
 from docx import Document
 from docx.oxml import OxmlElement
 from docx.oxml.ns import qn
-# ---------- SRT PARSER ----------
 def parse_srt(path: Path):
     """
-    Parse .srt file into a list of:
-      {index, start, end, text}
     """
     raw = path.read_text(encoding="utf-8-sig", errors="ignore").strip()
     blocks = re.split(r"\n\s*\n", raw)
@@ -30,10 +78,10 @@ def parse_srt(path: Path):
         if len(lines) < 2:
             continue
-        # typical block:
         #   1
         #   00:00:13,555 --> 00:00:17,559
-        #   WOMAN: text...
         try:
             idx = int(lines[0])
             time_line = lines[1]
@@ -63,9 +111,11 @@ def parse_srt(path: Path):
     return subs
-# ---------- CHARACTER + TEXT CLEANING ----------
-# Matches lines like:
 #   WOMAN: ...
 #   DR. LEWIS: ...
 #   >>> NURSE: ...
@@ -79,9 +129,9 @@ speaker_pattern = re.compile(
 def extract_character_and_clean_text(block: str):
     """
-    From a subtitle block, extract:
-      - character (first detected NAME:)
-      - text without NAME: prefix lines
     """
     if not block:
         return "", ""
@@ -104,6 +154,7 @@ def extract_character_and_clean_text(block: str):
             if after:
                 out_lines.append(after)
         else:
             out_lines.append(original)
     out_lines = [ln for ln in out_lines if ln.strip()]
@@ -113,7 +164,7 @@ def extract_character_and_clean_text(block: str):
 def start_time_to_mm_ss(start: str) -> str:
     """
     'HH:MM:SS,mmm' -> 'MM.SS'
-    (total minutes . seconds)
     """
     hms, *_ = start.split(",")
     h, m, s = [int(x) for x in hms.split(":")]
@@ -123,49 +174,44 @@ def start_time_to_mm_ss(start: str) -> str:
     return f"{total_minutes:02d}.{seconds:02d}"
-# ---------- DOCX GENERATION ----------
-def add_header_styling(cell):
     """
-    Bold header + light grey background for header cells.
     """
     p = cell.paragraphs[0]
-    # Clear existing runs
     for r in p.runs:
         r.text = ""
-    run = p.add_run()
     run.bold = True
-    # Set shading (background)
     tc = cell._tc
     tcPr = tc.get_or_add_tcPr()
     shd = tcPr.find(qn("w:shd"))
     if shd is None:
         shd = OxmlElement("w:shd")
         tcPr.append(shd)
-    shd.set(qn("w:fill"), "D9D9D9")  # light gray
-def srt_to_docx_bytes(srt_path: Path) -> tuple[bytes, str]:
     """
-    Convert one SRT file to a styled DOCX in memory.
-    Returns (docx_bytes, suggested_filename).
     """
     subs = parse_srt(srt_path)
     doc = Document()
-    # Create a table: Character | TC | note | TEXT
     table = doc.add_table(rows=1, cols=4)
-    table.style = "Table Grid"  # border lines
     hdr_cells = table.rows[0].cells
     headers = ["Character", "TC", "note", "TEXT"]
     for idx, label in enumerate(headers):
-        cell = hdr_cells[idx]
-        add_header_styling(cell)
-        # set header text into the bold run we created
-        cell.paragraphs[0].runs[-1].text = label
     for sub in subs:
         raw_text = sub["text"]
@@ -179,19 +225,21 @@ def srt_to_docx_bytes(srt_path: Path) -> tuple[bytes, str]:
         row = table.add_row()
         cells = row.cells
-        # Character
         cells[0].text = character
-        # TC as MM.SS from START only
         cells[1].text = start_time_to_mm_ss(sub["start"])
-        # note (blank)
         cells[2].text = ""
-        # TEXT (cleaned, without NAME:)
-        cells[3].text = clean_txt
-    # Serialize to bytes
     buffer = io.BytesIO()
     doc.save(buffer)
     buffer.seek(0)
@@ -200,56 +248,52 @@ def srt_to_docx_bytes(srt_path: Path) -> tuple[bytes, str]:
     return buffer.getvalue(), out_name
-# ---------- GRADIO LOGIC ----------
-def process_srt_files(files):
     """
-    Gradio callback:
-      files: list of uploaded .srt files
-      returns: path to a ZIP containing all .docx results
     """
     if not files:
         return None
-    # Normalize to Path objects
-    paths: list[Path] = []
-    for f in files:
-        # Gradio may pass dict, tempfile, or path string depending on version
-        if isinstance(f, dict) and "name" in f:
-            paths.append(Path(f["name"]))
-        elif hasattr(f, "name"):
-            paths.append(Path(f.name))
-        else:
-            paths.append(Path(str(f)))
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
         for path in paths:
-            doc_bytes, doc_name = srt_to_docx_bytes(path)
-            # add to zip
             zf.writestr(doc_name, doc_bytes)
     zip_buffer.seek(0)
-    out_zip_path = Path("converted_subtitles.zip")
     with open(out_zip_path, "wb") as f:
         f.write(zip_buffer.read())
-    return str(out_zip_path)
-# ---------- GRADIO UI ----------
 with gr.Blocks() as demo:
     gr.Markdown(
         """
-        # SRT → DOCX Subtitle Converter
-        - Upload one or more **.srt** files.
-        - For each subtitle:
-          - **Character**: inferred from lines like `WOMAN:`, `LEWIS:`, `NURSE:`, etc.
-          - **TC**: start time as **MM.SS** (no hour, no ms).
-          - **TEXT**: subtitle text **without** the `NAME:` prefix.
-        - Output: a single **ZIP** with one DOCX per SRT.
         """
     )
@@ -257,15 +301,22 @@ with gr.Blocks() as demo:
         srt_files = gr.File(
             label="Upload .srt files",
             file_types=[".srt"],
-            file_count="multiple"
         )
     out_zip = gr.File(label="Download ZIP of DOCX files")
-    convert_btn = gr.Button("Convert to DOCX")
     convert_btn.click(
         fn=process_srt_files,
-        inputs=srt_files,
         outputs=out_zip,
     )

 import io
 import zipfile
 from pathlib import Path
+from typing import Tuple, List
 import gradio as gr
 from docx import Document
 from docx.oxml import OxmlElement
 from docx.oxml.ns import qn
+from transformers import pipeline
+# ----------------------------------------------------
+# 1) TRANSLATION MODEL (use a lighter model)
+# ----------------------------------------------------
+# "tc-big" is too heavy and can cause trouble on CPU basic.
+MODEL_NAME = "Helsinki-NLP/opus-mt-en-tr"
+# Public model, no token needed. Run on CPU (device=-1).
+translator = pipeline(
+    "translation",
+    model=MODEL_NAME,
+    device=-1,
+)
+def translate_en_tr(text: str) -> str:
+    """
+    EN->TR çeviri.
+    Satır yapısını korumak için satırları ayırıyoruz ama
+    modeli batch halde tek seferde çağırıyoruz.
+    """
+    text = (text or "").strip()
+    if not text:
+        return text
+    lines = text.splitlines()
+    # Boş olmayan satırların indekslerini topla
+    non_empty_idx: List[int] = [i for i, ln in enumerate(lines) if ln.strip()]
+    to_translate: List[str] = [lines[i] for i in non_empty_idx]
+    if not to_translate:
+        return text
+    # Batch çeviri (tek model çağrısı)
+    outputs = translator(to_translate, max_length=512)
+    translated = [o["translation_text"] for o in outputs]
+    # Çevirilen satırları eski yerlerine koy
+    out_lines = list(lines)
+    for j, idx in enumerate(non_empty_idx):
+        out_lines[idx] = translated[j]
+    return "\n".join(out_lines)
+# ----------------------------------------------------
+# 2) SRT PARSER
+# ----------------------------------------------------
 def parse_srt(path: Path):
     """
+    SRT -> [{index, start, end, text}, ...]
     """
     raw = path.read_text(encoding="utf-8-sig", errors="ignore").strip()
     blocks = re.split(r"\n\s*\n", raw)
         if len(lines) < 2:
             continue
+        # klasik blok:
         #   1
         #   00:00:13,555 --> 00:00:17,559
+        #   WOMAN: ...
         try:
             idx = int(lines[0])
             time_line = lines[1]
     return subs
+# ----------------------------------------------------
+# 3) KARAKTER ÇIKARMA + TEXT TEMİZLEME
+# ----------------------------------------------------
+# Örnek eşleşmeler:
 #   WOMAN: ...
 #   DR. LEWIS: ...
 #   >>> NURSE: ...
 def extract_character_and_clean_text(block: str):
     """
+    block içinden:
+      - Character: ilk NAME:
+      - TEXT: NAME: prefix'leri atılmış metin
     """
     if not block:
         return "", ""
             if after:
                 out_lines.append(after)
         else:
+            # NAME: ile başlamayan satırlar olduğu gibi kalsın
             out_lines.append(original)
     out_lines = [ln for ln in out_lines if ln.strip()]
 def start_time_to_mm_ss(start: str) -> str:
     """
     'HH:MM:SS,mmm' -> 'MM.SS'
+    (toplam dakika . saniye)
     """
     hms, *_ = start.split(",")
     h, m, s = [int(x) for x in hms.split(":")]
     return f"{total_minutes:02d}.{seconds:02d}"
+# ----------------------------------------------------
+# 4) DOCX OLUŞTURMA
+# ----------------------------------------------------
+def style_header_cell(cell, text: str):
     """
+    Header hücresi: bold + gri background.
     """
     p = cell.paragraphs[0]
     for r in p.runs:
         r.text = ""
+    run = p.add_run(text)
     run.bold = True
     tc = cell._tc
     tcPr = tc.get_or_add_tcPr()
     shd = tcPr.find(qn("w:shd"))
     if shd is None:
         shd = OxmlElement("w:shd")
         tcPr.append(shd)
+    shd.set(qn("w:fill"), "D9D9D9")  # light grey
+def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str]:
     """
+    Tek SRT -> styled DOCX (bytes, filename)
     """
     subs = parse_srt(srt_path)
     doc = Document()
+    # TABLE: Character | TC | note | TEXT
     table = doc.add_table(rows=1, cols=4)
+    table.style = "Table Grid"
     hdr_cells = table.rows[0].cells
     headers = ["Character", "TC", "note", "TEXT"]
     for idx, label in enumerate(headers):
+        style_header_cell(hdr_cells[idx], label)
     for sub in subs:
         raw_text = sub["text"]
         row = table.add_row()
         cells = row.cells
+        # Character -> ASLA çevirmiyoruz
         cells[0].text = character
+        # TC -> MM.SS (start time)
         cells[1].text = start_time_to_mm_ss(sub["start"])
+        # note -> boş
         cells[2].text = ""
+        # TEXT -> isteğe bağlı TR çeviri
+        if translate_to_tr:
+            cells[3].text = translate_en_tr(clean_txt)
+        else:
+            cells[3].text = clean_txt
     buffer = io.BytesIO()
     doc.save(buffer)
     buffer.seek(0)
     return buffer.getvalue(), out_name
+# ----------------------------------------------------
+# 5) GRADIO ÇAĞRI FONKSİYONU (MULTI SRT -> ZIP)
+# ----------------------------------------------------
+def process_srt_files(files, translate_to_tr: bool):
     """
+    Çoklu SRT al, hepsini DOCX'e çevir, tek ZIP döndür.
+    Gradio output için path döndürüyoruz.
     """
     if not files:
         return None
+    # Gr.File(type="filepath") -> string path listesi
+    paths = [Path(p) for p in files]
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
         for path in paths:
+            doc_bytes, doc_name = srt_to_docx_bytes(path, translate_to_tr)
             zf.writestr(doc_name, doc_bytes)
     zip_buffer.seek(0)
+    out_zip_path = "converted_subtitles.zip"
     with open(out_zip_path, "wb") as f:
         f.write(zip_buffer.read())
+    return out_zip_path
+# ----------------------------------------------------
+# 6) GRADIO UI
+# ----------------------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown(
         """
+        # SRT → DOCX (Character / TC / TEXT) + EN→TR Çeviri
+        - Bir veya birden fazla **.srt** yükle.
+        - Her satır için:
+          - **Character**: `WOMAN:`, `LEWIS:`, `NURSE:` gibi isimler çıkarılır (**çeviri yok**).
+          - **TC**: sadece **MM.SS** (start time'dan).
+          - **TEXT**: `NAME:` prefix'leri atılmış metin.
+        - İstersen TEXT'i **EN→TR** çevir.
+        - Çıktı: Tüm DOCX'leri içeren tek bir **ZIP**.
         """
     )
         srt_files = gr.File(
             label="Upload .srt files",
             file_types=[".srt"],
+            file_count="multiple",
+            type="filepath",
         )
+    translate_chk = gr.Checkbox(
+        label="Translate TEXT (EN → TR, only TEXT, not Character)",
+        value=False,
+    )
     out_zip = gr.File(label="Download ZIP of DOCX files")
+    convert_btn = gr.Button("Convert")
     convert_btn.click(
         fn=process_srt_files,
+        inputs=[srt_files, translate_chk],
         outputs=out_zip,
     )