Spaces:

armyneo
/

srtconvert

Running

App Files Files Community

armyneo committed on Nov 27, 2025

Commit

a0c3ad0

verified ·

1 Parent(s): c984900

revert normal

Browse files

Files changed (1) hide show

app.py +65 -104

app.py CHANGED Viewed

@@ -2,55 +2,19 @@ import re
 import io
 import zipfile
 from pathlib import Path
-from typing import Tuple
 import gradio as gr
 from docx import Document
 from docx.oxml import OxmlElement
 from docx.oxml.ns import qn
-from transformers import pipeline
-# ----------------------------------------------------
-# 1) ÇEVİRİ MODELİ (Helsinki-NLP / opus-mt-tc-big-en-tr)
-# ----------------------------------------------------
-MODEL_NAME = "Helsinki-NLP/opus-mt-tc-big-en-tr"
-# Public model, token vermiyoruz.
-translator = pipeline(
-    "translation",
-    model=MODEL_NAME,
-)
-def translate_en_tr(text: str) -> str:
-    """
-    Sadece TEXT için EN->TR çeviri.
-    Satır satır çeviriyoruz ki satır yapısı bozulmasın.
-    """
-    text = text.strip()
-    if not text:
-        return text
-    lines = text.splitlines()
-    out_lines = []
-    for line in lines:
-        if not line.strip():
-            out_lines.append("")
-        else:
-            out = translator(line)[0]["translation_text"]
-            out_lines.append(out)
-    return "\n".join(out_lines)
-# ----------------------------------------------------
-# 2) SRT PARSER
-# ----------------------------------------------------
 def parse_srt(path: Path):
     """
-    SRT -> [{index, start, end, text}, ...]
     """
     raw = path.read_text(encoding="utf-8-sig", errors="ignore").strip()
     blocks = re.split(r"\n\s*\n", raw)
@@ -66,10 +30,10 @@ def parse_srt(path: Path):
         if len(lines) < 2:
             continue
-        # klasik blok:
         #   1
         #   00:00:13,555 --> 00:00:17,559
-        #   WOMAN: ...
         try:
             idx = int(lines[0])
             time_line = lines[1]
@@ -99,11 +63,9 @@ def parse_srt(path: Path):
     return subs
-# ----------------------------------------------------
-# 3) KARAKTER ÇIKARMA + TEXT TEMİZLEME
-# ----------------------------------------------------
-# Örnek eşleşmeler:
 #   WOMAN: ...
 #   DR. LEWIS: ...
 #   >>> NURSE: ...
@@ -117,9 +79,9 @@ speaker_pattern = re.compile(
 def extract_character_and_clean_text(block: str):
     """
-    block içinden:
-      - Character: ilk NAME:
-      - TEXT: NAME: prefix'leri atılmış metin
     """
     if not block:
         return "", ""
@@ -142,7 +104,6 @@ def extract_character_and_clean_text(block: str):
             if after:
                 out_lines.append(after)
         else:
-            # NAME: ile başlamayan satırlar olduğu gibi kalsın
             out_lines.append(original)
     out_lines = [ln for ln in out_lines if ln.strip()]
@@ -152,7 +113,7 @@ def extract_character_and_clean_text(block: str):
 def start_time_to_mm_ss(start: str) -> str:
     """
     'HH:MM:SS,mmm' -> 'MM.SS'
-    (toplam dakika . saniye)
     """
     hms, *_ = start.split(",")
     h, m, s = [int(x) for x in hms.split(":")]
@@ -162,46 +123,49 @@ def start_time_to_mm_ss(start: str) -> str:
     return f"{total_minutes:02d}.{seconds:02d}"
-# ----------------------------------------------------
-# 4) DOCX OLUŞTURMA
-# ----------------------------------------------------
-def style_header_cell(cell, text: str):
     """
-    Header hücresi: bold + gri background.
     """
     p = cell.paragraphs[0]
-    # eski run'ları temizle
     for r in p.runs:
         r.text = ""
-    run = p.add_run(text)
     run.bold = True
-    # arka plan shading
     tc = cell._tc
     tcPr = tc.get_or_add_tcPr()
     shd = tcPr.find(qn("w:shd"))
     if shd is None:
         shd = OxmlElement("w:shd")
         tcPr.append(shd)
-    shd.set(qn("w:fill"), "D9D9D9")  # light grey
-def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str]:
     """
-    Tek SRT -> styled DOCX (bytes, filename)
     """
     subs = parse_srt(srt_path)
     doc = Document()
-    # TABLE: Character | TC | note | TEXT
     table = doc.add_table(rows=1, cols=4)
-    table.style = "Table Grid"  # border çizgileri
     hdr_cells = table.rows[0].cells
     headers = ["Character", "TC", "note", "TEXT"]
     for idx, label in enumerate(headers):
-        style_header_cell(hdr_cells[idx], label)
     for sub in subs:
         raw_text = sub["text"]
@@ -215,21 +179,19 @@ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str
         row = table.add_row()
         cells = row.cells
-        # Character -> ASLA çevirmiyoruz
         cells[0].text = character
-        # TC -> MM.SS (start time only)
         cells[1].text = start_time_to_mm_ss(sub["start"])
-        # note -> boş
         cells[2].text = ""
-        # TEXT -> isteğe bağlı TR çevir
-        if translate_to_tr:
-            cells[3].text = translate_en_tr(clean_txt)
-        else:
-            cells[3].text = clean_txt
     buffer = io.BytesIO()
     doc.save(buffer)
     buffer.seek(0)
@@ -238,50 +200,56 @@ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str
     return buffer.getvalue(), out_name
-# ----------------------------------------------------
-# 5) GRADIO ÇAĞRI FONKSİYONU (MULTI SRT -> ZIP)
-# ----------------------------------------------------
-def process_srt_files(files, translate_to_tr: bool):
     """
-    Çoklu SRT al, hepsini DOCX'e çevir, tek ZIP döndür.
     """
     if not files:
         return None
-    # Gradio type="filepath" -> direkt string path listesi
-    paths = [Path(p) for p in files]
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
         for path in paths:
-            doc_bytes, doc_name = srt_to_docx_bytes(path, translate_to_tr)
             zf.writestr(doc_name, doc_bytes)
     zip_buffer.seek(0)
-    out_zip_path = "converted_subtitles.zip"
     with open(out_zip_path, "wb") as f:
         f.write(zip_buffer.read())
-    return out_zip_path
-# ----------------------------------------------------
-# 6) GRADIO UI
-# ----------------------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown(
         """
-        # SRT → DOCX (Character / TC / TEXT) + EN→TR Çeviri (Helsinki)
-        - Bir veya birden fazla **.srt** yükle.
-        - Her satır için:
-          - **Character**: `WOMAN:`, `LEWIS:`, `NURSE:` gibi isimler çıkarılır (**çeviri yok**).
-          - **TC**: sadece **MM.SS** (start time'dan).
-          - **TEXT**: `NAME:` prefix'leri atılmış metin.
-        - İstersen TEXT'i **Helsinki-NLP/opus-mt-tc-big-en-tr** ile Türkçe'ye çevir (Character asla çevrilmez).
-        - Çıktı: Tüm DOCX'leri içeren tek bir **ZIP**.
         """
     )
@@ -289,22 +257,15 @@ with gr.Blocks() as demo:
         srt_files = gr.File(
             label="Upload .srt files",
             file_types=[".srt"],
-            file_count="multiple",
-            type="filepath",  # Gradio -> string path list
         )
-    translate_chk = gr.Checkbox(
-        label="Translate TEXT (EN → TR, only TEXT, not Character)",
-        value=False,
-    )
     out_zip = gr.File(label="Download ZIP of DOCX files")
-    convert_btn = gr.Button("Convert")
     convert_btn.click(
         fn=process_srt_files,
-        inputs=[srt_files, translate_chk],
         outputs=out_zip,
     )

 import io
 import zipfile
 from pathlib import Path
 import gradio as gr
 from docx import Document
 from docx.oxml import OxmlElement
 from docx.oxml.ns import qn
+# ---------- SRT PARSER ----------
 def parse_srt(path: Path):
     """
+    Parse .srt file into a list of:
+      {index, start, end, text}
     """
     raw = path.read_text(encoding="utf-8-sig", errors="ignore").strip()
     blocks = re.split(r"\n\s*\n", raw)
         if len(lines) < 2:
             continue
+        # typical block:
         #   1
         #   00:00:13,555 --> 00:00:17,559
+        #   WOMAN: text...
         try:
             idx = int(lines[0])
             time_line = lines[1]
     return subs
+# ---------- CHARACTER + TEXT CLEANING ----------
+# Matches lines like:
 #   WOMAN: ...
 #   DR. LEWIS: ...
 #   >>> NURSE: ...
 def extract_character_and_clean_text(block: str):
     """
+    From a subtitle block, extract:
+      - character (first detected NAME:)
+      - text without NAME: prefix lines
     """
     if not block:
         return "", ""
             if after:
                 out_lines.append(after)
         else:
             out_lines.append(original)
     out_lines = [ln for ln in out_lines if ln.strip()]
 def start_time_to_mm_ss(start: str) -> str:
     """
     'HH:MM:SS,mmm' -> 'MM.SS'
+    (total minutes . seconds)
     """
     hms, *_ = start.split(",")
     h, m, s = [int(x) for x in hms.split(":")]
     return f"{total_minutes:02d}.{seconds:02d}"
+# ---------- DOCX GENERATION ----------
def add_header_styling(cell):
    """
    Style a table header cell: bold text on a light grey background.

    Every existing run in the cell's first paragraph is blanked, then a new
    empty bold run is appended as that paragraph's last run. The header label
    is not written here; the caller sets it on the appended run.

    Args:
        cell: table cell to style. It must expose ``paragraphs`` and ``_tc``
            (presumably a python-docx ``_Cell`` -- confirm against caller).
    """
    paragraph = cell.paragraphs[0]

    # Blank the existing runs so only the new bold run carries visible text.
    for existing_run in paragraph.runs:
        existing_run.text = ""
    header_run = paragraph.add_run()
    header_run.bold = True

    # Cell background: reuse the cell's <w:shd> element if there is one,
    # otherwise create it under the cell properties.
    tc_pr = cell._tc.get_or_add_tcPr()
    shd = tc_pr.find(qn("w:shd"))
    if shd is None:
        shd = OxmlElement("w:shd")
        tc_pr.append(shd)

    # w:val is a required attribute of w:shd; without it the element is
    # schema-invalid. "clear" means "no pattern, show the fill colour".
    # An existing pattern value is left untouched.
    if shd.get(qn("w:val")) is None:
        shd.set(qn("w:val"), "clear")
    shd.set(qn("w:fill"), "D9D9D9")  # light grey
+def srt_to_docx_bytes(srt_path: Path) -> tuple[bytes, str]:
     """
+    Convert one SRT file to a styled DOCX in memory.
+    Returns (docx_bytes, suggested_filename).
     """
     subs = parse_srt(srt_path)
     doc = Document()
+    # Create a table: Character | TC | note | TEXT
     table = doc.add_table(rows=1, cols=4)
+    table.style = "Table Grid"  # border lines
     hdr_cells = table.rows[0].cells
     headers = ["Character", "TC", "note", "TEXT"]
     for idx, label in enumerate(headers):
+        cell = hdr_cells[idx]
+        add_header_styling(cell)
+        # set header text into the bold run we created
+        cell.paragraphs[0].runs[-1].text = label
     for sub in subs:
         raw_text = sub["text"]
         row = table.add_row()
         cells = row.cells
+        # Character
         cells[0].text = character
+        # TC as MM.SS from START only
         cells[1].text = start_time_to_mm_ss(sub["start"])
+        # note (blank)
         cells[2].text = ""
+        # TEXT (cleaned, without NAME:)
+        cells[3].text = clean_txt
+    # Serialize to bytes
     buffer = io.BytesIO()
     doc.save(buffer)
     buffer.seek(0)
     return buffer.getvalue(), out_name
+# ---------- GRADIO LOGIC ----------
def process_srt_files(files):
    """
    Gradio callback: convert uploaded .srt files into one ZIP of .docx files.

    Args:
        files: the uploaded files as delivered by ``gr.File``. Each item may
            be a dict with a ``"name"`` key, an object with a ``.name``
            attribute, or a plain path string, depending on the Gradio
            version. A single item that is not wrapped in a list is accepted
            as well.

    Returns:
        Path (as ``str``) of the generated ``converted_subtitles.zip``, or
        ``None`` when nothing was uploaded.
    """
    # Function-scope import: keeps this block independent of the file header.
    import tempfile

    if not files:
        return None
    # A bare path string would otherwise be iterated character by character.
    if not isinstance(files, (list, tuple)):
        files = [files]

    # Normalize every upload to a Path.
    paths: list[Path] = []
    for item in files:
        if isinstance(item, dict) and "name" in item:
            paths.append(Path(item["name"]))
        elif hasattr(item, "name"):
            paths.append(Path(item.name))
        else:
            paths.append(Path(str(item)))

    # Each call writes into its own temp directory so concurrent requests
    # cannot overwrite one another's archive; the download name is unchanged.
    # NOTE(review): these directories are not cleaned up here -- confirm the
    # host (or Gradio's cache handling) reclaims temp space.
    out_dir = Path(tempfile.mkdtemp(prefix="srtconvert_"))
    out_zip_path = out_dir / "converted_subtitles.zip"

    used_names: set[str] = set()
    # Write straight to disk; no intermediate in-memory copy of the archive.
    with zipfile.ZipFile(out_zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in paths:
            doc_bytes, doc_name = srt_to_docx_bytes(path)

            # Two uploads with the same name would create duplicate ZIP
            # entries (one is lost on extraction); suffix later ones.
            entry_name = doc_name
            counter = 2
            while entry_name in used_names:
                base = Path(doc_name)
                entry_name = base.with_name(
                    f"{base.stem} ({counter}){base.suffix}"
                ).as_posix()
                counter += 1
            used_names.add(entry_name)

            zf.writestr(entry_name, doc_bytes)

    return str(out_zip_path)
+# ---------- GRADIO UI ----------
 with gr.Blocks() as demo:
     gr.Markdown(
         """
+        # SRT → DOCX Subtitle Converter
+        - Upload one or more **.srt** files.
+        - For each subtitle:
+          - **Character**: inferred from lines like `WOMAN:`, `LEWIS:`, `NURSE:`, etc.
+          - **TC**: start time as **MM.SS** (no hour, no ms).
+          - **TEXT**: subtitle text **without** the `NAME:` prefix.
+        - Output: a single **ZIP** with one DOCX per SRT.
         """
     )
         srt_files = gr.File(
             label="Upload .srt files",
             file_types=[".srt"],
+            file_count="multiple"
         )
     out_zip = gr.File(label="Download ZIP of DOCX files")
+    convert_btn = gr.Button("Convert to DOCX")
     convert_btn.click(
         fn=process_srt_files,
+        inputs=srt_files,
         outputs=out_zip,
     )