Spaces:

armyneo
/

srtconvert

Running

App Files Files Community

armyneo committed on Nov 27, 2025

Commit

ca48fe5

verified ·

1 Parent(s): a62cfa6

Create app.py

Browse files

Files changed (1) hide show

app.py +273 -0

app.py ADDED Viewed

	@@ -0,0 +1,273 @@

import io
import re
import tempfile
import zipfile
from pathlib import Path

import gradio as gr
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
+# ---------- SRT PARSER ----------
def parse_srt(path: Path):
    """
    Read an .srt file and return its cues as a list of dicts.

    Each dict has the keys ``index`` (int, or ``None`` when the cue has no
    numeric index line), ``start`` and ``end`` (``HH:MM:SS,mmm`` strings) and
    ``text`` (the remaining lines joined with ``\\n``; may be empty).

    Cues are separated by blank lines. A cue with fewer than two non-empty
    lines, or whose timing line does not start with a
    ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` pair, is skipped.
    """
    timing = re.compile(
        r"(?P<start>\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*"
        r"(?P<end>\d{2}:\d{2}:\d{2},\d{3})"
    )
    # utf-8-sig drops a leading BOM; undecodable bytes are discarded.
    content = path.read_text(encoding="utf-8-sig", errors="ignore").strip()

    cues = []
    for chunk in re.split(r"\n\s*\n", content):
        rows = [row for row in map(str.strip, chunk.splitlines()) if row]
        if len(rows) < 2:
            continue

        # Usual layout is index / timing / text...; when the first line is
        # not an integer, treat it as the timing line instead.
        try:
            number = int(rows[0])
        except ValueError:
            number, timing_row, body_rows = None, rows[0], rows[1:]
        else:
            timing_row, body_rows = rows[1], rows[2:]

        found = timing.match(timing_row)
        if found is None:
            continue

        cues.append(
            {
                "index": number,
                "start": found.group("start"),
                "end": found.group("end"),
                "text": "\n".join(body_rows),
            }
        )
    return cues
+# ---------- CHARACTER + TEXT CLEANING ----------
# Recognises a speaker label at the start of a subtitle line, e.g.
#   WOMAN: ...
#   DR. LEWIS: ...
#   >>> NURSE: ...
#   -NURSE: ...
# The label is exposed as group "name", the rest of the line as "after".

# Optional leading whitespace, up to three '>' and any number of dashes.
_SPEAKER_LEAD = r'^\s*(?:>{1,3}\s*)?(?:-+\s*)?'
# One to five words, each starting with an upper-case letter followed by at
# least one more upper-case letter, digit, '.' or apostrophe.
_SPEAKER_NAME = r'(?P<name>(?:[A-Z][A-Z0-9.\']+(?:\s+[A-Z][A-Z0-9.\']+){0,4}))'
# The colon separator and whatever follows it on the line.
_SPEAKER_REST = r'\s*:\s*(?P<after>.*)$'

speaker_pattern = re.compile(_SPEAKER_LEAD + _SPEAKER_NAME + _SPEAKER_REST)
def extract_character_and_clean_text(block: str):
    """
    Split a subtitle block into (character, text).

    ``character`` is the name from the first line that matches
    ``speaker_pattern`` ("" when no line matches). ``text`` is the block with
    every ``NAME:`` prefix removed, blank lines dropped, and the remaining
    lines stripped and joined with ``\\n``. An empty block yields ("", "").
    """
    if not block:
        return "", ""

    speaker = ""
    kept = []
    for entry in map(str.strip, block.splitlines()):
        if not entry:
            continue

        hit = speaker_pattern.match(entry)
        if hit is None:
            # Plain dialogue line: keep it as-is.
            kept.append(entry)
            continue

        # Only the first label names the character; later labels are still
        # stripped from the text.
        speaker = speaker or hit.group("name").strip()
        remainder = hit.group("after").rstrip()
        if remainder:
            kept.append(remainder)

    return speaker, "\n".join(kept)
def start_time_to_mm_ss(start: str) -> str:
    """
    Convert an 'HH:MM:SS,mmm' timestamp to 'MM.SS'.

    Hours are folded into the minutes field (total minutes), milliseconds
    are discarded, and both fields are zero-padded to at least two digits.
    """
    clock = start.split(",")[0]
    hours, minutes, secs = map(int, clock.split(":"))
    whole_minutes, leftover = divmod(hours * 3600 + minutes * 60 + secs, 60)
    return f"{whole_minutes:02d}.{leftover:02d}"
+# ---------- DOCX GENERATION ----------
def add_header_styling(cell):
    """
    Prepare a table cell as a header: bold text on a light grey background.

    The text of any existing runs in the cell's first paragraph is blanked
    and a new, empty bold run is appended; callers put the header label into
    that last run. The cell's shading fill is set to D9D9D9.
    """
    p = cell.paragraphs[0]
    # Blank existing runs so only the new bold run carries text.
    for r in p.runs:
        r.text = ""
    run = p.add_run()
    run.bold = True

    # Set shading (background) directly on the cell properties XML.
    tcPr = cell._tc.get_or_add_tcPr()
    shd = tcPr.find(qn("w:shd"))
    if shd is None:
        shd = OxmlElement("w:shd")
        tcPr.append(shd)
    # w:val is a required attribute of w:shd in the WordprocessingML schema;
    # "clear" means no pattern, so only the fill colour shows. An existing
    # value is left untouched.
    if shd.get(qn("w:val")) is None:
        shd.set(qn("w:val"), "clear")
    shd.set(qn("w:fill"), "D9D9D9")  # light gray
def srt_to_docx_bytes(srt_path: Path) -> tuple[bytes, str]:
    """
    Render one SRT file as a DOCX table, entirely in memory.

    The document holds a single "Table Grid" table with the columns
    Character | TC | note | TEXT. Each cue that still has text after the
    speaker label is removed becomes one row; "note" is left blank.

    Returns (docx_bytes, suggested_filename), where the filename is the
    SRT's name with a ".docx" suffix.
    """
    cues = parse_srt(srt_path)
    document = Document()

    column_titles = ["Character", "TC", "note", "TEXT"]
    grid = document.add_table(rows=1, cols=4)
    grid.style = "Table Grid"  # border lines

    for header_cell, title in zip(grid.rows[0].cells, column_titles):
        add_header_styling(header_cell)
        # The label goes into the bold run that add_header_styling appended.
        header_cell.paragraphs[0].runs[-1].text = title

    for cue in cues:
        spoken = cue["text"]
        if not spoken.strip():
            continue

        who, cleaned = extract_character_and_clean_text(spoken)
        if not cleaned.strip():
            continue

        # Character, TC (MM.SS from the start time only), blank note, text.
        values = (who, start_time_to_mm_ss(cue["start"]), "", cleaned)
        for target, value in zip(grid.add_row().cells, values):
            target.text = value

    # Serialize to bytes
    stream = io.BytesIO()
    document.save(stream)
    return stream.getvalue(), srt_path.with_suffix(".docx").name
+# ---------- GRADIO LOGIC ----------
def _upload_to_path(upload) -> Path:
    """Turn one uploaded-file value (path, dict or file-like object) into a Path."""
    # Check real paths first: a Path also has ``.name``, which is only its
    # basename, so the attribute branch below would drop the directory.
    if isinstance(upload, (str, Path)):
        return Path(upload)
    if isinstance(upload, dict) and "name" in upload:
        return Path(upload["name"])
    if hasattr(upload, "name"):
        return Path(upload.name)
    return Path(str(upload))


def _unique_member_name(name: str, taken: set[str]) -> str:
    """Return *name*, or a numbered variant of it, not yet in *taken*; record it."""
    candidate = name
    stem, suffix = Path(name).stem, Path(name).suffix
    counter = 2
    while candidate in taken:
        candidate = f"{stem}_{counter}{suffix}"
        counter += 1
    taken.add(candidate)
    return candidate


def process_srt_files(files):
    """
    Gradio callback:
      files: list of uploaded .srt files
      returns: path to a ZIP containing all .docx results,
               or None when nothing was uploaded

    Each call writes ``converted_subtitles.zip`` into its own fresh
    temporary directory, so concurrent requests cannot overwrite each
    other's output. The directory is not removed here because the returned
    file must still exist after this function returns.

    Uploads that would produce the same .docx name (same basename) get a
    numeric suffix (``name_2.docx``, ...) so no ZIP entry is duplicated.
    """
    if not files:
        return None

    # Gradio may pass dict, tempfile, or path string depending on version
    paths = [_upload_to_path(f) for f in files]

    out_dir = Path(tempfile.mkdtemp(prefix="srtconvert_"))
    out_zip_path = out_dir / "converted_subtitles.zip"

    used_names: set[str] = set()
    with zipfile.ZipFile(out_zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in paths:
            doc_bytes, doc_name = srt_to_docx_bytes(path)
            zf.writestr(_unique_member_name(doc_name, used_names), doc_bytes)

    return str(out_zip_path)
+# ---------- GRADIO UI ----------
# Build the Blocks layout: an intro, a multi-file .srt upload, a file output
# for the resulting ZIP, and a button that runs the conversion.
with gr.Blocks() as demo:
    # User-facing description of what each output column contains.
    gr.Markdown(
        """
        # SRT → DOCX Subtitle Converter
        - Upload one or more **.srt** files.
        - For each subtitle:
          - **Character**: inferred from lines like `WOMAN:`, `LEWIS:`, `NURSE:`, etc.
          - **TC**: start time as **MM.SS** (no hour, no ms).
          - **TEXT**: subtitle text **without** the `NAME:` prefix.
        - Output: a single **ZIP** with one DOCX per SRT.
        """
    )
    with gr.Row():
        # Upload widget limited to .srt files; several may be selected.
        srt_files = gr.File(
            label="Upload .srt files",
            file_types=[".srt"],
            file_count="multiple"
        )
    # Receives the ZIP path returned by process_srt_files.
    out_zip = gr.File(label="Download ZIP of DOCX files")
    convert_btn = gr.Button("Convert to DOCX")
    # Clicking the button passes the uploads to process_srt_files and shows
    # its return value in out_zip.
    convert_btn.click(
        fn=process_srt_files,
        inputs=srt_files,
        outputs=out_zip,
    )
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()