armyneo committed on
Commit
ca48fe5
·
verified ·
1 Parent(s): a62cfa6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +273 -0
app.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import io
3
+ import zipfile
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ from docx import Document
8
+ from docx.oxml import OxmlElement
9
+ from docx.oxml.ns import qn
10
+
11
+
12
+ # ---------- SRT PARSER ----------
13
+
14
def parse_srt(path: Path):
    """
    Parse an .srt file into a list of cue dicts.

    Each dict has the keys:
      - ``index``: the cue counter as an int, or None when the block has
        no numeric counter line
      - ``start`` / ``end``: timestamps normalised to ``HH:MM:SS,mmm``
      - ``text``: the cue's non-empty lines, stripped and joined with "\\n"

    Blocks are separated by blank lines; a block without a recognisable
    timing line is skipped.  Besides the canonical ``HH:MM:SS,mmm`` form,
    timestamps with a single-digit hour or a ``.`` millisecond separator
    are accepted, so files written by looser tools are not silently
    dropped.
    """
    raw = path.read_text(encoding="utf-8-sig", errors="ignore").strip()

    time_re = re.compile(
        r"(?P<start>\d{1,2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*"
        r"(?P<end>\d{1,2}:\d{2}:\d{2}[,.]\d{3})"
    )

    def canonical(stamp: str) -> str:
        # "0:00:13.555" -> "00:00:13,555"; canonical stamps pass through
        # unchanged (they are already 12 characters and use a comma).
        return stamp.replace(".", ",").zfill(12)

    subs = []
    for block in re.split(r"\n\s*\n", raw):
        lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
        if len(lines) < 2:
            continue

        # Typical block:
        #   1
        #   00:00:13,555 --> 00:00:17,559
        #   WOMAN: text...
        # The counter line is optional; without it the timing line is first.
        try:
            idx = int(lines[0])
        except ValueError:
            idx = None
            time_line, text_lines = lines[0], lines[1:]
        else:
            time_line, text_lines = lines[1], lines[2:]

        m = time_re.match(time_line)
        if not m:
            continue

        subs.append(
            {
                "index": idx,
                "start": canonical(m.group("start")),
                "end": canonical(m.group("end")),
                "text": "\n".join(text_lines),
            }
        )

    return subs
64
+
65
+
66
+ # ---------- CHARACTER + TEXT CLEANING ----------
67
+
68
+ # Matches lines like:
69
+ # WOMAN: ...
70
+ # DR. LEWIS: ...
71
+ # >>> NURSE: ...
72
+ # -NURSE: ...
73
# Optional ">>>" and/or "-" lead-in, then an upper-case speaker name of one
# to five words (letters, digits, "." and "'" allowed after the first
# letter of each word), a colon, and whatever follows on the same line.
speaker_pattern = re.compile(
    r'^\s*(?:>{1,3}\s*)?(?:-+\s*)?'
    r'(?P<name>(?:[A-Z][A-Z0-9.\']+(?:\s+[A-Z][A-Z0-9.\']+){0,4}))'
    r'\s*:\s*(?P<after>.*)$'
)


def extract_character_and_clean_text(block: str):
    """
    Split a subtitle block into (character, text).

    - character: the first ``NAME:`` prefix found in the block, or "" when
      no line carries one
    - text: the block's non-blank lines, stripped, with every ``NAME:``
      prefix removed, joined with "\\n"

    An empty block yields ("", "").
    """
    if not block:
        return "", ""

    speaker = ""
    kept = []

    for raw_line in block.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        found = speaker_pattern.match(stripped)
        if found is None:
            kept.append(stripped)
            continue

        # Only the first speaker names the block; later prefixes are
        # still stripped from the text.
        speaker = speaker or found.group("name").strip()
        remainder = found.group("after").rstrip()
        if remainder:
            kept.append(remainder)

    return speaker, "\n".join(kept)
111
+
112
+
113
def start_time_to_mm_ss(start: str) -> str:
    """
    Convert 'HH:MM:SS,mmm' to 'MM.SS'.

    Hours are folded into the minute count (total minutes . seconds) and
    the millisecond part is discarded.
    """
    clock = start.split(",")[0]
    hours, minutes, secs = (int(part) for part in clock.split(":"))
    whole_minutes, leftover = divmod(hours * 3600 + minutes * 60 + secs, 60)
    return f"{whole_minutes:02d}.{leftover:02d}"
124
+
125
+
126
+ # ---------- DOCX GENERATION ----------
127
+
128
def add_header_styling(cell):
    """
    Style a table header cell: bold text on a light grey background.

    The cell's first paragraph has the text of its existing runs blanked
    and gains a new, empty bold run as its last run; the caller is
    expected to put the header label into that run.

    The background is applied through the cell's ``w:shd`` element, which
    is created if the cell properties do not have one yet.
    """
    paragraph = cell.paragraphs[0]
    # Blank out whatever text the existing runs carry.
    for existing in paragraph.runs:
        existing.text = ""
    paragraph.add_run().bold = True

    # Set shading (background) on the cell properties.
    tc_pr = cell._tc.get_or_add_tcPr()
    shading = tc_pr.find(qn("w:shd"))
    if shading is None:
        shading = OxmlElement("w:shd")
        tc_pr.append(shading)
    # w:val is a required attribute of w:shd in the WordprocessingML
    # schema; "clear" means no pattern, so only the fill colour shows.
    shading.set(qn("w:val"), "clear")
    shading.set(qn("w:color"), "auto")
    shading.set(qn("w:fill"), "D9D9D9")  # light gray
147
+
148
+
149
def srt_to_docx_bytes(srt_path: Path) -> tuple[bytes, str]:
    """
    Render one SRT file as a DOCX table, entirely in memory.

    The document holds a single bordered table with the columns
    Character | TC | note | TEXT.  Cues whose text is blank, or blank once
    the speaker prefix is removed, produce no row.

    Returns (docx_bytes, suggested_filename), where the filename is the
    SRT's own name with a ".docx" suffix.
    """
    cues = parse_srt(srt_path)

    document = Document()
    column_titles = ("Character", "TC", "note", "TEXT")
    grid = document.add_table(rows=1, cols=4)
    grid.style = "Table Grid"  # border lines

    for header_cell, title in zip(grid.rows[0].cells, column_titles):
        add_header_styling(header_cell)
        # The styling helper leaves an empty bold run last; fill it in.
        header_cell.paragraphs[0].runs[-1].text = title

    for cue in cues:
        if not cue["text"].strip():
            continue

        speaker, dialogue = extract_character_and_clean_text(cue["text"])
        if not dialogue.strip():
            continue

        row_cells = grid.add_row().cells
        # Character, TC (MM.SS from the start time only), blank note, TEXT.
        row_values = (
            speaker,
            start_time_to_mm_ss(cue["start"]),
            "",
            dialogue,
        )
        for target, value in zip(row_cells, row_values):
            target.text = value

    # Serialize to bytes
    stream = io.BytesIO()
    document.save(stream)
    return stream.getvalue(), srt_path.with_suffix(".docx").name
201
+
202
+
203
+ # ---------- GRADIO LOGIC ----------
204
+
205
def process_srt_files(files):
    """
    Gradio callback: convert uploaded .srt files into one ZIP of DOCX files.

    files: list of uploaded .srt files (dicts with a "name" key, objects
        with a ``name`` attribute, or plain path strings)
    returns: path to a ZIP containing all .docx results, or None when
        nothing was uploaded

    The ZIP is written into a fresh temporary directory on every call, so
    concurrent requests cannot overwrite each other's download.  When two
    uploads would yield the same DOCX name, later ones get a ``_2``,
    ``_3``, ... suffix instead of becoming duplicate ZIP members.
    """
    # Local imports: only this callback needs them.
    import shutil
    import tempfile

    if not files:
        return None

    # Normalize to Path objects
    paths: list[Path] = []
    for f in files:
        # Gradio may pass dict, tempfile, or path string depending on version
        if isinstance(f, dict) and "name" in f:
            paths.append(Path(f["name"]))
        elif hasattr(f, "name"):
            paths.append(Path(f.name))
        else:
            paths.append(Path(str(f)))

    # One directory per request keeps the friendly download name while
    # avoiding a shared, fixed output path.
    out_dir = Path(tempfile.mkdtemp(prefix="srt2docx_"))
    out_zip_path = out_dir / "converted_subtitles.zip"

    # Compared case-insensitively so the archive also extracts cleanly on
    # case-insensitive file systems.
    used_names: set[str] = set()
    try:
        with zipfile.ZipFile(out_zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
            for path in paths:
                doc_bytes, doc_name = srt_to_docx_bytes(path)

                member = doc_name
                stem, suffix = Path(doc_name).stem, Path(doc_name).suffix
                counter = 1
                while member.casefold() in used_names:
                    counter += 1
                    member = f"{stem}_{counter}{suffix}"
                used_names.add(member.casefold())

                zf.writestr(member, doc_bytes)
    except Exception:
        # Do not leave a half-written archive behind; the error itself is
        # still reported to the caller.
        shutil.rmtree(out_dir, ignore_errors=True)
        raise

    return str(out_zip_path)
238
+
239
+
240
+ # ---------- GRADIO UI ----------
241
+
242
# Page layout: a Markdown intro, a multi-file upload restricted to .srt,
# a file output for the resulting ZIP, and a button that runs
# process_srt_files on the uploads and shows its return value in the output.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # SRT → DOCX Subtitle Converter

        - Upload one or more **.srt** files.
        - For each subtitle:
          - **Character**: inferred from lines like `WOMAN:`, `LEWIS:`, `NURSE:`, etc.
          - **TC**: start time as **MM.SS** (no hour, no ms).
          - **TEXT**: subtitle text **without** the `NAME:` prefix.
        - Output: a single **ZIP** with one DOCX per SRT.
        """
    )

    with gr.Row():
        # Input: several files may be selected at once.
        srt_files = gr.File(
            label="Upload .srt files",
            file_types=[".srt"],
            file_count="multiple"
        )

    # Output: receives the path returned by the callback.
    out_zip = gr.File(label="Download ZIP of DOCX files")

    convert_btn = gr.Button("Convert to DOCX")
    convert_btn.click(
        fn=process_srt_files,
        inputs=srt_files,
        outputs=out_zip,
    )

# Start the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()