armyneo committed on
Commit
a0c3ad0
·
verified ·
1 Parent(s): c984900

revert normal

Browse files
Files changed (1) hide show
  1. app.py +65 -104
app.py CHANGED
@@ -2,55 +2,19 @@ import re
2
  import io
3
  import zipfile
4
  from pathlib import Path
5
- from typing import Tuple
6
 
7
  import gradio as gr
8
  from docx import Document
9
  from docx.oxml import OxmlElement
10
  from docx.oxml.ns import qn
11
- from transformers import pipeline
12
 
13
 
14
- # ----------------------------------------------------
15
- # 1) ÇEVİRİ MODELİ (Helsinki-NLP / opus-mt-tc-big-en-tr)
16
- # ----------------------------------------------------
17
-
18
MODEL_NAME = "Helsinki-NLP/opus-mt-tc-big-en-tr"

# The model is public, so no access token is passed.
translator = pipeline("translation", model=MODEL_NAME)


def translate_en_tr(text: str) -> str:
    """Translate English text to Turkish, one line at a time.

    The input is stripped of surrounding whitespace first; an empty result is
    returned unchanged. Every non-blank line is sent to the translator on its
    own so the line structure of the input survives; blank lines are kept as
    empty lines.
    """
    stripped = text.strip()
    if not stripped:
        return stripped

    return "\n".join(
        translator(line)[0]["translation_text"] if line.strip() else ""
        for line in stripped.splitlines()
    )
45
-
46
-
47
- # ----------------------------------------------------
48
- # 2) SRT PARSER
49
- # ----------------------------------------------------
50
 
51
  def parse_srt(path: Path):
52
  """
53
- SRT -> [{index, start, end, text}, ...]
 
54
  """
55
  raw = path.read_text(encoding="utf-8-sig", errors="ignore").strip()
56
  blocks = re.split(r"\n\s*\n", raw)
@@ -66,10 +30,10 @@ def parse_srt(path: Path):
66
  if len(lines) < 2:
67
  continue
68
 
69
- # klasik blok:
70
  # 1
71
  # 00:00:13,555 --> 00:00:17,559
72
- # WOMAN: ...
73
  try:
74
  idx = int(lines[0])
75
  time_line = lines[1]
@@ -99,11 +63,9 @@ def parse_srt(path: Path):
99
  return subs
100
 
101
 
102
- # ----------------------------------------------------
103
- # 3) KARAKTER ÇIKARMA + TEXT TEMİZLEME
104
- # ----------------------------------------------------
105
 
106
- # Örnek eşleşmeler:
107
  # WOMAN: ...
108
  # DR. LEWIS: ...
109
  # >>> NURSE: ...
@@ -117,9 +79,9 @@ speaker_pattern = re.compile(
117
 
118
  def extract_character_and_clean_text(block: str):
119
  """
120
- block içinden:
121
- - Character: ilk NAME:
122
- - TEXT: NAME: prefix'leri atılmış metin
123
  """
124
  if not block:
125
  return "", ""
@@ -142,7 +104,6 @@ def extract_character_and_clean_text(block: str):
142
  if after:
143
  out_lines.append(after)
144
  else:
145
- # NAME: ile başlamayan satırlar olduğu gibi kalsın
146
  out_lines.append(original)
147
 
148
  out_lines = [ln for ln in out_lines if ln.strip()]
@@ -152,7 +113,7 @@ def extract_character_and_clean_text(block: str):
152
  def start_time_to_mm_ss(start: str) -> str:
153
  """
154
  'HH:MM:SS,mmm' -> 'MM.SS'
155
- (toplam dakika . saniye)
156
  """
157
  hms, *_ = start.split(",")
158
  h, m, s = [int(x) for x in hms.split(":")]
@@ -162,46 +123,49 @@ def start_time_to_mm_ss(start: str) -> str:
162
  return f"{total_minutes:02d}.{seconds:02d}"
163
 
164
 
165
- # ----------------------------------------------------
166
- # 4) DOCX OLUŞTURMA
167
- # ----------------------------------------------------
168
 
169
def style_header_cell(cell, text: str):
    """Write a bold header label into a table cell and shade it light grey.

    Args:
        cell: Table cell exposing ``paragraphs`` and the underlying ``_tc``
            XML element (as produced by the python-docx tables in this file).
        text: Header label to place in the cell.
    """
    paragraph = cell.paragraphs[0]
    # Blank out whatever runs are already there so only the new label shows.
    for existing_run in paragraph.runs:
        existing_run.text = ""
    paragraph.add_run(text).bold = True

    # Background shading lives in the cell properties as a <w:shd> element.
    tc_pr = cell._tc.get_or_add_tcPr()
    shd = tc_pr.find(qn("w:shd"))
    if shd is None:
        shd = OxmlElement("w:shd")
        tc_pr.append(shd)
    # w:val is a required attribute of <w:shd>; "clear" means "no pattern",
    # so the fill colour is shown as a solid background. An existing value
    # is left alone.
    if shd.get(qn("w:val")) is None:
        shd.set(qn("w:val"), "clear")
    shd.set(qn("w:fill"), "D9D9D9")  # light grey
188
 
189
 
190
- def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str]:
191
  """
192
- Tek SRT -> styled DOCX (bytes, filename)
 
193
  """
194
  subs = parse_srt(srt_path)
 
195
  doc = Document()
196
 
197
- # TABLE: Character | TC | note | TEXT
198
  table = doc.add_table(rows=1, cols=4)
199
- table.style = "Table Grid" # border çizgileri
200
 
201
  hdr_cells = table.rows[0].cells
202
  headers = ["Character", "TC", "note", "TEXT"]
203
  for idx, label in enumerate(headers):
204
- style_header_cell(hdr_cells[idx], label)
 
 
 
205
 
206
  for sub in subs:
207
  raw_text = sub["text"]
@@ -215,21 +179,19 @@ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str
215
  row = table.add_row()
216
  cells = row.cells
217
 
218
- # Character -> ASLA çevirmiyoruz
219
  cells[0].text = character
220
 
221
- # TC -> MM.SS (start time only)
222
  cells[1].text = start_time_to_mm_ss(sub["start"])
223
 
224
- # note -> boş
225
  cells[2].text = ""
226
 
227
- # TEXT -> isteğe bağlı TR çevir
228
- if translate_to_tr:
229
- cells[3].text = translate_en_tr(clean_txt)
230
- else:
231
- cells[3].text = clean_txt
232
 
 
233
  buffer = io.BytesIO()
234
  doc.save(buffer)
235
  buffer.seek(0)
@@ -238,50 +200,56 @@ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str
238
  return buffer.getvalue(), out_name
239
 
240
 
241
- # ----------------------------------------------------
242
- # 5) GRADIO ÇAĞRI FONKSİYONU (MULTI SRT -> ZIP)
243
- # ----------------------------------------------------
244
 
245
def process_srt_files(files, translate_to_tr: bool):
    """Convert several SRT files to DOCX and return them as a single ZIP.

    Args:
        files: List of SRT file paths (Gradio ``type="filepath"`` delivers
            plain path strings).
        translate_to_tr: Forwarded to ``srt_to_docx_bytes`` to request EN->TR
            translation of the TEXT column.

    Returns:
        Path (``str``) of the ZIP archive containing one DOCX per input file,
        or ``None`` when no files were given.
    """
    import tempfile  # local import: only this callback needs it

    if not files:
        return None

    paths = [Path(p) for p in files]

    # Write into a fresh directory per call: a fixed file name in the working
    # directory would be overwritten by any concurrent request.
    out_dir = Path(tempfile.mkdtemp(prefix="srt2docx_"))
    out_zip_path = out_dir / "converted_subtitles.zip"

    used_names = set()
    # Stream the archive straight to disk instead of buffering it in memory.
    with zipfile.ZipFile(out_zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in paths:
            doc_bytes, doc_name = srt_to_docx_bytes(path, translate_to_tr)

            # Two uploads can share a file name; keep both by suffixing
            # later ones rather than writing duplicate archive members.
            member = doc_name
            counter = 1
            while member in used_names:
                counter += 1
                named = Path(doc_name)
                member = f"{named.stem}_{counter}{named.suffix}"
            used_names.add(member)

            zf.writestr(member, doc_bytes)

    return str(out_zip_path)
267
 
268
 
269
- # ----------------------------------------------------
270
- # 6) GRADIO UI
271
- # ----------------------------------------------------
272
 
273
  with gr.Blocks() as demo:
274
  gr.Markdown(
275
  """
276
- # SRT → DOCX (Character / TC / TEXT) + EN→TR Çeviri (Helsinki)
277
-
278
- - Bir veya birden fazla **.srt** yükle.
279
- - Her satır için:
280
- - **Character**: `WOMAN:`, `LEWIS:`, `NURSE:` gibi isimler çıkarılır (**çeviri yok**).
281
- - **TC**: sadece **MM.SS** (start time'dan).
282
- - **TEXT**: `NAME:` prefix'leri atılmış metin.
283
- - İstersen TEXT'i **Helsinki-NLP/opus-mt-tc-big-en-tr** ile Türkçe'ye çevir (Character asla çevrilmez).
284
- - Çıktı: Tüm DOCX'leri içeren tek bir **ZIP**.
285
  """
286
  )
287
 
@@ -289,22 +257,15 @@ with gr.Blocks() as demo:
289
  srt_files = gr.File(
290
  label="Upload .srt files",
291
  file_types=[".srt"],
292
- file_count="multiple",
293
- type="filepath", # Gradio -> string path list
294
  )
295
 
296
- translate_chk = gr.Checkbox(
297
- label="Translate TEXT (EN → TR, only TEXT, not Character)",
298
- value=False,
299
- )
300
-
301
  out_zip = gr.File(label="Download ZIP of DOCX files")
302
 
303
- convert_btn = gr.Button("Convert")
304
-
305
  convert_btn.click(
306
  fn=process_srt_files,
307
- inputs=[srt_files, translate_chk],
308
  outputs=out_zip,
309
  )
310
 
 
2
  import io
3
  import zipfile
4
  from pathlib import Path
 
5
 
6
  import gradio as gr
7
  from docx import Document
8
  from docx.oxml import OxmlElement
9
  from docx.oxml.ns import qn
 
10
 
11
 
12
+ # ---------- SRT PARSER ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def parse_srt(path: Path):
15
  """
16
+ Parse .srt file into a list of:
17
+ {index, start, end, text}
18
  """
19
  raw = path.read_text(encoding="utf-8-sig", errors="ignore").strip()
20
  blocks = re.split(r"\n\s*\n", raw)
 
30
  if len(lines) < 2:
31
  continue
32
 
33
+ # typical block:
34
  # 1
35
  # 00:00:13,555 --> 00:00:17,559
36
+ # WOMAN: text...
37
  try:
38
  idx = int(lines[0])
39
  time_line = lines[1]
 
63
  return subs
64
 
65
 
66
+ # ---------- CHARACTER + TEXT CLEANING ----------
 
 
67
 
68
+ # Matches lines like:
69
  # WOMAN: ...
70
  # DR. LEWIS: ...
71
  # >>> NURSE: ...
 
79
 
80
  def extract_character_and_clean_text(block: str):
81
  """
82
+ From a subtitle block, extract:
83
+ - character (first detected NAME:)
84
+ - text without NAME: prefix lines
85
  """
86
  if not block:
87
  return "", ""
 
104
  if after:
105
  out_lines.append(after)
106
  else:
 
107
  out_lines.append(original)
108
 
109
  out_lines = [ln for ln in out_lines if ln.strip()]
 
113
  def start_time_to_mm_ss(start: str) -> str:
114
  """
115
  'HH:MM:SS,mmm' -> 'MM.SS'
116
+ (total minutes . seconds)
117
  """
118
  hms, *_ = start.split(",")
119
  h, m, s = [int(x) for x in hms.split(":")]
 
123
  return f"{total_minutes:02d}.{seconds:02d}"
124
 
125
 
126
+ # ---------- DOCX GENERATION ----------
 
 
127
 
128
def add_header_styling(cell):
    """Prepare a header cell: blank it, add a bold run, shade it light gray.

    Existing runs in the cell's first paragraph are emptied and a new, empty,
    bold run is appended as the paragraph's last run, so a caller can write
    the header label into ``cell.paragraphs[0].runs[-1]``.

    Args:
        cell: Table cell exposing ``paragraphs`` and the underlying ``_tc``
            XML element (as produced by the python-docx tables in this file).
    """
    paragraph = cell.paragraphs[0]
    # Clear existing runs so only the new bold run carries text.
    for existing_run in paragraph.runs:
        existing_run.text = ""
    paragraph.add_run().bold = True

    # Background shading lives in the cell properties as a <w:shd> element.
    tc_pr = cell._tc.get_or_add_tcPr()
    shd = tc_pr.find(qn("w:shd"))
    if shd is None:
        shd = OxmlElement("w:shd")
        tc_pr.append(shd)
    # w:val is a required attribute of <w:shd>; "clear" means "no pattern",
    # so the fill colour is shown as a solid background. An existing value
    # is left alone.
    if shd.get(qn("w:val")) is None:
        shd.set(qn("w:val"), "clear")
    shd.set(qn("w:fill"), "D9D9D9")  # light gray
147
 
148
 
149
+ def srt_to_docx_bytes(srt_path: Path) -> tuple[bytes, str]:
150
  """
151
+ Convert one SRT file to a styled DOCX in memory.
152
+ Returns (docx_bytes, suggested_filename).
153
  """
154
  subs = parse_srt(srt_path)
155
+
156
  doc = Document()
157
 
158
+ # Create a table: Character | TC | note | TEXT
159
  table = doc.add_table(rows=1, cols=4)
160
+ table.style = "Table Grid" # border lines
161
 
162
  hdr_cells = table.rows[0].cells
163
  headers = ["Character", "TC", "note", "TEXT"]
164
  for idx, label in enumerate(headers):
165
+ cell = hdr_cells[idx]
166
+ add_header_styling(cell)
167
+ # set header text into the bold run we created
168
+ cell.paragraphs[0].runs[-1].text = label
169
 
170
  for sub in subs:
171
  raw_text = sub["text"]
 
179
  row = table.add_row()
180
  cells = row.cells
181
 
182
+ # Character
183
  cells[0].text = character
184
 
185
+ # TC as MM.SS from START only
186
  cells[1].text = start_time_to_mm_ss(sub["start"])
187
 
188
+ # note (blank)
189
  cells[2].text = ""
190
 
191
+ # TEXT (cleaned, without NAME:)
192
+ cells[3].text = clean_txt
 
 
 
193
 
194
+ # Serialize to bytes
195
  buffer = io.BytesIO()
196
  doc.save(buffer)
197
  buffer.seek(0)
 
200
  return buffer.getvalue(), out_name
201
 
202
 
203
+ # ---------- GRADIO LOGIC ----------
 
 
204
 
205
def process_srt_files(files):
    """Gradio callback: convert uploaded .srt files into one ZIP of DOCX files.

    Args:
        files: Uploaded files as delivered by Gradio. Depending on the Gradio
            version each item may be a dict with a ``"name"`` key, an object
            with a ``name`` attribute, or a plain path string.

    Returns:
        Path (``str``) of a ZIP archive containing one DOCX per input file,
        or ``None`` when nothing was uploaded.
    """
    import tempfile  # local import: only this callback needs it

    if not files:
        return None

    # Normalize every upload to a Path object.
    paths = []
    for item in files:
        if isinstance(item, dict) and "name" in item:
            paths.append(Path(item["name"]))
        elif hasattr(item, "name"):
            paths.append(Path(item.name))
        else:
            paths.append(Path(str(item)))

    # Write into a fresh directory per call: a fixed file name in the working
    # directory would be overwritten by any concurrent request.
    out_dir = Path(tempfile.mkdtemp(prefix="srt2docx_"))
    out_zip_path = out_dir / "converted_subtitles.zip"

    used_names = set()
    # Stream the archive straight to disk instead of buffering it in memory.
    with zipfile.ZipFile(out_zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in paths:
            doc_bytes, doc_name = srt_to_docx_bytes(path)

            # Two uploads can share a file name; keep both by suffixing
            # later ones rather than writing duplicate archive members.
            member = doc_name
            counter = 1
            while member in used_names:
                counter += 1
                named = Path(doc_name)
                member = f"{named.stem}_{counter}{named.suffix}"
            used_names.add(member)

            zf.writestr(member, doc_bytes)

    return str(out_zip_path)
238
 
239
 
240
+ # ---------- GRADIO UI ----------
 
 
241
 
242
  with gr.Blocks() as demo:
243
  gr.Markdown(
244
  """
245
+ # SRT → DOCX Subtitle Converter
246
+
247
+ - Upload one or more **.srt** files.
248
+ - For each subtitle:
249
+ - **Character**: inferred from lines like `WOMAN:`, `LEWIS:`, `NURSE:`, etc.
250
+ - **TC**: start time as **MM.SS** (no hour, no ms).
251
+ - **TEXT**: subtitle text **without** the `NAME:` prefix.
252
+ - Output: a single **ZIP** with one DOCX per SRT.
 
253
  """
254
  )
255
 
 
257
  srt_files = gr.File(
258
  label="Upload .srt files",
259
  file_types=[".srt"],
260
+ file_count="multiple"
 
261
  )
262
 
 
 
 
 
 
263
  out_zip = gr.File(label="Download ZIP of DOCX files")
264
 
265
+ convert_btn = gr.Button("Convert to DOCX")
 
266
  convert_btn.click(
267
  fn=process_srt_files,
268
+ inputs=srt_files,
269
  outputs=out_zip,
270
  )
271