armyneo committed on
Commit
420e1ab
·
verified ·
1 Parent(s): a0c3ad0

app.py instal

Browse files
Files changed (1) hide show
  1. app.py +116 -65
app.py CHANGED
@@ -2,19 +2,67 @@ import re
2
  import io
3
  import zipfile
4
  from pathlib import Path
 
5
 
6
  import gradio as gr
7
  from docx import Document
8
  from docx.oxml import OxmlElement
9
  from docx.oxml.ns import qn
 
10
 
 
 
 
11
 
12
- # ---------- SRT PARSER ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def parse_srt(path: Path):
15
  """
16
- Parse .srt file into a list of:
17
- {index, start, end, text}
18
  """
19
  raw = path.read_text(encoding="utf-8-sig", errors="ignore").strip()
20
  blocks = re.split(r"\n\s*\n", raw)
@@ -30,10 +78,10 @@ def parse_srt(path: Path):
30
  if len(lines) < 2:
31
  continue
32
 
33
- # typical block:
34
  # 1
35
  # 00:00:13,555 --> 00:00:17,559
36
- # WOMAN: text...
37
  try:
38
  idx = int(lines[0])
39
  time_line = lines[1]
@@ -63,9 +111,11 @@ def parse_srt(path: Path):
63
  return subs
64
 
65
 
66
- # ---------- CHARACTER + TEXT CLEANING ----------
 
 
67
 
68
- # Matches lines like:
69
  # WOMAN: ...
70
  # DR. LEWIS: ...
71
  # >>> NURSE: ...
@@ -79,9 +129,9 @@ speaker_pattern = re.compile(
79
 
80
  def extract_character_and_clean_text(block: str):
81
  """
82
- From a subtitle block, extract:
83
- - character (first detected NAME:)
84
- - text without NAME: prefix lines
85
  """
86
  if not block:
87
  return "", ""
@@ -104,6 +154,7 @@ def extract_character_and_clean_text(block: str):
104
  if after:
105
  out_lines.append(after)
106
  else:
 
107
  out_lines.append(original)
108
 
109
  out_lines = [ln for ln in out_lines if ln.strip()]
@@ -113,7 +164,7 @@ def extract_character_and_clean_text(block: str):
113
  def start_time_to_mm_ss(start: str) -> str:
114
  """
115
  'HH:MM:SS,mmm' -> 'MM.SS'
116
- (total minutes . seconds)
117
  """
118
  hms, *_ = start.split(",")
119
  h, m, s = [int(x) for x in hms.split(":")]
@@ -123,49 +174,44 @@ def start_time_to_mm_ss(start: str) -> str:
123
  return f"{total_minutes:02d}.{seconds:02d}"
124
 
125
 
126
- # ---------- DOCX GENERATION ----------
 
 
127
 
128
- def add_header_styling(cell):
129
  """
130
- Bold header + light grey background for header cells.
131
  """
132
  p = cell.paragraphs[0]
133
- # Clear existing runs
134
  for r in p.runs:
135
  r.text = ""
136
- run = p.add_run()
137
  run.bold = True
138
 
139
- # Set shading (background)
140
  tc = cell._tc
141
  tcPr = tc.get_or_add_tcPr()
142
  shd = tcPr.find(qn("w:shd"))
143
  if shd is None:
144
  shd = OxmlElement("w:shd")
145
  tcPr.append(shd)
146
- shd.set(qn("w:fill"), "D9D9D9") # light gray
147
 
148
 
149
- def srt_to_docx_bytes(srt_path: Path) -> tuple[bytes, str]:
150
  """
151
- Convert one SRT file to a styled DOCX in memory.
152
- Returns (docx_bytes, suggested_filename).
153
  """
154
  subs = parse_srt(srt_path)
155
-
156
  doc = Document()
157
 
158
- # Create a table: Character | TC | note | TEXT
159
  table = doc.add_table(rows=1, cols=4)
160
- table.style = "Table Grid" # border lines
161
 
162
  hdr_cells = table.rows[0].cells
163
  headers = ["Character", "TC", "note", "TEXT"]
164
  for idx, label in enumerate(headers):
165
- cell = hdr_cells[idx]
166
- add_header_styling(cell)
167
- # set header text into the bold run we created
168
- cell.paragraphs[0].runs[-1].text = label
169
 
170
  for sub in subs:
171
  raw_text = sub["text"]
@@ -179,19 +225,21 @@ def srt_to_docx_bytes(srt_path: Path) -> tuple[bytes, str]:
179
  row = table.add_row()
180
  cells = row.cells
181
 
182
- # Character
183
  cells[0].text = character
184
 
185
- # TC as MM.SS from START only
186
  cells[1].text = start_time_to_mm_ss(sub["start"])
187
 
188
- # note (blank)
189
  cells[2].text = ""
190
 
191
- # TEXT (cleaned, without NAME:)
192
- cells[3].text = clean_txt
 
 
 
193
 
194
- # Serialize to bytes
195
  buffer = io.BytesIO()
196
  doc.save(buffer)
197
  buffer.seek(0)
@@ -200,56 +248,52 @@ def srt_to_docx_bytes(srt_path: Path) -> tuple[bytes, str]:
200
  return buffer.getvalue(), out_name
201
 
202
 
203
- # ---------- GRADIO LOGIC ----------
 
 
204
 
205
- def process_srt_files(files):
206
  """
207
- Gradio callback:
208
- files: list of uploaded .srt files
209
- returns: path to a ZIP containing all .docx results
210
  """
211
  if not files:
212
  return None
213
 
214
- # Normalize to Path objects
215
- paths: list[Path] = []
216
- for f in files:
217
- # Gradio may pass dict, tempfile, or path string depending on version
218
- if isinstance(f, dict) and "name" in f:
219
- paths.append(Path(f["name"]))
220
- elif hasattr(f, "name"):
221
- paths.append(Path(f.name))
222
- else:
223
- paths.append(Path(str(f)))
224
 
225
  zip_buffer = io.BytesIO()
226
  with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
227
  for path in paths:
228
- doc_bytes, doc_name = srt_to_docx_bytes(path)
229
- # add to zip
230
  zf.writestr(doc_name, doc_bytes)
231
 
232
  zip_buffer.seek(0)
233
- out_zip_path = Path("converted_subtitles.zip")
 
234
  with open(out_zip_path, "wb") as f:
235
  f.write(zip_buffer.read())
236
 
237
- return str(out_zip_path)
238
 
239
 
240
- # ---------- GRADIO UI ----------
 
 
241
 
242
  with gr.Blocks() as demo:
243
  gr.Markdown(
244
  """
245
- # SRT → DOCX Subtitle Converter
246
-
247
- - Upload one or more **.srt** files.
248
- - For each subtitle:
249
- - **Character**: inferred from lines like `WOMAN:`, `LEWIS:`, `NURSE:`, etc.
250
- - **TC**: start time as **MM.SS** (no hour, no ms).
251
- - **TEXT**: subtitle text **without** the `NAME:` prefix.
252
- - Output: a single **ZIP** with one DOCX per SRT.
 
253
  """
254
  )
255
 
@@ -257,15 +301,22 @@ with gr.Blocks() as demo:
257
  srt_files = gr.File(
258
  label="Upload .srt files",
259
  file_types=[".srt"],
260
- file_count="multiple"
 
261
  )
262
 
 
 
 
 
 
263
  out_zip = gr.File(label="Download ZIP of DOCX files")
264
 
265
- convert_btn = gr.Button("Convert to DOCX")
 
266
  convert_btn.click(
267
  fn=process_srt_files,
268
- inputs=srt_files,
269
  outputs=out_zip,
270
  )
271
 
 
2
  import io
3
  import zipfile
4
  from pathlib import Path
5
+ from typing import Tuple, List
6
 
7
  import gradio as gr
8
  from docx import Document
9
  from docx.oxml import OxmlElement
10
  from docx.oxml.ns import qn
11
+ from transformers import pipeline
12
 
13
# ----------------------------------------------------
# 1) TRANSLATION MODEL (use a lighter-weight model)
# ----------------------------------------------------

# The "tc-big" variant is very heavy and can cause trouble on "CPU basic".
MODEL_NAME = "Helsinki-NLP/opus-mt-en-tr"

# Public model, no token needed. Run on CPU (device=-1).
translator = pipeline(task="translation", model=MODEL_NAME, device=-1)
26
+
27
+
28
def translate_en_tr(text: str) -> str:
    """
    Translate English text to Turkish while keeping its line layout.

    The input is stripped and split into lines. Only the non-blank lines are
    handed to the module-level ``translator``, all together in one batched
    call, and every translation is written back at the position of the line
    it came from. Blank lines are left untouched. Empty (or ``None``) input
    is returned as the stripped, empty string.
    """
    stripped = (text or "").strip()
    if not stripped:
        return stripped

    result: List[str] = stripped.splitlines()

    # Positions of the lines that actually carry text.
    positions: List[int] = [pos for pos, line in enumerate(result) if line.strip()]
    if not positions:
        return stripped

    # A single batched model call covering every non-blank line.
    batch: List[str] = [result[pos] for pos in positions]
    outputs = translator(batch, max_length=512)
    translations = [item["translation_text"] for item in outputs]

    # Drop each translation back into the slot of its source line.
    for slot, pos in enumerate(positions):
        result[pos] = translations[slot]

    return "\n".join(result)
57
+
58
+
59
+ # ----------------------------------------------------
60
+ # 2) SRT PARSER
61
+ # ----------------------------------------------------
62
 
63
  def parse_srt(path: Path):
64
  """
65
+ SRT -> [{index, start, end, text}, ...]
 
66
  """
67
  raw = path.read_text(encoding="utf-8-sig", errors="ignore").strip()
68
  blocks = re.split(r"\n\s*\n", raw)
 
78
  if len(lines) < 2:
79
  continue
80
 
81
+ # klasik blok:
82
  # 1
83
  # 00:00:13,555 --> 00:00:17,559
84
+ # WOMAN: ...
85
  try:
86
  idx = int(lines[0])
87
  time_line = lines[1]
 
111
  return subs
112
 
113
 
114
+ # ----------------------------------------------------
115
+ # 3) KARAKTER ÇIKARMA + TEXT TEMİZLEME
116
+ # ----------------------------------------------------
117
 
118
+ # Örnek eşleşmeler:
119
  # WOMAN: ...
120
  # DR. LEWIS: ...
121
  # >>> NURSE: ...
 
129
 
130
  def extract_character_and_clean_text(block: str):
131
  """
132
+ block içinden:
133
+ - Character: ilk NAME:
134
+ - TEXT: NAME: prefix'leri atılmış metin
135
  """
136
  if not block:
137
  return "", ""
 
154
  if after:
155
  out_lines.append(after)
156
  else:
157
+ # NAME: ile başlamayan satırlar olduğu gibi kalsın
158
  out_lines.append(original)
159
 
160
  out_lines = [ln for ln in out_lines if ln.strip()]
 
164
  def start_time_to_mm_ss(start: str) -> str:
165
  """
166
  'HH:MM:SS,mmm' -> 'MM.SS'
167
+ (toplam dakika . saniye)
168
  """
169
  hms, *_ = start.split(",")
170
  h, m, s = [int(x) for x in hms.split(":")]
 
174
  return f"{total_minutes:02d}.{seconds:02d}"
175
 
176
 
177
+ # ----------------------------------------------------
178
+ # 4) DOCX OLUŞTURMA
179
+ # ----------------------------------------------------
180
 
181
def style_header_cell(cell, text: str):
    """
    Format a table header cell: bold label on a light grey background.

    Args:
        cell: The table cell to format; it is modified in place.
            Presumably a python-docx ``_Cell`` (it must expose
            ``paragraphs`` and ``_tc``) - confirm against the caller.
        text: The label to display in the cell.
    """
    paragraph = cell.paragraphs[0]

    # Blank out any runs already present so only the new bold label shows.
    for existing in paragraph.runs:
        existing.text = ""
    label_run = paragraph.add_run(text)
    label_run.bold = True

    # The background colour lives in a <w:shd> child of the cell properties
    # (<w:tcPr>); reuse the element when it exists, otherwise create it.
    cell_props = cell._tc.get_or_add_tcPr()
    shading = cell_props.find(qn("w:shd"))
    if shading is None:
        shading = OxmlElement("w:shd")
        cell_props.append(shading)

    # w:val is a required attribute of <w:shd>; "clear" means "no pattern",
    # so the fill colour below is rendered as a plain solid background.
    # An existing pattern value is left as it is.
    if shading.get(qn("w:val")) is None:
        shading.set(qn("w:val"), "clear")
    shading.set(qn("w:fill"), "D9D9D9")  # light grey
198
 
199
 
200
+ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str]:
201
  """
202
+ Tek SRT -> styled DOCX (bytes, filename)
 
203
  """
204
  subs = parse_srt(srt_path)
 
205
  doc = Document()
206
 
207
+ # TABLE: Character | TC | note | TEXT
208
  table = doc.add_table(rows=1, cols=4)
209
+ table.style = "Table Grid"
210
 
211
  hdr_cells = table.rows[0].cells
212
  headers = ["Character", "TC", "note", "TEXT"]
213
  for idx, label in enumerate(headers):
214
+ style_header_cell(hdr_cells[idx], label)
 
 
 
215
 
216
  for sub in subs:
217
  raw_text = sub["text"]
 
225
  row = table.add_row()
226
  cells = row.cells
227
 
228
+ # Character -> ASLA çevirmiyoruz
229
  cells[0].text = character
230
 
231
+ # TC -> MM.SS (start time)
232
  cells[1].text = start_time_to_mm_ss(sub["start"])
233
 
234
+ # note -> boş
235
  cells[2].text = ""
236
 
237
+ # TEXT -> isteğe bağlı TR çeviri
238
+ if translate_to_tr:
239
+ cells[3].text = translate_en_tr(clean_txt)
240
+ else:
241
+ cells[3].text = clean_txt
242
 
 
243
  buffer = io.BytesIO()
244
  doc.save(buffer)
245
  buffer.seek(0)
 
248
  return buffer.getvalue(), out_name
249
 
250
 
251
+ # ----------------------------------------------------
252
+ # 5) GRADIO ÇAĞRI FONKSİYONU (MULTI SRT -> ZIP)
253
+ # ----------------------------------------------------
254
 
255
def process_srt_files(files, translate_to_tr: bool):
    """
    Convert the uploaded SRT files to DOCX and bundle them into one ZIP.

    Args:
        files: Paths of the uploaded .srt files. With
            ``gr.File(type="filepath")`` this is a list of path strings.
        translate_to_tr: Forwarded to ``srt_to_docx_bytes``; when true the
            TEXT column is translated EN -> TR.

    Returns:
        The path (``str``) of the generated ZIP file, which is what the
        Gradio file output expects, or ``None`` when nothing was uploaded.
    """
    import tempfile

    if not files:
        return None

    paths = [Path(p) for p in files]

    # Every request gets its own temporary directory. Writing to a fixed file
    # in the working directory would let concurrent requests overwrite each
    # other's archive. The download name itself stays the same.
    out_dir = Path(tempfile.mkdtemp(prefix="srt_to_docx_"))
    out_zip_path = out_dir / "converted_subtitles.zip"

    used_names = set()
    with zipfile.ZipFile(out_zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in paths:
            doc_bytes, doc_name = srt_to_docx_bytes(path, translate_to_tr)

            # Two uploads may share a file name; a ZIP with duplicate entry
            # names hides one of them when extracted, so add a numeric
            # suffix to later duplicates.
            # NOTE(review): assumes doc_name is a bare file name - confirm.
            entry_name = doc_name
            name_parts = Path(doc_name)
            counter = 2
            while entry_name in used_names:
                entry_name = f"{name_parts.stem}_{counter}{name_parts.suffix}"
                counter += 1
            used_names.add(entry_name)

            zf.writestr(entry_name, doc_bytes)

    return str(out_zip_path)
279
 
280
 
281
+ # ----------------------------------------------------
282
+ # 6) GRADIO UI
283
+ # ----------------------------------------------------
284
 
285
  with gr.Blocks() as demo:
286
  gr.Markdown(
287
  """
288
+ # SRT → DOCX (Character / TC / TEXT) + EN→TR Çeviri
289
+
290
+ - Bir veya birden fazla **.srt** yükle.
291
+ - Her satır için:
292
+ - **Character**: `WOMAN:`, `LEWIS:`, `NURSE:` gibi isimler çıkarılır (**çeviri yok**).
293
+ - **TC**: sadece **MM.SS** (start time'dan).
294
+ - **TEXT**: `NAME:` prefix'leri atılmış metin.
295
+ - İstersen TEXT'i **EN→TR** çevir.
296
+ - Çıktı: Tüm DOCX'leri içeren tek bir **ZIP**.
297
  """
298
  )
299
 
 
301
  srt_files = gr.File(
302
  label="Upload .srt files",
303
  file_types=[".srt"],
304
+ file_count="multiple",
305
+ type="filepath",
306
  )
307
 
308
+ translate_chk = gr.Checkbox(
309
+ label="Translate TEXT (EN → TR, only TEXT, not Character)",
310
+ value=False,
311
+ )
312
+
313
  out_zip = gr.File(label="Download ZIP of DOCX files")
314
 
315
+ convert_btn = gr.Button("Convert")
316
+
317
  convert_btn.click(
318
  fn=process_srt_files,
319
+ inputs=[srt_files, translate_chk],
320
  outputs=out_zip,
321
  )
322