armyneo committed on
Commit
948e65a
·
verified ·
1 Parent(s): f41a98c

fix turkish characters 2

Browse files
Files changed (1) hide show
  1. app.py +46 -123
app.py CHANGED
@@ -91,15 +91,57 @@ def translate_en_tr(text: str) -> str:
91
 
92
 
93
  # ======================================================
94
- # 2) SRT PARSER
95
  # ======================================================
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  def parse_srt(path: Path):
98
  """
99
  SRT -> [{index, start, end, text}, ...]
 
100
  """
101
- # Türkçe karakterleri korumak için utf-8-sig + errors="replace"
102
- raw = path.read_text(encoding="utf-8-sig", errors="replace").strip()
103
  blocks = re.split(r"\n\s*\n", raw)
104
  subs = []
105
 
@@ -264,123 +306,4 @@ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str
264
  """
265
  Tek SRT -> styled DOCX (bytes, filename)
266
  """
267
- subs = parse_srt(srt_path)
268
- doc = Document()
269
-
270
- # TABLE: Character | TC | note | TEXT
271
- table = doc.add_table(rows=1, cols=4)
272
- table.style = "Table Grid"
273
-
274
- hdr_cells = table.rows[0].cells
275
- headers = ["Character", "TC", "note", "TEXT"]
276
- for idx, label in enumerate(headers):
277
- style_header_cell(hdr_cells[idx], label)
278
-
279
- for sub in subs:
280
- raw_text = sub["text"]
281
- if not raw_text.strip():
282
- continue
283
-
284
- character, clean_txt = extract_character_and_clean_text(raw_text)
285
- if not clean_txt.strip():
286
- continue
287
-
288
- row = table.add_row()
289
- cells = row.cells
290
-
291
- # Character (Türkçe harfler dahil; ama sadece yoğun uppercase ise dolduruluyor)
292
- cells[0].text = character
293
-
294
- # TC -> MM.SS
295
- cells[1].text = start_time_to_mm_ss(sub["start"])
296
-
297
- # note -> boş
298
- cells[2].text = ""
299
-
300
- # TEXT -> isteğe bağlı TR çeviri
301
- if translate_to_tr:
302
- cells[3].text = translate_en_tr(clean_txt)
303
- else:
304
- cells[3].text = clean_txt
305
-
306
- buffer = io.BytesIO()
307
- doc.save(buffer)
308
- buffer.seek(0)
309
-
310
- out_name = srt_path.with_suffix(".docx").name
311
- return buffer.getvalue(), out_name
312
-
313
-
314
- # ======================================================
315
- # 5) GRADIO: ÇOKLU SRT -> ZIP(DOCX)
316
- # ======================================================
317
-
318
- def process_srt_files(files, translate_to_tr: bool):
319
- """
320
- Çoklu SRT al, hepsini DOCX'e çevir, tek ZIP döndür.
321
- """
322
- if not files:
323
- return None
324
-
325
- paths = [Path(p) for p in files]
326
-
327
- zip_buffer = io.BytesIO()
328
- with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
329
- for path in paths:
330
- doc_bytes, doc_name = srt_to_docx_bytes(path, translate_to_tr)
331
- zf.writestr(doc_name, doc_bytes)
332
-
333
- zip_buffer.seek(0)
334
- out_zip_path = "converted_subtitles.zip"
335
- with open(out_zip_path, "wb") as f:
336
- f.write(zip_buffer.read())
337
-
338
- return out_zip_path
339
-
340
-
341
- # ======================================================
342
- # 6) GRADIO UI
343
- # ======================================================
344
-
345
- with gr.Blocks() as demo:
346
- gr.Markdown(
347
- """
348
- # SRT → DOCX (Character / TC / TEXT) + EN→TR Çeviri (HF Inference)
349
-
350
- - Bir veya birden fazla **.srt** yükle.
351
- - Her satır için:
352
- - **Character**:
353
- - `WOMAN:`, `DR. GREENE:`, `HEMSİRE SELMA:` gibi *büyük harf ağırlıklı* isimler otomatik alınır.
354
- - Normal Türkçe cümleler (ör. "Doktor: bugün erken geldiniz.") bozulmaz, TEXT'e tam gider.
355
- - **TC**: sadece **MM.SS** (start time).
356
- - **TEXT**: `NAME:` prefix'i speaker olarak algılanamadıysa **tam satır**.
357
- - İstersen TEXT'i **Helsinki-NLP/opus-mt-tc-big-en-tr** ile Türkçe'ye çevir (Character asla çevrilmez).
358
- - Çıktı: Tüm DOCX'leri içeren tek bir **ZIP**.
359
- """
360
- )
361
-
362
- with gr.Row():
363
- srt_files = gr.File(
364
- label="Upload .srt files",
365
- file_types=[".srt"],
366
- file_count="multiple",
367
- type="filepath",
368
- )
369
-
370
- translate_chk = gr.Checkbox(
371
- label="Translate TEXT (EN → TR, only TEXT, not Character)",
372
- value=False,
373
- )
374
-
375
- out_zip = gr.File(label="Download ZIP of DOCX files")
376
-
377
- convert_btn = gr.Button("Convert")
378
-
379
- convert_btn.click(
380
- fn=process_srt_files,
381
- inputs=[srt_files, translate_chk],
382
- outputs=out_zip,
383
- )
384
-
385
- if __name__ == "__main__":
386
- demo.launch()
 
91
 
92
 
93
  # ======================================================
94
+ # 2) SRT PARSER + ENCODING OTOMATİK TESPİTİ
95
  # ======================================================
96
 
97
def _srt_decode_score(text: str) -> int:
    """Return a badness score for decoded text (0 means a clean decode)."""
    # U+FFFD marks bytes the codec could not decode; written as an escape so
    # the check does not depend on how this source file itself is encoded.
    bad_repl = text.count("\ufffd")
    # Control characters other than newline / carriage return / tab are not
    # expected in subtitle text and hint at a wrong encoding guess.
    bad_ctrl = sum(1 for ch in text if ord(ch) < 32 and ch not in "\n\r\t")
    # A replacement character weighs ten times more than a control character.
    return bad_repl * 10 + bad_ctrl


def read_srt_text(path: Path) -> str:
    """
    Read an SRT file as bytes and decode it with the best-fitting encoding.

    A leading UTF-32 / UTF-16 byte-order mark is honoured first, because none
    of the 8-bit candidates below can decode such files sensibly. Otherwise
    the following encodings are tried in order:
      - utf-8-sig
      - utf-8
      - cp1254 (Windows-1254, Turkish)
      - iso-8859-9
      - latin-1

    Every candidate is decoded with errors="replace" and scored by the number
    of replacement characters and unexpected control characters it produces.
    The lowest score wins; ties go to the earlier encoding in the list. This
    yields 'Hastanız' instead of a text with replacement characters for
    Turkish subtitle files saved in a legacy code page.

    Args:
        path: Location of the .srt file.

    Returns:
        The decoded file content.
    """
    raw_bytes = path.read_bytes()

    # UTF-32 LE must be checked before UTF-16 LE: its BOM begins with the
    # same two bytes. The generic "utf-16"/"utf-32" codecs consume the BOM
    # and pick the byte order from it.
    bom_encodings = (
        (b"\xff\xfe\x00\x00", "utf-32"),
        (b"\x00\x00\xfe\xff", "utf-32"),
        (b"\xff\xfe", "utf-16"),
        (b"\xfe\xff", "utf-16"),
    )
    for bom, bom_enc in bom_encodings:
        if raw_bytes.startswith(bom):
            bom_txt = raw_bytes.decode(bom_enc, errors="replace")
            print(
                f"[SRT ENCODING] {path.name}: {bom_enc} "
                f"(score={_srt_decode_score(bom_txt)})"
            )
            return bom_txt

    encodings = ["utf-8-sig", "utf-8", "cp1254", "iso-8859-9", "latin-1"]

    best_txt = None
    best_score = None
    best_enc = None

    for enc in encodings:
        try:
            txt = raw_bytes.decode(enc, errors="replace")
        except LookupError:
            # Codec not available in this Python build; try the next one.
            continue

        score = _srt_decode_score(txt)
        if best_score is None or score < best_score:
            best_score = score
            best_txt = txt
            best_enc = enc

        # A perfect score cannot be beaten (only a strictly lower score
        # replaces the current best), so later candidates are irrelevant.
        if best_score == 0:
            break

    print(f"[SRT ENCODING] {path.name}: {best_enc} (score={best_score})")
    if best_txt is None:
        # Only reachable if every candidate codec was unavailable.
        return raw_bytes.decode("utf-8", errors="replace")
    return best_txt
137
+
138
+
139
  def parse_srt(path: Path):
140
  """
141
  SRT -> [{index, start, end, text}, ...]
142
+ Encoding, read_srt_text ile otomatik tespit edilir (TR charset dahil).
143
  """
144
+ raw = read_srt_text(path).strip()
 
145
  blocks = re.split(r"\n\s*\n", raw)
146
  subs = []
147
 
 
306
  """
307
  Tek SRT -> styled DOCX (bytes, filename)
308
  """
309
+ subs = parse_srt_