Spaces:
Running
Running
fix turkish characters 2
Browse files
app.py
CHANGED
|
@@ -91,15 +91,57 @@ def translate_en_tr(text: str) -> str:
|
|
| 91 |
|
| 92 |
|
| 93 |
# ======================================================
|
| 94 |
-
# 2) SRT PARSER
|
| 95 |
# ======================================================
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
def parse_srt(path: Path):
|
| 98 |
"""
|
| 99 |
SRT -> [{index, start, end, text}, ...]
|
|
|
|
| 100 |
"""
|
| 101 |
-
|
| 102 |
-
raw = path.read_text(encoding="utf-8-sig", errors="replace").strip()
|
| 103 |
blocks = re.split(r"\n\s*\n", raw)
|
| 104 |
subs = []
|
| 105 |
|
|
@@ -264,123 +306,4 @@ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str
|
|
| 264 |
"""
|
| 265 |
Tek SRT -> styled DOCX (bytes, filename)
|
| 266 |
"""
|
| 267 |
-
subs =
|
| 268 |
-
doc = Document()
|
| 269 |
-
|
| 270 |
-
# TABLE: Character | TC | note | TEXT
|
| 271 |
-
table = doc.add_table(rows=1, cols=4)
|
| 272 |
-
table.style = "Table Grid"
|
| 273 |
-
|
| 274 |
-
hdr_cells = table.rows[0].cells
|
| 275 |
-
headers = ["Character", "TC", "note", "TEXT"]
|
| 276 |
-
for idx, label in enumerate(headers):
|
| 277 |
-
style_header_cell(hdr_cells[idx], label)
|
| 278 |
-
|
| 279 |
-
for sub in subs:
|
| 280 |
-
raw_text = sub["text"]
|
| 281 |
-
if not raw_text.strip():
|
| 282 |
-
continue
|
| 283 |
-
|
| 284 |
-
character, clean_txt = extract_character_and_clean_text(raw_text)
|
| 285 |
-
if not clean_txt.strip():
|
| 286 |
-
continue
|
| 287 |
-
|
| 288 |
-
row = table.add_row()
|
| 289 |
-
cells = row.cells
|
| 290 |
-
|
| 291 |
-
# Character (Türkçe harfler dahil; ama sadece yoğun uppercase ise dolduruluyor)
|
| 292 |
-
cells[0].text = character
|
| 293 |
-
|
| 294 |
-
# TC -> MM.SS
|
| 295 |
-
cells[1].text = start_time_to_mm_ss(sub["start"])
|
| 296 |
-
|
| 297 |
-
# note -> boş
|
| 298 |
-
cells[2].text = ""
|
| 299 |
-
|
| 300 |
-
# TEXT -> isteğe bağlı TR çeviri
|
| 301 |
-
if translate_to_tr:
|
| 302 |
-
cells[3].text = translate_en_tr(clean_txt)
|
| 303 |
-
else:
|
| 304 |
-
cells[3].text = clean_txt
|
| 305 |
-
|
| 306 |
-
buffer = io.BytesIO()
|
| 307 |
-
doc.save(buffer)
|
| 308 |
-
buffer.seek(0)
|
| 309 |
-
|
| 310 |
-
out_name = srt_path.with_suffix(".docx").name
|
| 311 |
-
return buffer.getvalue(), out_name
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
# ======================================================
|
| 315 |
-
# 5) GRADIO: ÇOKLU SRT -> ZIP(DOCX)
|
| 316 |
-
# ======================================================
|
| 317 |
-
|
| 318 |
-
def process_srt_files(files, translate_to_tr: bool):
    """
    Convert every uploaded SRT file to DOCX and bundle the results in one ZIP.

    Args:
        files: Iterable of SRT file paths (anything accepted by ``Path``).
            ``None`` or an empty collection means there is nothing to do.
        translate_to_tr: Forwarded unchanged to ``srt_to_docx_bytes`` for
            every file.

    Returns:
        Path (str) of the written ZIP archive, or ``None`` when ``files`` is
        empty. The archive is always named ``converted_subtitles.zip`` but is
        placed in a fresh temporary directory on every call.
    """
    if not files:
        return None

    import tempfile

    # A fixed relative output path would be shared by every request, so two
    # concurrent conversions could overwrite each other's archive. A private
    # directory per call keeps the file name stable while avoiding the clash.
    # NOTE(review): the directory is not removed here; cleanup is left to the
    # host environment — confirm this is acceptable for the deployment.
    out_dir = Path(tempfile.mkdtemp(prefix="srt2docx_"))
    out_zip_path = out_dir / "converted_subtitles.zip"

    used_names = set()
    with zipfile.ZipFile(out_zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in (Path(p) for p in files):
            doc_bytes, doc_name = srt_to_docx_bytes(path, translate_to_tr)

            # Uploads with the same base name would otherwise create duplicate
            # archive members; append a counter to keep every entry distinct.
            member_name = doc_name
            suffix_no = 1
            while member_name in used_names:
                suffix_no += 1
                original = Path(doc_name)
                member_name = f"{original.stem}_{suffix_no}{original.suffix}"
            used_names.add(member_name)

            zf.writestr(member_name, doc_bytes)

    return str(out_zip_path)
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
# ======================================================
|
| 342 |
-
# 6) GRADIO UI
|
| 343 |
-
# ======================================================
|
| 344 |
-
|
| 345 |
-
with gr.Blocks() as demo:
|
| 346 |
-
gr.Markdown(
|
| 347 |
-
"""
|
| 348 |
-
# SRT → DOCX (Character / TC / TEXT) + EN→TR Çeviri (HF Inference)
|
| 349 |
-
|
| 350 |
-
- Bir veya birden fazla **.srt** yükle.
|
| 351 |
-
- Her satır için:
|
| 352 |
-
- **Character**:
|
| 353 |
-
- `WOMAN:`, `DR. GREENE:`, `HEMSİRE SELMA:` gibi *büyük harf ağırlıklı* isimler otomatik alınır.
|
| 354 |
-
- Normal Türkçe cümleler (ör. "Doktor: bugün erken geldiniz.") bozulmaz, TEXT'e tam gider.
|
| 355 |
-
- **TC**: sadece **MM.SS** (start time).
|
| 356 |
-
- **TEXT**: `NAME:` prefix'i speaker olarak algılanamadıysa **tam satır**.
|
| 357 |
-
- İstersen TEXT'i **Helsinki-NLP/opus-mt-tc-big-en-tr** ile Türkçe'ye çevir (Character asla çevrilmez).
|
| 358 |
-
- Çıktı: Tüm DOCX'leri içeren tek bir **ZIP**.
|
| 359 |
-
"""
|
| 360 |
-
)
|
| 361 |
-
|
| 362 |
-
with gr.Row():
|
| 363 |
-
srt_files = gr.File(
|
| 364 |
-
label="Upload .srt files",
|
| 365 |
-
file_types=[".srt"],
|
| 366 |
-
file_count="multiple",
|
| 367 |
-
type="filepath",
|
| 368 |
-
)
|
| 369 |
-
|
| 370 |
-
translate_chk = gr.Checkbox(
|
| 371 |
-
label="Translate TEXT (EN → TR, only TEXT, not Character)",
|
| 372 |
-
value=False,
|
| 373 |
-
)
|
| 374 |
-
|
| 375 |
-
out_zip = gr.File(label="Download ZIP of DOCX files")
|
| 376 |
-
|
| 377 |
-
convert_btn = gr.Button("Convert")
|
| 378 |
-
|
| 379 |
-
convert_btn.click(
|
| 380 |
-
fn=process_srt_files,
|
| 381 |
-
inputs=[srt_files, translate_chk],
|
| 382 |
-
outputs=out_zip,
|
| 383 |
-
)
|
| 384 |
-
|
| 385 |
-
if __name__ == "__main__":
|
| 386 |
-
demo.launch()
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
# ======================================================
|
| 94 |
+
# 2) SRT PARSER + ENCODING OTOMATİK TESPİTİ
|
| 95 |
# ======================================================
|
| 96 |
|
| 97 |
+
def read_srt_text(path: Path) -> str:
    """
    Read an SRT file as raw bytes and decode it with the best-fitting encoding.

    A UTF-32 or UTF-16 byte-order mark, when present and the content decodes
    cleanly, decides the encoding outright. Otherwise these candidates are
    tried in order:
      - utf-8-sig
      - utf-8
      - cp1254 (Windows-1254, Turkish)
      - iso-8859-9
      - latin-1

    Each candidate is scored as (number of U+FFFD replacement characters) * 10
    plus the number of control characters other than newline, carriage return
    and tab. The lowest score wins and ties go to the earlier candidate, so
    Turkish letters such as the dotless i in "Hastanız" survive instead of
    turning into replacement characters.

    Args:
        path: Location of the .srt file.

    Returns:
        The decoded file content.
    """
    raw_bytes = path.read_bytes()

    # UTF-32 marks are checked first because the UTF-32 LE mark starts with
    # the same two bytes as the UTF-16 LE mark.
    bom_encodings = (
        (b"\xff\xfe\x00\x00", "utf-32"),
        (b"\x00\x00\xfe\xff", "utf-32"),
        (b"\xff\xfe", "utf-16"),
        (b"\xfe\xff", "utf-16"),
    )
    for bom, enc in bom_encodings:
        if not raw_bytes.startswith(bom):
            continue
        try:
            txt = raw_bytes.decode(enc)
        except UnicodeDecodeError:
            # Looked like a BOM but the payload is not valid for that
            # encoding; fall back to the scoring heuristic below.
            continue
        print(f"[SRT ENCODING] {path.name}: {enc} (BOM)")
        return txt

    encodings = ["utf-8-sig", "utf-8", "cp1254", "iso-8859-9", "latin-1"]

    best_txt = None
    best_score = None
    best_enc = None

    for enc in encodings:
        try:
            txt = raw_bytes.decode(enc, errors="replace")
        except LookupError:
            # Codec not available in this interpreter; skip the candidate.
            continue

        # Replacement characters plus unexpected control characters.
        bad_repl = txt.count("\ufffd")
        bad_ctrl = sum(
            1 for ch in txt
            if ord(ch) < 32 and ch not in "\n\r\t"
        )
        score = bad_repl * 10 + bad_ctrl

        if best_score is None or score < best_score:
            best_score = score
            best_txt = txt
            best_enc = enc

        # Only a strictly lower score can replace the current best, so a
        # perfect score cannot be beaten: stop decoding further candidates.
        if best_score == 0:
            break

    print(f"[SRT ENCODING] {path.name}: {best_enc} (score={best_score})")
    return best_txt if best_txt is not None else raw_bytes.decode("utf-8", errors="replace")
|
| 137 |
+
|
| 138 |
+
|
| 139 |
def parse_srt(path: Path):
|
| 140 |
"""
|
| 141 |
SRT -> [{index, start, end, text}, ...]
|
| 142 |
+
Encoding, read_srt_text ile otomatik tespit edilir (TR charset dahil).
|
| 143 |
"""
|
| 144 |
+
raw = read_srt_text(path).strip()
|
|
|
|
| 145 |
blocks = re.split(r"\n\s*\n", raw)
|
| 146 |
subs = []
|
| 147 |
|
|
|
|
| 306 |
"""
|
| 307 |
Tek SRT -> styled DOCX (bytes, filename)
|
| 308 |
"""
|
| 309 |
+
subs = parse_srt_
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|