Spaces:
Running
Running
| import re | |
| import io | |
| import zipfile | |
| from pathlib import Path | |
| from typing import Tuple, Any, Optional, List | |
| import os | |
| import time | |
| import gradio as gr | |
| from docx import Document | |
| from docx.oxml import OxmlElement | |
| from docx.oxml.ns import qn | |
| from huggingface_hub import InferenceClient | |
| # ====================================================== | |
| # 1) HUGGING FACE INFERENCE API (EN -> TR ÇEVİRİ) - BATCH | |
| # ====================================================== | |
HF_MODEL = "Helsinki-NLP/opus-mt-tc-big-en-tr"

# The access token is read from the environment
# (on a Space: Settings → Variables and secrets → HF_TOKEN).
HF_TOKEN = os.getenv("HF_TOKEN")

# Use the token when one is configured; otherwise fall back to an
# anonymous client.
client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else InferenceClient()
| def _extract_translation_text(result: Any) -> str: | |
| """ | |
| InferenceClient.translation dönüş tipini normalize et: | |
| - str | |
| - obj.translation_text | |
| - {"translation_text": "..."} | |
| - [{"translation_text": "..."}] | |
| """ | |
| if isinstance(result, str): | |
| return result | |
| if hasattr(result, "translation_text"): | |
| try: | |
| return result.translation_text # type: ignore[attr-defined] | |
| except Exception: | |
| pass | |
| if isinstance(result, dict) and "translation_text" in result: | |
| return str(result["translation_text"]) | |
| if isinstance(result, list) and result: | |
| item = result[0] | |
| if isinstance(item, str): | |
| return item | |
| if isinstance(item, dict) and "translation_text" in item: | |
| return str(item["translation_text"]) | |
| if hasattr(item, "translation_text"): | |
| try: | |
| return item.translation_text # type: ignore[attr-defined] | |
| except Exception: | |
| pass | |
| return str(result) | |
| def _translate_batch_en_tr( | |
| texts: List[str], | |
| max_batch_size: int = 200, | |
| max_retries: int = 2, | |
| base_sleep: float = 2.0, | |
| ) -> List[str]: | |
| """ | |
| Çoklu TEXT listesi alır, en az istekle EN->TR çevirir. | |
| - texts: orijinal metin listesi | |
| - return: aynı uzunlukta, çevrilmiş (veya hata durumunda orijinal) metin listesi | |
| """ | |
| if not texts: | |
| return texts | |
| result_texts: List[str] = list(texts) | |
| # Çok düşük olasılıkla metin içinde geçebilecek, "garip" bir ayracı seçiyoruz | |
| SEP = "\n[[BLOCK-SEPARATOR-6b8b4567-ICETEA]]\n" | |
| n = len(texts) | |
| for start_idx in range(0, n, max_batch_size): | |
| end_idx = min(start_idx + max_batch_size, n) | |
| batch_indices = list(range(start_idx, end_idx)) | |
| batch_texts = [texts[i] for i in batch_indices] | |
| # Tamamen boş batch ise atla | |
| if not any(t.strip() for t in batch_texts): | |
| continue | |
| joined = SEP.join(batch_texts) | |
| translated_joined: Optional[str] = None | |
| for attempt in range(max_retries + 1): | |
| try: | |
| resp = client.translation(joined, model=HF_MODEL) | |
| translated_joined = _extract_translation_text(resp) | |
| break | |
| except Exception as e: | |
| print("HF translation error (batch):", repr(e)) | |
| if attempt < max_retries: | |
| time.sleep(base_sleep * (attempt + 1)) | |
| else: | |
| translated_joined = None | |
| # Çeviri tamamen patladıysa: bu batch orijinal kalsın | |
| if translated_joined is None: | |
| continue | |
| parts = translated_joined.split(SEP) | |
| # Ayracı model bozduysa / sayılar tutmazsa -> batch orijinal kalsın | |
| if len(parts) != len(batch_texts): | |
| print( | |
| "HF translation: mismatch between batch size and split parts, " | |
| "keeping original texts for this batch." | |
| ) | |
| continue | |
| # Başarılı: result_texts içine yaz | |
| for i, part in zip(batch_indices, parts): | |
| result_texts[i] = part | |
| return result_texts | |
| # ====================================================== | |
| # 2) SRT PARSER + ENCODING AUTO-DETECT | |
| # ====================================================== | |
def read_srt_text(path: Path) -> str:
    """Read an SRT file as text, picking the most plausible encoding.

    The file is read as bytes and decoded (with ``errors="replace"``) using
    each of: utf-8-sig, utf-8, cp1254 (Windows-1254, Turkish), iso-8859-9
    and latin-1. Every decoding gets a penalty of 10 per U+FFFD replacement
    character plus 1 per control character other than newline, carriage
    return and tab. The decoding with the lowest penalty wins; on a tie the
    earlier candidate is kept. The chosen encoding and its score are printed.
    """
    data = path.read_bytes()
    harmless_controls = "\n\r\t"

    winner = None  # (score, encoding, text) of the best candidate so far
    for codec in ("utf-8-sig", "utf-8", "cp1254", "iso-8859-9", "latin-1"):
        try:
            decoded = data.decode(codec, errors="replace")
        except LookupError:
            # Codec not available in this interpreter.
            continue

        replacements = decoded.count("\ufffd")
        controls = len(
            [c for c in decoded if c < " " and c not in harmless_controls]
        )
        penalty = 10 * replacements + controls

        # Strict "<" keeps the earlier candidate on ties.
        if winner is None or penalty < winner[0]:
            winner = (penalty, codec, decoded)

    best_score, best_enc, best_txt = winner if winner else (None, None, None)
    print(f"[SRT ENCODING] {path.name}: {best_enc} (score={best_score})")

    if best_txt is None:
        return data.decode("utf-8", errors="replace")
    return best_txt
def parse_srt(path: Path):
    """Parse an SRT file into a list of cue dictionaries.

    The file content comes from ``read_srt_text``. Blocks are separated by
    blank lines. A block normally looks like::

        1
        00:00:13,555 --> 00:00:17,559
        DR. GREENE: ...

    When the first line is not an integer, it is treated as the timing line
    and ``index`` is ``None``. Blocks with fewer than two non-empty lines,
    or whose timing line does not match, are skipped.

    Returns:
        A list of ``{"index", "start", "end", "text"}`` dicts, where
        ``text`` is the remaining lines joined with newlines.
    """
    timing = re.compile(
        r"(?P<start>\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*"
        r"(?P<end>\d{2}:\d{2}:\d{2},\d{3})"
    )

    content = read_srt_text(path).strip()
    cues = []

    for chunk in re.split(r"\n\s*\n", content):
        rows = [row for row in (ln.strip() for ln in chunk.splitlines()) if row]
        if len(rows) < 2:
            continue

        try:
            number = int(rows[0])
        except ValueError:
            # No numeric index line: the block starts with the timing line.
            number, timing_row, body = None, rows[0], rows[1:]
        else:
            timing_row, body = rows[1], rows[2:]

        hit = timing.match(timing_row)
        if hit is None:
            continue

        cues.append(
            {
                "index": number,
                "start": hit.group("start"),
                "end": hit.group("end"),
                "text": "\n".join(body),
            }
        )

    return cues
| # ====================================================== | |
| # 3) KARAKTER ÇIKARMA + TEXT TEMİZLEME (TR-SAFE HEURISTIC) | |
| # ====================================================== | |
# A "name word" starts with a Unicode letter and may continue with letters,
# dots, apostrophes or hyphens:
# - [^\W\d_] = any word character that is neither a digit nor an underscore,
#   i.e. any Unicode letter (A-Z, a-z, Ç, Ğ, İ, Ö, Ş, Ü, ç, ğ, ı, ö, ş, ü, ...)
# - the continuation must be written as an alternation; putting . ' - inside
#   the negated class would EXCLUDE them, so "DR." could never match.
name_word = r"[^\W\d_](?:[^\W\d_]|[.'-])*"

# Optional ">" / "-" lead-in, one to five name words, a colon, then the rest
# of the line (captured as "after").
speaker_pattern = re.compile(
    rf"^\s*(?:>{{1,3}}\s*)?(?:-+\s*)?"
    rf"(?P<name>(?:{name_word}(?:\s+{name_word}){{0,4}}))"
    rf"\s*:\s*(?P<after>.*)$",
    flags=re.UNICODE,
)
def looks_like_speaker_name(name: str) -> bool:
    """Return True when at least 80% of the letters in *name* are uppercase.

    Non-letter characters are ignored; a name without any letters is never
    a speaker tag. Examples:

        "DR. GREENE"     -> True
        "HEMSİRE SELMA"  -> True
        "Doktor"         -> False
        "Merhaba"        -> False
    """
    letter_total = 0
    upper_total = 0
    for ch in name:
        if ch.isalpha():
            letter_total += 1
            if ch.isupper():
                upper_total += 1

    if letter_total == 0:
        return False

    # 80%+ uppercase letters -> treat as a speaker tag.
    return upper_total / letter_total >= 0.8
def extract_character_and_clean_text(block: str):
    """Split a subtitle block into ``(character, text)``.

    - ``character`` is the first ``NAME:`` prefix whose name passes
      ``looks_like_speaker_name`` (mostly uppercase), or ``""`` if none.
    - ``text`` is the block with such prefixes removed; every other line is
      kept (stripped), and blank lines are dropped.

    A block of ordinary sentences without speaker tags therefore yields an
    empty character and its own text. An empty block yields ``("", "")``.
    """
    if not block:
        return "", ""

    speaker = ""
    kept = []

    for stripped in (raw.strip() for raw in block.splitlines()):
        if not stripped:
            continue

        match = speaker_pattern.match(stripped)
        candidate = match.group("name").strip() if match else ""

        if match is None or not looks_like_speaker_name(candidate):
            # Not a speaker tag: keep the line as it is.
            kept.append(stripped)
            continue

        # Only the first speaker found in the block is reported.
        speaker = speaker or candidate

        # Keep what follows the tag; the tagged line itself is not kept.
        remainder = match.group("after").rstrip()
        if remainder:
            kept.append(remainder)

    return speaker, "\n".join(ln for ln in kept if ln.strip())
def start_time_to_mm_ss(start: str) -> str:
    """Convert ``'HH:MM:SS,mmm'`` to ``'MM.SS'``.

    Hours are folded into the minutes (``01:02:03`` becomes ``62.03``) and
    the millisecond part is discarded.
    """
    clock = start.split(",")[0]
    hours, minutes, secs = map(int, clock.split(":"))
    whole_minutes, rest = divmod(hours * 3600 + minutes * 60 + secs, 60)
    return f"{whole_minutes:02d}.{rest:02d}"
| # ====================================================== | |
| # 4) DOCX OLUŞTURMA | |
| # ====================================================== | |
def style_header_cell(cell, text: str):
    """Render *text* in *cell* as a header: bold on a light grey background."""
    paragraph = cell.paragraphs[0]

    # Blank out any existing runs, then add the header text as a bold run.
    for existing in paragraph.runs:
        existing.text = ""
    paragraph.add_run(text).bold = True

    # Reuse the cell's <w:shd> element if present, otherwise create one,
    # and set its fill colour.
    cell_props = cell._tc.get_or_add_tcPr()
    shading = cell_props.find(qn("w:shd"))
    if shading is None:
        shading = OxmlElement("w:shd")
        cell_props.append(shading)
    shading.set(qn("w:fill"), "D9D9D9")  # light grey
def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str]:
    """Convert one SRT file into a styled DOCX.

    The document holds a single table with the columns
    Character | TC | note | TEXT, one row per subtitle that has text left
    after speaker tags are removed.

    Args:
        srt_path: Path of the SRT file.
        translate_to_tr: When truthy, the TEXT column is passed through
            ``_translate_batch_en_tr``. When falsy, that function is not
            called at all.

    Returns:
        ``(docx_bytes, filename)`` where the filename is the SRT name with a
        ``.docx`` suffix.
    """
    cues = parse_srt(srt_path)

    document = Document()
    table = document.add_table(rows=1, cols=4)
    table.style = "Table Grid"

    header_cells = table.rows[0].cells
    for column, label in enumerate(["Character", "TC", "note", "TEXT"]):
        style_header_cell(header_cells[column], label)

    # Collect all rows first so the translation can run as one batch call.
    pending = []  # (character, timecode, text)
    for cue in cues:
        body = cue["text"]
        if not body.strip():
            continue
        speaker, cleaned = extract_character_and_clean_text(body)
        if not cleaned.strip():
            continue
        pending.append((speaker, start_time_to_mm_ss(cue["start"]), cleaned))

    final_texts: List[str] = [text for _, _, text in pending]
    if translate_to_tr:
        final_texts = _translate_batch_en_tr(final_texts)

    for (speaker, timecode, _), text in zip(pending, final_texts):
        row_cells = table.add_row().cells
        row_cells[0].text = speaker   # Character (never translated)
        row_cells[1].text = timecode  # TC (MM.SS)
        row_cells[2].text = ""        # note
        row_cells[3].text = text      # TEXT (translated, or the original)

    stream = io.BytesIO()
    document.save(stream)
    return stream.getvalue(), srt_path.with_suffix(".docx").name
| # ====================================================== | |
| # 5) GRADIO: MULTI SRT -> ZIP(DOCX) | |
| # ====================================================== | |
def process_srt_files(files, translate_to_tr: bool):
    """Convert several SRT files to DOCX and bundle them into one ZIP.

    Args:
        files: Paths of the uploaded SRT files; falsy means nothing to do.
        translate_to_tr: Forwarded (as a bool) to ``srt_to_docx_bytes``.

    Returns:
        Path (``str``) of a ``converted_subtitles.zip`` file, or ``None``
        when no files were given.

    The ZIP is written into a fresh temporary directory per call instead of
    a fixed path in the working directory, so simultaneous requests cannot
    overwrite each other's result. The directory is not removed here.
    """
    import tempfile  # local import: only this function needs it

    if not files:
        return None

    used_names = set()

    def unique_name(name: str) -> str:
        # Different uploads may share a file name; give later ones a numeric
        # suffix so the archive never contains duplicate entries.
        candidate = name
        counter = 1
        while candidate in used_names:
            counter += 1
            as_path = Path(name)
            candidate = f"{as_path.stem}_{counter}{as_path.suffix}"
        used_names.add(candidate)
        return candidate

    # Build the archive in memory first, so a failing conversion leaves no
    # partial file on disk.
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in (Path(p) for p in files):
            doc_bytes, doc_name = srt_to_docx_bytes(path, bool(translate_to_tr))
            zf.writestr(unique_name(doc_name), doc_bytes)

    out_dir = Path(tempfile.mkdtemp(prefix="srt2docx_"))
    out_zip_path = out_dir / "converted_subtitles.zip"
    out_zip_path.write_bytes(zip_buffer.getvalue())
    return str(out_zip_path)
| # ====================================================== | |
| # 6) GRADIO UI | |
| # ====================================================== | |
# Gradio UI: .srt uploads and a "translate" checkbox in, one ZIP of DOCX
# files out.
# NOTE(review): the source was flattened (no indentation), so the leading
# whitespace inside the Markdown string and the extent of the gr.Row() block
# are reconstructed here — confirm against the deployed app.
with gr.Blocks() as demo:
    # Usage notes shown on the page (user-facing text, kept as written).
    gr.Markdown(
        """
        # SRT → DOCX (Character / TC / TEXT) + EN→TR (HF Inference + Token)
        - Bir veya birden fazla **.srt** yükle.
        - Encoding otomatik tespit edilir (UTF-8, Windows-1254, ISO-8859-9, Latin-1).
        - Her subtitle bloğu için:
          - **Character**:
            - `WOMAN:`, `DR. GREENE:`, `HEMSİRE SELMA:` gibi *büyük harf ağırlıklı* isimler → Character.
            - Normal Türkçe cümleler -> Character boş, TEXT olduğu gibi.
          - **TC**: başlangıç zamanı **MM.SS**.
          - **TEXT**: gövde metin, gerçek speaker tag'leri temizlenmiş.
        - **Translate TEXT** işaretliyse, sadece TEXT alanı `Helsinki-NLP/opus-mt-tc-big-en-tr` ile EN→TR çevrilir
          (Character asla çevrilmez).
        - Çıktı: Tüm DOCX'leri içeren tek bir **ZIP** dosya.
        """
    )
    with gr.Row():
        # Multiple .srt uploads, handed to the callback as file paths.
        srt_files = gr.File(
            label="Upload .srt files",
            file_types=[".srt"],
            file_count="multiple",
            type="filepath",
        )
        # Unchecked by default; its value is passed to process_srt_files as
        # translate_to_tr.
        translate_chk = gr.Checkbox(
            label="Translate TEXT (EN → TR, only TEXT, not Character)",
            value=False,
        )
    out_zip = gr.File(label="Download ZIP of DOCX files")
    convert_btn = gr.Button("Convert")
    # Clicking the button runs process_srt_files(files, translate flag) and
    # shows the returned ZIP in out_zip.
    convert_btn.click(
        fn=process_srt_files,
        inputs=[srt_files, translate_chk],
        outputs=out_zip,
    )

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()