"""Gradio app: convert SRT subtitle files into styled DOCX tables.

Each subtitle block becomes one table row (Character | TC | note | TEXT).
Optionally the TEXT column is translated EN -> TR through the Hugging Face
Inference API. All generated DOCX files are returned as a single ZIP.
"""

import io
import os
import re
import tempfile
import time
import zipfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

import gradio as gr
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from huggingface_hub import InferenceClient

# ======================================================
# 1) HUGGING FACE INFERENCE API (EN -> TR TRANSLATION) - BATCH
# ======================================================

HF_MODEL = "Helsinki-NLP/opus-mt-tc-big-en-tr"

# Space -> Settings -> Variables and secrets -> HF_TOKEN
HF_TOKEN = os.environ.get("HF_TOKEN")

# Use the token when one is configured, otherwise fall back to an anonymous client.
client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else InferenceClient()

# Separator placed between the texts of one batch. It is deliberately "weird"
# so that it is very unlikely to occur inside real subtitle text.
_BATCH_SEP = "\n[[BLOCK-SEPARATOR-6b8b4567-ICETEA]]\n"


def _translation_from(obj: Any) -> Optional[str]:
    """Return the translated text carried by *obj*, or None if it has none."""
    if isinstance(obj, str):
        return obj
    if isinstance(obj, dict):
        if "translation_text" in obj:
            return str(obj["translation_text"])
        return None
    value = getattr(obj, "translation_text", None)
    return None if value is None else str(value)


def _extract_translation_text(result: Any) -> str:
    """Normalize the return value of ``InferenceClient.translation`` to a string.

    Accepted shapes:
      - ``str``
      - an object with a ``translation_text`` attribute
      - ``{"translation_text": "..."}``
      - a non-empty list whose first item is any of the above

    Anything else is returned as ``str(result)``.
    """
    text = _translation_from(result)
    if text is None and isinstance(result, list) and result:
        text = _translation_from(result[0])
    return str(result) if text is None else text


def _request_translation(
    text: str, max_retries: int, base_sleep: float
) -> Optional[str]:
    """Call the translation endpoint with retries; return None if every attempt fails."""
    for attempt in range(max_retries + 1):
        try:
            return _extract_translation_text(
                client.translation(text, model=HF_MODEL)
            )
        except Exception as e:  # API boundary: any failure means "keep the original"
            print("HF translation error (batch):", repr(e))
            if attempt < max_retries:
                # Linear back-off: base_sleep, 2 * base_sleep, ...
                time.sleep(base_sleep * (attempt + 1))
    return None


def _translate_batch_en_tr(
    texts: List[str],
    max_batch_size: int = 200,
    max_retries: int = 2,
    base_sleep: float = 2.0,
) -> List[str]:
    """Translate a list of texts EN -> TR using as few requests as possible.

    The texts of one batch are joined with ``_BATCH_SEP``, sent as a single
    request, and the response is split on the same separator.

    Args:
        texts: Original texts.
        max_batch_size: Maximum number of texts per request (values below 1
            are treated as 1).
        max_retries: Number of additional attempts after a failed request.
        base_sleep: Base delay in seconds between attempts.

    Returns:
        A new list of the same length as *texts*. Entries of a batch whose
        request failed, or whose response did not split into the expected
        number of parts, keep their original text.
    """
    result_texts: List[str] = list(texts)
    if not result_texts:
        return result_texts

    step = max(1, max_batch_size)  # range() rejects a step of 0
    for start_idx in range(0, len(result_texts), step):
        batch_texts = texts[start_idx:start_idx + step]

        # Nothing to translate in a batch that is entirely blank.
        if not any(t.strip() for t in batch_texts):
            continue

        translated_joined = _request_translation(
            _BATCH_SEP.join(batch_texts), max_retries, base_sleep
        )
        # The request failed completely: leave this batch untouched.
        if translated_joined is None:
            continue

        parts = translated_joined.split(_BATCH_SEP)
        # NOTE(review): this relies on the model echoing the separator back
        # verbatim and on the joined input fitting the model's input limit.
        # If mismatches show up often in the logs, lower max_batch_size or
        # translate item by item - TODO confirm against real API responses.
        if len(parts) != len(batch_texts):
            print(
                "HF translation: mismatch between batch size and split parts, "
                "keeping original texts for this batch."
            )
            continue

        result_texts[start_idx:start_idx + len(parts)] = parts

    return result_texts


# ======================================================
# 2) SRT PARSER + ENCODING AUTO-DETECT
# ======================================================

_CANDIDATE_ENCODINGS = ("utf-8-sig", "utf-8", "cp1254", "iso-8859-9", "latin-1")
_UTF16_BOMS = (b"\xff\xfe", b"\xfe\xff")

# HH:MM:SS,mmm --> HH:MM:SS,mmm  ('.' is also accepted before the milliseconds)
_TIME_RE = re.compile(
    r"(?P<start>\d{2}:\d{2}:\d{2}[,.]\d{3})\s*-->\s*"
    r"(?P<end>\d{2}:\d{2}:\d{2}[,.]\d{3})"
)


def _decode_badness(txt: str) -> int:
    """Score a decoded text: lower is better.

    Replacement characters (U+FFFD) weigh 10, each unexpected control
    character (C0 except newline/CR/tab, DEL, and the C1 range) weighs 1.
    """
    bad_repl = txt.count("\ufffd")
    bad_ctrl = sum(
        1
        for ch in txt
        if (ord(ch) < 32 and ch not in "\n\r\t") or 0x7F <= ord(ch) <= 0x9F
    )
    return bad_repl * 10 + bad_ctrl


def read_srt_text(path: Path) -> str:
    """Read an SRT file as text, guessing its encoding.

    A UTF-16 byte-order mark is honoured first. Otherwise the raw bytes are
    decoded with each of utf-8-sig, utf-8, cp1254 (Windows-1254, Turkish),
    iso-8859-9 and latin-1, and the decoding that produces the fewest
    replacement / control characters wins (earlier candidates win ties).
    This keeps Turkish characters intact instead of turning them into U+FFFD.
    """
    raw_bytes = path.read_bytes()

    # None of the 8-bit candidates below can decode UTF-16; without this check
    # such files come out full of NUL characters.
    if raw_bytes.startswith(_UTF16_BOMS):
        print(f"[SRT ENCODING] {path.name}: utf-16 (BOM)")
        return raw_bytes.decode("utf-16", errors="replace")

    best_txt: Optional[str] = None
    best_score: Optional[int] = None
    best_enc: Optional[str] = None

    for enc in _CANDIDATE_ENCODINGS:
        try:
            txt = raw_bytes.decode(enc, errors="replace")
        except LookupError:
            # Codec not available in this Python build.
            continue

        score = _decode_badness(txt)
        if best_score is None or score < best_score:
            best_score, best_txt, best_enc = score, txt, enc

    print(f"[SRT ENCODING] {path.name}: {best_enc} (score={best_score})")
    if best_txt is not None:
        return best_txt
    return raw_bytes.decode("utf-8", errors="replace")


def parse_srt(path: Path) -> List[Dict[str, Any]]:
    """Parse an SRT file into ``[{"index", "start", "end", "text"}, ...]``.

    The encoding is detected by ``read_srt_text``. Blocks are separated by
    blank lines. A block may omit its numeric index line, in which case
    ``index`` is None. Blocks without a valid time line are skipped.
    """
    raw = read_srt_text(path)
    # Normalise line endings so CR-only and CRLF files split the same way.
    raw = raw.replace("\r\n", "\n").replace("\r", "\n")
    raw = raw.lstrip("\ufeff").strip()

    subs: List[Dict[str, Any]] = []
    for block in re.split(r"\n\s*\n", raw):
        lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
        if len(lines) < 2:
            continue

        # Classic block:
        #   1
        #   00:00:13,555 --> 00:00:17,559
        #   DR. GREENE: ...
        try:
            idx: Optional[int] = int(lines[0])
            time_line = lines[1]
            text_lines = lines[2:]
        except ValueError:
            # No index line: the block starts directly with the time line.
            idx = None
            time_line = lines[0]
            text_lines = lines[1:]

        m = _TIME_RE.match(time_line)
        if not m:
            continue

        subs.append(
            {
                "index": idx,
                "start": m.group("start"),
                "end": m.group("end"),
                "text": "\n".join(text_lines),
            }
        )

    return subs


# ======================================================
# 3) CHARACTER EXTRACTION + TEXT CLEANUP (TR-SAFE HEURISTIC)
# ======================================================

# Unicode-letter based name word:
#   - [^\W\d_] = any Unicode letter (A-Z, a-z, and Turkish letters as well)
#   - followed by any mix of letters, dots, apostrophes and dashes
name_word = r"[^\W\d_](?:[^\W\d_]|[.'\-])*"

# Optional ">>" / "-" prefix, then one to five name words, a colon, and the rest.
speaker_pattern = re.compile(
    rf'^\s*(?:>{{1,3}}\s*)?(?:-+\s*)?'
    rf'(?P<name>(?:{name_word}(?:\s+{name_word}){{0,4}}))'
    rf'\s*:\s*(?P<after>.*)$',
    flags=re.UNICODE,
)

# Minimum share of uppercase letters for a "NAME:" prefix to count as a speaker tag.
_SPEAKER_UPPERCASE_RATIO = 0.8


def looks_like_speaker_name(name: str) -> bool:
    """Return True when at least 80% of the letters in *name* are uppercase.

    Examples:
        "DR. GREENE"     -> True
        "HEMSİRE SELMA"  -> True
        "Doktor"         -> False
        "Merhaba"        -> False
    """
    letters = [ch for ch in name if ch.isalpha()]
    if not letters:
        return False
    upper_count = sum(1 for ch in letters if ch.isupper())
    return upper_count / len(letters) >= _SPEAKER_UPPERCASE_RATIO


def extract_character_and_clean_text(block: str) -> Tuple[str, str]:
    """Split a subtitle block into (character, text).

    - character: the first ``NAME:`` prefix whose name is mostly uppercase,
      or "" when the block has none.
    - text: the block with every such speaker prefix removed.

    Lines that merely contain a colon after ordinary words (e.g. a normal
    sentence in a Turkish SRT) are kept unchanged.
    """
    if not block:
        return "", ""

    character = ""
    out_lines: List[str] = []

    for line in block.splitlines():
        original = line.strip()
        if not original:
            continue

        m = speaker_pattern.match(original)
        if m:
            name = m.group("name").strip()
            if looks_like_speaker_name(name):
                if not character:
                    character = name
                after = m.group("after").rstrip()
                if after:
                    out_lines.append(after)
                # The speaker tag itself is dropped from the text.
                continue

        # Not a speaker line: keep it as is.
        out_lines.append(original)

    return character, "\n".join(out_lines)


def start_time_to_mm_ss(start: str) -> str:
    """Convert ``'HH:MM:SS,mmm'`` to ``'MM.SS'`` (hours are folded into minutes).

    Example: ``'01:02:03,500'`` -> ``'62.03'``.
    """
    hms = re.split(r"[,.]", start, maxsplit=1)[0]
    h, m, s = (int(x) for x in hms.split(":"))
    total_minutes, seconds = divmod(h * 3600 + m * 60 + s, 60)
    return f"{total_minutes:02d}.{seconds:02d}"


# ======================================================
# 4) DOCX GENERATION
# ======================================================

# Control characters that are not allowed in XML text; python-docx (lxml)
# raises ValueError when asked to store them.
_XML_ILLEGAL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")


def _xml_safe(text: str) -> str:
    """Remove characters that cannot be stored in a DOCX (XML) text node."""
    return _XML_ILLEGAL_RE.sub("", text)


def style_header_cell(cell, text: str):
    """Make *cell* a header cell: bold *text* on a light grey background."""
    p = cell.paragraphs[0]
    for r in p.runs:
        r.text = ""
    run = p.add_run(text)
    run.bold = True

    tcPr = cell._tc.get_or_add_tcPr()
    shd = tcPr.find(qn("w:shd"))
    if shd is None:
        shd = OxmlElement("w:shd")
        # The WordprocessingML schema lists w:val as a required attribute of
        # w:shd; "clear" means "no pattern, just the fill colour".
        shd.set(qn("w:val"), "clear")
        shd.set(qn("w:color"), "auto")
        tcPr.append(shd)
    shd.set(qn("w:fill"), "D9D9D9")  # light grey


def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str]:
    """Convert one SRT file into a styled DOCX.

    Args:
        srt_path: Path of the SRT file.
        translate_to_tr: When False the HF API is never called.

    Returns:
        ``(docx_bytes, file_name)`` where *file_name* is the SRT name with a
        ``.docx`` suffix.
    """
    subs = parse_srt(srt_path)

    doc = Document()

    # TABLE: Character | TC | note | TEXT
    table = doc.add_table(rows=1, cols=4)
    table.style = "Table Grid"

    hdr_cells = table.rows[0].cells
    for idx, label in enumerate(["Character", "TC", "note", "TEXT"]):
        style_header_cell(hdr_cells[idx], label)

    # Collect all rows first so the translation can be done in bulk.
    characters: List[str] = []
    tcs: List[str] = []
    texts: List[str] = []

    for sub in subs:
        raw_text = sub["text"]
        if not raw_text.strip():
            continue

        character, clean_txt = extract_character_and_clean_text(raw_text)
        if not clean_txt.strip():
            continue

        characters.append(character)
        tcs.append(start_time_to_mm_ss(sub["start"]))
        texts.append(clean_txt)

    # Checkbox not ticked: no translation at all (no HF API call).
    if translate_to_tr:
        texts = _translate_batch_en_tr(texts)

    for character, tc, text in zip(characters, tcs, texts):
        cells = table.add_row().cells
        cells[0].text = _xml_safe(character)  # Character (never translated)
        cells[1].text = tc                    # TC (MM.SS)
        cells[2].text = ""                    # note
        cells[3].text = _xml_safe(text)       # TEXT (TR if translated, else original)

    buffer = io.BytesIO()
    doc.save(buffer)

    out_name = srt_path.with_suffix(".docx").name
    return buffer.getvalue(), out_name


# ======================================================
# 5) GRADIO: MULTI SRT -> ZIP(DOCX)
# ======================================================

def _unique_name(name: str, used: Set[str]) -> str:
    """Return *name*, or ``stem (N).suffix`` if *name* is already in *used*."""
    candidate = name
    stem, suffix = Path(name).stem, Path(name).suffix
    counter = 1
    while candidate in used:
        counter += 1
        candidate = f"{stem} ({counter}){suffix}"
    used.add(candidate)
    return candidate


def process_srt_files(files, translate_to_tr: bool):
    """Convert several SRT files to DOCX and return the path of one ZIP.

    Args:
        files: File paths of the uploaded SRT files (may be empty/None).
        translate_to_tr: When False the HF API is never called.

    Returns:
        Path (str) of the generated ``converted_subtitles.zip``, or None when
        no files were given.
    """
    if not files:
        return None

    translate = bool(translate_to_tr)

    # A fresh directory per request keeps concurrent requests from
    # overwriting each other's archive.
    out_dir = Path(tempfile.mkdtemp(prefix="srt2docx_"))
    out_zip_path = out_dir / "converted_subtitles.zip"

    used_names: Set[str] = set()
    with zipfile.ZipFile(out_zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for path in map(Path, files):
            doc_bytes, doc_name = srt_to_docx_bytes(path, translate)
            # Two uploads with the same stem must not collide inside the ZIP.
            zf.writestr(_unique_name(doc_name, used_names), doc_bytes)

    return str(out_zip_path)


# ======================================================
# 6) GRADIO UI
# ======================================================

_UI_DESCRIPTION = """
# SRT → DOCX (Character / TC / TEXT) + EN→TR (HF Inference + Token)

- Bir veya birden fazla **.srt** yükle.
- Encoding otomatik tespit edilir (UTF-8, Windows-1254, ISO-8859-9, Latin-1).
- Her subtitle bloğu için:
  - **Character**:
    - `WOMAN:`, `DR. GREENE:`, `HEMSİRE SELMA:` gibi *büyük harf ağırlıklı* isimler → Character.
    - Normal Türkçe cümleler -> Character boş, TEXT olduğu gibi.
  - **TC**: başlangıç zamanı **MM.SS**.
  - **TEXT**: gövde metin, gerçek speaker tag'leri temizlenmiş.
- **Translate TEXT** işaretliyse, sadece TEXT alanı
  `Helsinki-NLP/opus-mt-tc-big-en-tr` ile EN→TR çevrilir (Character asla çevrilmez).
- Çıktı: Tüm DOCX'leri içeren tek bir **ZIP** dosya.
"""

with gr.Blocks() as demo:
    gr.Markdown(_UI_DESCRIPTION)

    with gr.Row():
        srt_files = gr.File(
            label="Upload .srt files",
            file_types=[".srt"],
            file_count="multiple",
            type="filepath",
        )

    translate_chk = gr.Checkbox(
        label="Translate TEXT (EN → TR, only TEXT, not Character)",
        value=False,
    )

    out_zip = gr.File(label="Download ZIP of DOCX files")
    convert_btn = gr.Button("Convert")

    convert_btn.click(
        fn=process_srt_files,
        inputs=[srt_files, translate_chk],
        outputs=out_zip,
    )

if __name__ == "__main__":
    demo.launch()