Spaces:

armyneo
/

srtconvert

Running

App Files Files Community

srtconvert / app.py

armyneo

update to checkbox

db31285 verified about 2 months ago

raw

history blame contribute delete

14 kB

	import re
	import io
	import zipfile
	from pathlib import Path
	from typing import Tuple, Any, Optional, List

	import os
	import time

	import gradio as gr
	from docx import Document
	from docx.oxml import OxmlElement
	from docx.oxml.ns import qn
	from huggingface_hub import InferenceClient

	# ======================================================
	# 1) HUGGING FACE INFERENCE API (EN -> TR ÇEVİRİ) - BATCH
	# ======================================================

	HF_MODEL = "Helsinki-NLP/opus-mt-tc-big-en-tr"

	# Access token, configured under Space -> Settings -> Variables and secrets -> HF_TOKEN.
	HF_TOKEN = os.environ.get("HF_TOKEN")

	# Pass the token explicitly when one is configured (non-empty); otherwise
	# build the client without an explicit token.
	client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else InferenceClient()


	def _extract_translation_text(result: Any) -> str:
	"""
	InferenceClient.translation dönüş tipini normalize et:
	- str
	- obj.translation_text
	- {"translation_text": "..."}
	- [{"translation_text": "..."}]
	"""
	if isinstance(result, str):
	return result

	if hasattr(result, "translation_text"):
	try:
	return result.translation_text # type: ignore[attr-defined]
	except Exception:
	pass

	if isinstance(result, dict) and "translation_text" in result:
	return str(result["translation_text"])

	if isinstance(result, list) and result:
	item = result[0]
	if isinstance(item, str):
	return item
	if isinstance(item, dict) and "translation_text" in item:
	return str(item["translation_text"])
	if hasattr(item, "translation_text"):
	try:
	return item.translation_text # type: ignore[attr-defined]
	except Exception:
	pass

	return str(result)


	def _translate_batch_en_tr(
	texts: List[str],
	max_batch_size: int = 200,
	max_retries: int = 2,
	base_sleep: float = 2.0,
	) -> List[str]:
	"""
	Çoklu TEXT listesi alır, en az istekle EN->TR çevirir.
	- texts: orijinal metin listesi
	- return: aynı uzunlukta, çevrilmiş (veya hata durumunda orijinal) metin listesi
	"""
	if not texts:
	return texts

	result_texts: List[str] = list(texts)
	# Çok düşük olasılıkla metin içinde geçebilecek, "garip" bir ayracı seçiyoruz
	SEP = "\n[[BLOCK-SEPARATOR-6b8b4567-ICETEA]]\n"

	n = len(texts)
	for start_idx in range(0, n, max_batch_size):
	end_idx = min(start_idx + max_batch_size, n)
	batch_indices = list(range(start_idx, end_idx))
	batch_texts = [texts[i] for i in batch_indices]

	# Tamamen boş batch ise atla
	if not any(t.strip() for t in batch_texts):
	continue

	joined = SEP.join(batch_texts)
	translated_joined: Optional[str] = None

	for attempt in range(max_retries + 1):
	try:
	resp = client.translation(joined, model=HF_MODEL)
	translated_joined = _extract_translation_text(resp)
	break
	except Exception as e:
	print("HF translation error (batch):", repr(e))
	if attempt < max_retries:
	time.sleep(base_sleep * (attempt + 1))
	else:
	translated_joined = None

	# Çeviri tamamen patladıysa: bu batch orijinal kalsın
	if translated_joined is None:
	continue

	parts = translated_joined.split(SEP)
	# Ayracı model bozduysa / sayılar tutmazsa -> batch orijinal kalsın
	if len(parts) != len(batch_texts):
	print(
	"HF translation: mismatch between batch size and split parts, "
	"keeping original texts for this batch."
	)
	continue

	# Başarılı: result_texts içine yaz
	for i, part in zip(batch_indices, parts):
	result_texts[i] = part

	return result_texts


	# ======================================================
	# 2) SRT PARSER + ENCODING AUTO-DETECT
	# ======================================================

	def read_srt_text(path: Path) -> str:
	    """Read an SRT file as bytes and decode it with the best-fitting encoding.

	    Candidate encodings, tried in this order:
	    - utf-8-sig
	    - utf-8
	    - cp1254 (Windows-1254, Turkish)
	    - iso-8859-9
	    - latin-1

	    Each candidate is decoded with ``errors="replace"`` and scored by the
	    number of replacement characters (weighted x10) plus the number of
	    control characters other than newline, carriage return and tab. The
	    lowest score wins; on a tie the earlier encoding is kept. The chosen
	    encoding and score are printed.
	    """
	    data = path.read_bytes()

	    # (score, encoding, decoded text) of the best candidate seen so far.
	    winner = None
	    for candidate in ("utf-8-sig", "utf-8", "cp1254", "iso-8859-9", "latin-1"):
	        try:
	            decoded = data.decode(candidate, errors="replace")
	        except LookupError:
	            # Codec not available in this interpreter.
	            continue

	        replaced = decoded.count("�")
	        controls = sum(
	            ord(ch) < 32 and ch not in "\n\r\t"
	            for ch in decoded
	        )
	        penalty = replaced * 10 + controls

	        # Strict comparison keeps the earlier encoding on ties.
	        if winner is None or penalty < winner[0]:
	            winner = (penalty, candidate, decoded)

	    best_score, best_enc, best_txt = winner if winner is not None else (None, None, None)

	    print(f"[SRT ENCODING] {path.name}: {best_enc} (score={best_score})")
	    if best_txt is None:
	        return data.decode("utf-8", errors="replace")
	    return best_txt


	def parse_srt(path: Path) -> List[dict]:
	    """Parse an SRT file into a list of cue dictionaries.

	    The file is decoded by ``read_srt_text``. Cues are separated by blank
	    lines. A cue may start with a numeric index line; when the first line is
	    not an integer, it is treated as the timing line instead.

	    Args:
	        path: Location of the ``.srt`` file.

	    Returns:
	        One dict per recognised cue, in file order, with the keys:
	        ``index`` (int, or None when the cue has no index line),
	        ``start`` and ``end`` (``'HH:MM:SS,mmm'`` strings) and
	        ``text`` (remaining lines joined with ``"\\n"``).
	        Cues with fewer than two non-blank lines, or whose timing line does
	        not start with a valid time range, are skipped.
	    """
	    raw = read_srt_text(path).strip()
	    # One or more blank (or whitespace-only) lines separate cues.
	    blocks = re.split(r"\n\s*\n", raw)

	    # Accept any amount of whitespace (including none) around the arrow so
	    # that cues written as "00:00:01,000-->00:00:02,000" or with double
	    # spaces are not silently dropped.
	    time_re = re.compile(
	        r"(?P<start>\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*"
	        r"(?P<end>\d{2}:\d{2}:\d{2},\d{3})"
	    )

	    subs: List[dict] = []
	    for block in blocks:
	        lines = [ln.strip() for ln in block.splitlines() if ln.strip()]
	        if len(lines) < 2:
	            continue

	        # Classic cue layout:
	        #   1
	        #   00:00:13,555 --> 00:00:17,559
	        #   DR. GREENE: ...
	        try:
	            idx = int(lines[0])
	        except ValueError:
	            # No index line: the cue starts directly with the timing line.
	            idx = None
	            time_line = lines[0]
	            text_lines = lines[1:]
	        else:
	            time_line = lines[1]
	            text_lines = lines[2:]

	        m = time_re.match(time_line)
	        if not m:
	            continue

	        subs.append(
	            {
	                "index": idx,
	                "start": m.group("start"),
	                "end": m.group("end"),
	                "text": "\n".join(text_lines),
	            }
	        )

	    return subs


	# ======================================================
	# 3) KARAKTER ÇIKARMA + TEXT TEMİZLEME (TR-SAFE HEURISTIC)
	# ======================================================

	# Unicode-aware "name word":
	# - [^\W\d_] matches any single Unicode letter (A-Z, a-z, Ç, Ğ, İ, Ö, Ş, Ü, ...).
	# - After the first letter, further letters, dots, apostrophes and hyphens
	#   are allowed, so "DR." and "O'NEIL" count as name words. (A negated
	#   class cannot add these characters, hence the explicit alternation.)
	name_word = r"[^\W\d_](?:[^\W\d_]|[.'-])*"

	# Speaker prefix at the start of a subtitle line:
	#   optional leading whitespace, optional ">" / ">>" / ">>>" marker,
	#   optional dash(es), one to five name words, then ":" and the spoken text.
	# All whitespace is optional ("\s*"): callers pass stripped lines and
	# speaker tags are normally written without a space before the colon.
	speaker_pattern = re.compile(
	    rf"^\s*(?:>{{1,3}}\s*)?(?:-+\s*)?"
	    rf"(?P<name>(?:{name_word}(?:\s+{name_word}){{0,4}}))"
	    rf"\s*:\s*(?P<after>.*)$",
	    flags=re.UNICODE,
	)


	def looks_like_speaker_name(name: str) -> bool:
	    """Tell whether ``name`` is written mostly in capitals.

	    Only alphabetic characters are considered; the name qualifies as a
	    speaker tag when at least 80% of them are uppercase.

	    Examples:
	        "DR. GREENE"    -> True
	        "HEMSİRE SELMA" -> True
	        "Doktor"        -> False
	        "Merhaba"       -> False
	    """
	    alpha_total = 0
	    alpha_upper = 0
	    for ch in name:
	        if ch.isalpha():
	            alpha_total += 1
	            alpha_upper += ch.isupper()

	    # No letters at all can never be a speaker tag.
	    return alpha_total > 0 and alpha_upper / alpha_total >= 0.8


	def extract_character_and_clean_text(block: str):
	    """Split a subtitle block into a speaker name and the spoken text.

	    Each non-blank line is stripped and matched against ``speaker_pattern``.
	    When the matched name passes ``looks_like_speaker_name`` the prefix is
	    removed and only the text after the colon (if any) is kept; the first
	    such name becomes the character. All other lines are kept unchanged.

	    Returns:
	        ``(character, text)``. ``character`` is ``""`` when no speaker tag
	        was found; ``text`` is the kept lines joined with ``"\\n"``. A falsy
	        ``block`` yields ``("", "")``.
	    """
	    if not block:
	        return "", ""

	    speaker = ""
	    kept = []

	    for raw_line in block.splitlines():
	        stripped = raw_line.strip()
	        if not stripped:
	            continue

	        found = speaker_pattern.match(stripped)
	        tag = found.group("name").strip() if found else ""

	        # Not a speaker tag: keep the line exactly as it is.
	        if not found or not looks_like_speaker_name(tag):
	            kept.append(stripped)
	            continue

	        # Speaker tag: remember the first name and keep only what follows it.
	        if not speaker:
	            speaker = tag
	        spoken = found.group("after").rstrip()
	        if spoken:
	            kept.append(spoken)

	    return speaker, "\n".join(ln for ln in kept if ln.strip())


	def start_time_to_mm_ss(start: str) -> str:
	    """Convert an SRT timestamp ``'HH:MM:SS,mmm'`` to ``'MM.SS'``.

	    Hours are folded into the minute count and the millisecond part is
	    discarded, e.g. ``'01:02:03,000'`` becomes ``'62.03'``.
	    """
	    clock = start.split(",")[0]
	    hours, minutes, secs = tuple(int(piece) for piece in clock.split(":"))
	    whole_minutes, leftover_seconds = divmod(hours * 3600 + minutes * 60 + secs, 60)
	    return f"{whole_minutes:02d}.{leftover_seconds:02d}"


	# ======================================================
	# 4) DOCX OLUŞTURMA
	# ======================================================

	def style_header_cell(cell, text: str):
	    """Format a table header cell: bold label on a light grey background.

	    Existing runs in the cell's first paragraph are blanked, ``text`` is
	    added as a new bold run, and the cell's ``w:shd`` element (created when
	    missing) gets the fill colour ``D9D9D9``.
	    """
	    first_paragraph = cell.paragraphs[0]
	    for existing_run in first_paragraph.runs:
	        existing_run.text = ""
	    first_paragraph.add_run(text).bold = True

	    # Cell shading lives in the cell properties (tcPr) element.
	    cell_props = cell._tc.get_or_add_tcPr()
	    shading = cell_props.find(qn("w:shd"))
	    if shading is None:
	        shading = OxmlElement("w:shd")
	        cell_props.append(shading)
	    shading.set(qn("w:fill"), "D9D9D9")  # light grey


	# Characters that XML 1.0 text cannot contain. lxml (which python-docx uses
	# to build the document) rejects strings holding NULL bytes or control
	# characters with ValueError, so they must be removed before writing a cell.
	_XML_ILLEGAL_CHARS = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\ufffe\uffff]")


	def _xml_safe(value: str) -> str:
	    """Return ``value`` without characters that are illegal in XML 1.0 text."""
	    return _XML_ILLEGAL_CHARS.sub("", value)


	def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str]:
	    """Convert one SRT file into a styled DOCX table.

	    The document holds a single "Table Grid" table with the columns
	    Character / TC / note / TEXT and one row per subtitle that still has
	    text after speaker tags are removed.

	    Args:
	        srt_path: Location of the ``.srt`` file.
	        translate_to_tr: When true, the TEXT column is passed through
	            ``_translate_batch_en_tr``; when false the translation function
	            is never called. The Character column is never translated.

	    Returns:
	        ``(docx_bytes, filename)`` where ``filename`` is the SRT file name
	        with its suffix replaced by ``.docx``.
	    """
	    subs = parse_srt(srt_path)
	    doc = Document()

	    # TABLE: Character | TC | note | TEXT
	    table = doc.add_table(rows=1, cols=4)
	    table.style = "Table Grid"

	    hdr_cells = table.rows[0].cells
	    headers = ["Character", "TC", "note", "TEXT"]
	    for idx, label in enumerate(headers):
	        style_header_cell(hdr_cells[idx], label)

	    # Collect every row first so the texts can be translated in bulk.
	    characters: List[str] = []
	    tcs: List[str] = []
	    texts: List[str] = []

	    for sub in subs:
	        raw_text = sub["text"]
	        if not raw_text.strip():
	            continue

	        character, clean_txt = extract_character_and_clean_text(raw_text)
	        if not clean_txt.strip():
	            continue

	        characters.append(character)
	        tcs.append(start_time_to_mm_ss(sub["start"]))
	        texts.append(clean_txt)

	    # Checkbox not ticked: no translation, hence no API call at all.
	    if translate_to_tr:
	        texts = _translate_batch_en_tr(texts)

	    # Fill the table. Text is sanitised at write time because stray control
	    # characters from a badly encoded SRT would otherwise abort the whole
	    # conversion.
	    for character, tc, text in zip(characters, tcs, texts):
	        cells = table.add_row().cells

	        cells[0].text = _xml_safe(character)  # Character (never translated)
	        cells[1].text = tc  # TC (MM.SS)
	        cells[2].text = ""  # note
	        cells[3].text = _xml_safe(text)  # TEXT (translated or original)

	    buffer = io.BytesIO()
	    doc.save(buffer)

	    out_name = srt_path.with_suffix(".docx").name
	    return buffer.getvalue(), out_name


	# ======================================================
	# 5) GRADIO: MULTI SRT -> ZIP(DOCX)
	# ======================================================

	def process_srt_files(files, translate_to_tr: bool):
	    """Convert the uploaded SRT files to DOCX and bundle them in one ZIP.

	    Args:
	        files: Paths of the uploaded ``.srt`` files (may be None or empty).
	        translate_to_tr: Forwarded to ``srt_to_docx_bytes``; when false no
	            translation is performed.

	    Returns:
	        Path (str) of a ZIP file named ``converted_subtitles.zip`` holding
	        one DOCX per input, or None when no files were given.
	    """
	    import tempfile  # local import: only this function needs it

	    if not files:
	        return None

	    # Build the archive in memory first so a failing conversion leaves no
	    # partial file behind.
	    zip_buffer = io.BytesIO()
	    taken = set()
	    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
	        for path in map(Path, files):
	            doc_bytes, doc_name = srt_to_docx_bytes(path, bool(translate_to_tr))

	            # Uploads from different folders can share a file name; give
	            # later ones a numeric suffix so no document is overwritten
	            # when the archive is extracted. Compared case-insensitively
	            # for the sake of case-insensitive file systems.
	            entry_name = doc_name
	            stem, suffix = Path(doc_name).stem, Path(doc_name).suffix
	            serial = 2
	            while entry_name.lower() in taken:
	                entry_name = f"{stem}_{serial}{suffix}"
	                serial += 1
	            taken.add(entry_name.lower())

	            zf.writestr(entry_name, doc_bytes)

	    # A fresh directory per call keeps overlapping requests from
	    # overwriting each other's archive while preserving the download name.
	    out_zip_path = Path(tempfile.mkdtemp(prefix="srtconvert_")) / "converted_subtitles.zip"
	    out_zip_path.write_bytes(zip_buffer.getvalue())

	    return str(out_zip_path)


	# ======================================================
	# 6) GRADIO UI
	# ======================================================

	# Gradio interface: upload .srt files, optionally translate, download a ZIP.
	# NOTE(review): indentation is not preserved in this copy of the file, so
	# which widgets sit inside `gr.Row()` cannot be determined from here --
	# confirm against the original layout before re-indenting.
	with gr.Blocks() as demo:
	# Usage notes rendered at the top of the page (text is in Turkish).
	gr.Markdown(
	"""
	# SRT → DOCX (Character / TC / TEXT) + EN→TR (HF Inference + Token)

	- Bir veya birden fazla .srt yükle.
	- Encoding otomatik tespit edilir (UTF-8, Windows-1254, ISO-8859-9, Latin-1).
	- Her subtitle bloğu için:
	- Character:
	- `WOMAN:`, `DR. GREENE:`, `HEMSİRE SELMA:` gibi büyük harf ağırlıklı isimler → Character.
	- Normal Türkçe cümleler -> Character boş, TEXT olduğu gibi.
	- TC: başlangıç zamanı MM.SS.
	- TEXT: gövde metin, gerçek speaker tag'leri temizlenmiş.
	- Translate TEXT işaretliyse, sadece TEXT alanı `Helsinki-NLP/opus-mt-tc-big-en-tr` ile EN→TR çevrilir
	(Character asla çevrilmez).
	- Çıktı: Tüm DOCX'leri içeren tek bir ZIP dosya.
	"""
	)

	with gr.Row():
	# Multi-file upload restricted to .srt; the callback receives file paths.
	srt_files = gr.File(
	label="Upload .srt files",
	file_types=[".srt"],
	file_count="multiple",
	type="filepath",
	)

	# Translation is opt-in: unchecked by default.
	translate_chk = gr.Checkbox(
	label="Translate TEXT (EN → TR, only TEXT, not Character)",
	value=False,
	)

	# Receives the ZIP path returned by process_srt_files.
	out_zip = gr.File(label="Download ZIP of DOCX files")

	convert_btn = gr.Button("Convert")

	# Run the conversion when the button is clicked.
	convert_btn.click(
	fn=process_srt_files,
	inputs=[srt_files, translate_chk],
	outputs=out_zip,
	)

	# Start the app only when the file is executed as a script.
	if __name__ == "__main__":
	demo.launch()