armyneo committed on
Commit
db31285
·
verified ·
1 Parent(s): be8f198

update to checkbox

Browse files
Files changed (1) hide show
  1. app.py +86 -46
app.py CHANGED
@@ -2,7 +2,10 @@ import re
2
  import io
3
  import zipfile
4
  from pathlib import Path
5
- from typing import Tuple, Any, Optional
 
 
 
6
 
7
  import gradio as gr
8
  from docx import Document
@@ -11,22 +14,19 @@ from docx.oxml.ns import qn
11
  from huggingface_hub import InferenceClient
12
 
13
  # ======================================================
14
- # 1) HUGGING FACE INFERENCE API (EN -> TR ÇEVİRİ)
15
  # ======================================================
16
 
17
  HF_MODEL = "Helsinki-NLP/opus-mt-tc-big-en-tr"
18
 
19
- # HF token (Space → Settings → Variables and secrets → HF_TOKEN = hf_...)
20
- import os
21
-
22
  HF_TOKEN = os.environ.get("HF_TOKEN")
23
 
 
24
  if HF_TOKEN:
25
- # Token varsa: daha yüksek limit, özel modellere erişim vs.
26
- client = InferenceClient(model=HF_MODEL, token=HF_TOKEN)
27
  else:
28
- # Token yoksa: public, düşük limit ama çalışır
29
- client = InferenceClient(model=HF_MODEL)
30
 
31
 
32
  def _extract_translation_text(result: Any) -> str:
@@ -64,35 +64,67 @@ def _extract_translation_text(result: Any) -> str:
64
  return str(result)
65
 
66
 
67
- def translate_en_tr(text: str) -> str:
 
 
 
 
 
68
  """
69
- EN -> TR çeviri (HF Inference API).
70
- Satır satır gönderiyoruz, satır yapısı korunuyor.
71
- Hata olursa orijinal metni döndürür (app crash etmez).
72
  """
73
- text = text.strip()
74
- if not text:
75
- return text
76
 
77
- lines = text.splitlines()
78
- out_lines = []
 
79
 
80
- for line in lines:
81
- if not line.strip():
82
- out_lines.append("")
 
 
 
 
 
83
  continue
84
 
85
- try:
86
- # client zaten model=HF_MODEL ile bağlı
87
- result = client.translation(line)
88
- translated = _extract_translation_text(result)
89
- except Exception as e:
90
- print("HF translation error:", repr(e))
91
- translated = line # fallback
92
 
93
- out_lines.append(translated)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- return "\n".join(out_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
 
98
  # ======================================================
@@ -202,7 +234,7 @@ def parse_srt(path: Path):
202
  name_word = r"[^\W\d_][^\W\d_.'-]*"
203
 
204
  speaker_pattern = re.compile(
205
- rf'^\s*(?:>{1,3}\s*)?(?:-+\s*)?'
206
  rf'(?P<name>(?:{name_word}(?:\s+{name_word}){{0,4}}))'
207
  rf'\s*:\s*(?P<after>.*)$',
208
  flags=re.UNICODE,
@@ -306,6 +338,7 @@ def style_header_cell(cell, text: str):
306
  def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str]:
307
  """
308
  Tek SRT -> styled DOCX (bytes, filename)
 
309
  """
310
  subs = parse_srt(srt_path)
311
  doc = Document()
@@ -319,6 +352,11 @@ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str
319
  for idx, label in enumerate(headers):
320
  style_header_cell(hdr_cells[idx], label)
321
 
 
 
 
 
 
322
  for sub in subs:
323
  raw_text = sub["text"]
324
  if not raw_text.strip():
@@ -328,23 +366,24 @@ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str
328
  if not clean_txt.strip():
329
  continue
330
 
331
- row = table.add_row()
332
- cells = row.cells
333
-
334
- # Character (asla çevrilmez)
335
- cells[0].text = character
336
 
337
- # TC -> MM.SS
338
- cells[1].text = start_time_to_mm_ss(sub["start"])
 
 
339
 
340
- # note -> boş
341
- cells[2].text = ""
 
 
342
 
343
- # TEXT -> isteğe bağlı EN->TR
344
- if translate_to_tr:
345
- cells[3].text = translate_en_tr(clean_txt)
346
- else:
347
- cells[3].text = clean_txt
348
 
349
  buffer = io.BytesIO()
350
  doc.save(buffer)
@@ -361,6 +400,7 @@ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str
361
  def process_srt_files(files, translate_to_tr: bool):
362
  """
363
  Çoklu SRT al, hepsini DOCX'e çevir, tek ZIP döndür.
 
364
  """
365
  if not files:
366
  return None
@@ -370,7 +410,7 @@ def process_srt_files(files, translate_to_tr: bool):
370
  zip_buffer = io.BytesIO()
371
  with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
372
  for path in paths:
373
- doc_bytes, doc_name = srt_to_docx_bytes(path, translate_to_tr)
374
  zf.writestr(doc_name, doc_bytes)
375
 
376
  zip_buffer.seek(0)
 
2
  import io
3
  import zipfile
4
  from pathlib import Path
5
+ from typing import Tuple, Any, Optional, List
6
+
7
+ import os
8
+ import time
9
 
10
  import gradio as gr
11
  from docx import Document
 
14
  from huggingface_hub import InferenceClient
15
 
16
  # ======================================================
17
+ # 1) HUGGING FACE INFERENCE API (EN -> TR ÇEVİRİ) - BATCH
18
  # ======================================================
19
 
20
  HF_MODEL = "Helsinki-NLP/opus-mt-tc-big-en-tr"
21
 
22
# The token is read from the HF_TOKEN environment variable
# (on a Space: Settings → Variables and secrets → HF_TOKEN).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Authenticate when a token is configured; otherwise use an anonymous client.
client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else InferenceClient()
 
30
 
31
 
32
  def _extract_translation_text(result: Any) -> str:
 
64
  return str(result)
65
 
66
 
67
+ def _translate_batch_en_tr(
68
+ texts: List[str],
69
+ max_batch_size: int = 200,
70
+ max_retries: int = 2,
71
+ base_sleep: float = 2.0,
72
+ ) -> List[str]:
73
  """
74
+ Çoklu TEXT listesi alır, en az istekle EN->TR çevirir.
75
+ - texts: orijinal metin listesi
76
+ - return: aynı uzunlukta, çevrilmiş (veya hata durumunda orijinal) metin listesi
77
  """
78
+ if not texts:
79
+ return texts
 
80
 
81
+ result_texts: List[str] = list(texts)
82
+ # Çok düşük olasılıkla metin içinde geçebilecek, "garip" bir ayracı seçiyoruz
83
+ SEP = "\n[[BLOCK-SEPARATOR-6b8b4567-ICETEA]]\n"
84
 
85
+ n = len(texts)
86
+ for start_idx in range(0, n, max_batch_size):
87
+ end_idx = min(start_idx + max_batch_size, n)
88
+ batch_indices = list(range(start_idx, end_idx))
89
+ batch_texts = [texts[i] for i in batch_indices]
90
+
91
+ # Tamamen boş batch ise atla
92
+ if not any(t.strip() for t in batch_texts):
93
  continue
94
 
95
+ joined = SEP.join(batch_texts)
96
+ translated_joined: Optional[str] = None
 
 
 
 
 
97
 
98
+ for attempt in range(max_retries + 1):
99
+ try:
100
+ resp = client.translation(joined, model=HF_MODEL)
101
+ translated_joined = _extract_translation_text(resp)
102
+ break
103
+ except Exception as e:
104
+ print("HF translation error (batch):", repr(e))
105
+ if attempt < max_retries:
106
+ time.sleep(base_sleep * (attempt + 1))
107
+ else:
108
+ translated_joined = None
109
+
110
+ # Çeviri tamamen patladıysa: bu batch orijinal kalsın
111
+ if translated_joined is None:
112
+ continue
113
 
114
+ parts = translated_joined.split(SEP)
115
+ # Ayracı model bozduysa / sayılar tutmazsa -> batch orijinal kalsın
116
+ if len(parts) != len(batch_texts):
117
+ print(
118
+ "HF translation: mismatch between batch size and split parts, "
119
+ "keeping original texts for this batch."
120
+ )
121
+ continue
122
+
123
+ # Başarılı: result_texts içine yaz
124
+ for i, part in zip(batch_indices, parts):
125
+ result_texts[i] = part
126
+
127
+ return result_texts
128
 
129
 
130
  # ======================================================
 
234
  name_word = r"[^\W\d_][^\W\d_.'-]*"
235
 
236
  speaker_pattern = re.compile(
237
+ rf'^\s*(?:>{{1,3}}\s*)?(?:-+\s*)?'
238
  rf'(?P<name>(?:{name_word}(?:\s+{name_word}){{0,4}}))'
239
  rf'\s*:\s*(?P<after>.*)$',
240
  flags=re.UNICODE,
 
338
  def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str]:
339
  """
340
  Tek SRT -> styled DOCX (bytes, filename)
341
+ translate_to_tr=False ise *hiçbir şekilde* HF API çağrılmaz.
342
  """
343
  subs = parse_srt(srt_path)
344
  doc = Document()
 
352
  for idx, label in enumerate(headers):
353
  style_header_cell(hdr_cells[idx], label)
354
 
355
+ # Önce tüm satırları topla, sonra gerekiyorsa toplu çeviri yap
356
+ characters: List[str] = []
357
+ tcs: List[str] = []
358
+ texts: List[str] = []
359
+
360
  for sub in subs:
361
  raw_text = sub["text"]
362
  if not raw_text.strip():
 
366
  if not clean_txt.strip():
367
  continue
368
 
369
+ characters.append(character)
370
+ tcs.append(start_time_to_mm_ss(sub["start"]))
371
+ texts.append(clean_txt)
 
 
372
 
373
+ # Kullanıcı checkbox'ı işaretlemediyse: hiç çeviri yok (HF API çağrısı YOK)
374
+ if bool(translate_to_tr):
375
+ texts = _translate_batch_en_tr(texts)
376
+ # else: texts olduğu gibi kalıyor
377
 
378
+ # Tabloya yaz
379
+ for character, tc, text in zip(characters, tcs, texts):
380
+ row = table.add_row()
381
+ cells = row.cells
382
 
383
+ cells[0].text = character # Character (asla çevrilmez)
384
+ cells[1].text = tc # TC (MM.SS)
385
+ cells[2].text = "" # note
386
+ cells[3].text = text # TEXT (çevirildiyse TR, değilse orijinal)
 
387
 
388
  buffer = io.BytesIO()
389
  doc.save(buffer)
 
400
  def process_srt_files(files, translate_to_tr: bool):
401
  """
402
  Çoklu SRT al, hepsini DOCX'e çevir, tek ZIP döndür.
403
+ translate_to_tr False ise HF API'ye hiç gitmez.
404
  """
405
  if not files:
406
  return None
 
410
  zip_buffer = io.BytesIO()
411
  with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
412
  for path in paths:
413
+ doc_bytes, doc_name = srt_to_docx_bytes(path, bool(translate_to_tr))
414
  zf.writestr(doc_name, doc_bytes)
415
 
416
  zip_buffer.seek(0)