armyneo commited on
Commit
18c13e3
·
verified ·
1 Parent(s): 3023c6b

new model addition

Browse files
Files changed (1) hide show
  1. app.py +73 -108
app.py CHANGED
@@ -3,126 +3,98 @@ import re
3
  import io
4
  import zipfile
5
  from pathlib import Path
6
- from typing import Tuple, List
7
 
8
- import requests
9
  import gradio as gr
10
  from docx import Document
11
  from docx.oxml import OxmlElement
12
  from docx.oxml.ns import qn
 
13
 
 
 
 
14
 
15
- # ----------------------------------------------------
16
- # 1) HUGGING FACE INFERENCE API (EN -> TR)
17
- # ----------------------------------------------------
18
-
19
- HF_TOKEN = os.getenv("HF_TOKEN")
20
  if not HF_TOKEN:
 
21
  raise RuntimeError(
22
  "HF_TOKEN environment variable is not set. "
23
- "Add it in Space Settings → Variables and secrets."
24
  )
25
 
26
- # Küçük EN→TR modeli
27
- MODEL_ID = "Helsinki-NLP/opus-mt-en-tr"
28
- API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
29
- HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
30
 
31
- MAX_BATCH_SIZE = 16 # satırları parça parça yollayalım
 
 
 
32
 
33
 
34
- def _hf_translate_batch(lines: List[str]) -> List[str]:
35
  """
36
- HF Inference API'ye tek batch istek.
37
- lines: boş olmayan EN string listesi.
38
- return: TR string listesi (aynı uzunlukta).
 
 
39
  """
40
- if not lines:
41
- return []
42
-
43
- payload = {"inputs": lines}
44
- resp = requests.post(API_URL, headers=HEADERS, json=payload, timeout=120)
45
- resp.raise_for_status()
46
- data = resp.json()
47
-
48
- out: List[str] = []
49
-
50
- # Çıkan JSON bazen:
51
- # - [[{"translation_text": "..."}], ...]
52
- # - [{"translation_text": "..."}, ...]
53
- # - [{"generated_text": "..."}, ...]
54
- for item in data:
55
- obj = item
56
- if isinstance(item, list) and item:
57
- obj = item[0]
58
-
59
- if isinstance(obj, dict):
60
- if "translation_text" in obj:
61
- out.append(obj["translation_text"])
62
- elif "generated_text" in obj:
63
- out.append(obj["generated_text"])
64
- else:
65
- out.append("")
66
- else:
67
- out.append(str(obj))
68
 
69
- # Güvenlik için uzunluk eşitle
70
- if len(out) < len(lines):
71
- out.extend([""] * (len(lines) - len(out)))
72
- elif len(out) > len(lines):
73
- out = out[: len(lines)]
74
 
75
- return out
 
 
 
 
 
 
 
 
76
 
77
 
78
def translate_en_tr(text: str) -> str:
    """Translate English text to Turkish while preserving line structure.

    Splits the text into lines, sends only the non-blank ones to the
    Inference API in chunks of ``MAX_BATCH_SIZE``, then re-inserts the
    translations at their original positions. On any translation error
    the original text is returned unchanged.
    """
    text = (text or "").strip()
    if not text:
        return text

    lines = text.splitlines()
    nonblank = [(pos, ln) for pos, ln in enumerate(lines) if ln.strip()]
    if not nonblank:
        return text

    positions = [pos for pos, _ in nonblank]
    sources = [ln for _, ln in nonblank]

    results: List[str] = []
    try:
        # Ship the work in MAX_BATCH_SIZE chunks.
        for offset in range(0, len(sources), MAX_BATCH_SIZE):
            results.extend(
                _hf_translate_batch(sources[offset : offset + MAX_BATCH_SIZE])
            )
    except Exception as exc:
        # If translation blows up, fall back to the untouched original.
        print("HF translation error:", repr(exc))
        return text

    # Defensive alignment: pad with "" or trim to the request count.
    results = (results + [""] * len(sources))[: len(sources)]

    merged = list(lines)
    for target, translated in zip(positions, results):
        merged[target] = translated

    return "\n".join(merged)
121
 
122
 
123
- # ----------------------------------------------------
124
  # 2) SRT PARSER
125
- # ----------------------------------------------------
126
 
127
  def parse_srt(path: Path):
128
  """
@@ -175,15 +147,10 @@ def parse_srt(path: Path):
175
  return subs
176
 
177
 
178
- # ----------------------------------------------------
179
  # 3) KARAKTER ÇIKARMA + TEXT TEMİZLEME
180
- # ----------------------------------------------------
181
 
182
- # Örnek eşleşmeler:
183
- # WOMAN: ...
184
- # DR. LEWIS: ...
185
- # >>> NURSE: ...
186
- # -NURSE: ...
187
  speaker_pattern = re.compile(
188
  r'^\s*(?:>{1,3}\s*)?(?:-+\s*)?'
189
  r'(?P<name>(?:[A-Z][A-Z0-9.\']+(?:\s+[A-Z][A-Z0-9.\']+){0,4}))'
@@ -202,7 +169,7 @@ def extract_character_and_clean_text(block: str):
202
 
203
  lines = block.splitlines()
204
  character = ""
205
- out_lines: List[str] = []
206
 
207
  for line in lines:
208
  original = line.strip()
@@ -218,7 +185,6 @@ def extract_character_and_clean_text(block: str):
218
  if after:
219
  out_lines.append(after)
220
  else:
221
- # NAME: ile başlamayan satırlar olduğu gibi kalsın
222
  out_lines.append(original)
223
 
224
  out_lines = [ln for ln in out_lines if ln.strip()]
@@ -228,7 +194,6 @@ def extract_character_and_clean_text(block: str):
228
  def start_time_to_mm_ss(start: str) -> str:
229
  """
230
  'HH:MM:SS,mmm' -> 'MM.SS'
231
- (toplam dakika . saniye)
232
  """
233
  hms, *_ = start.split(",")
234
  h, m, s = [int(x) for x in hms.split(":")]
@@ -238,9 +203,9 @@ def start_time_to_mm_ss(start: str) -> str:
238
  return f"{total_minutes:02d}.{seconds:02d}"
239
 
240
 
241
- # ----------------------------------------------------
242
  # 4) DOCX OLUŞTURMA
243
- # ----------------------------------------------------
244
 
245
  def style_header_cell(cell, text: str):
246
  """
@@ -289,10 +254,10 @@ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str
289
  row = table.add_row()
290
  cells = row.cells
291
 
292
- # Character -> ASLA çevirmiyoruz
293
  cells[0].text = character
294
 
295
- # TC -> MM.SS (start time)
296
  cells[1].text = start_time_to_mm_ss(sub["start"])
297
 
298
  # note -> boş
@@ -312,9 +277,9 @@ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str
312
  return buffer.getvalue(), out_name
313
 
314
 
315
- # ----------------------------------------------------
316
- # 5) GRADIO ÇAĞRI FONKSİYONU (MULTI SRT -> ZIP)
317
- # ----------------------------------------------------
318
 
319
  def process_srt_files(files, translate_to_tr: bool):
320
  """
@@ -339,21 +304,21 @@ def process_srt_files(files, translate_to_tr: bool):
339
  return out_zip_path
340
 
341
 
342
- # ----------------------------------------------------
343
  # 6) GRADIO UI
344
- # ----------------------------------------------------
345
 
346
  with gr.Blocks() as demo:
347
  gr.Markdown(
348
  """
349
- # SRT → DOCX (Character / TC / TEXT) + EN→TR (HF Inference API)
350
 
351
  - Bir veya birden fazla **.srt** yükle.
352
  - Her satır için:
353
  - **Character**: `WOMAN:`, `LEWIS:`, `NURSE:` gibi isimler çıkarılır (**çeviri yok**).
354
- - **TC**: sadece **MM.SS** (start time'dan).
355
  - **TEXT**: `NAME:` prefix'leri atılmış metin.
356
- - İstersen TEXT'i **Hugging Face Inference API** ile EN→TR çevir.
357
  - Çıktı: Tüm DOCX'leri içeren tek bir **ZIP**.
358
  """
359
  )
@@ -367,7 +332,7 @@ with gr.Blocks() as demo:
367
  )
368
 
369
  translate_chk = gr.Checkbox(
370
- label="Translate TEXT (EN → TR, via HF Inference API)",
371
  value=False,
372
  )
373
 
 
3
  import io
4
  import zipfile
5
  from pathlib import Path
6
+ from typing import Tuple
7
 
 
8
  import gradio as gr
9
  from docx import Document
10
  from docx.oxml import OxmlElement
11
  from docx.oxml.ns import qn
12
+ from huggingface_hub import InferenceClient
13
 
14
+ # ======================================================
15
+ # 1) HF INFERENCE API: EN -> TR ÇEVİRİ
16
+ # ======================================================
17
 
18
+ HF_TOKEN = os.environ.get("HF_TOKEN")
 
 
 
 
19
  if not HF_TOKEN:
20
+ # Space içinde: Settings → Variables and secrets → New variable → Name=HF_TOKEN, Value=<token>
21
  raise RuntimeError(
22
  "HF_TOKEN environment variable is not set. "
23
+ "Go to your Space Settings → Variables and secrets and add HF_TOKEN."
24
  )
25
 
26
+ # EN→TR modeli
27
+ HF_MODEL = "Helsinki-NLP/opus-mt-tc-big-en-tr"
 
 
28
 
29
+ client = InferenceClient(
30
+ provider="hf-inference", # yeni router
31
+ api_key=HF_TOKEN,
32
+ )
33
 
34
 
35
+ def _extract_translation_text(result) -> str:
36
  """
37
+ InferenceClient dönüş tipini güvenli çıkar:
38
+ - str
39
+ - {"translation_text": "..."}
40
+ - [{"translation_text": "..."}]
41
+ vb.
42
  """
43
+ if isinstance(result, str):
44
+ return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
+ if isinstance(result, dict) and "translation_text" in result:
47
+ return result["translation_text"]
 
 
 
48
 
49
+ if isinstance(result, list) and result:
50
+ item = result[0]
51
+ if isinstance(item, str):
52
+ return item
53
+ if isinstance(item, dict) and "translation_text" in item:
54
+ return item["translation_text"]
55
+
56
+ # son çare: string'e dök
57
+ return str(result)
58
 
59
 
60
def translate_en_tr(text: str) -> str:
    """Translate English text to Turkish via the HF Inference API.

    The text is sent line by line so the original line structure is
    preserved. Blank lines pass through as empty lines, and any line
    whose translation fails is kept in its original English form.
    """
    # Guard against None as well as empty strings — callers may pass
    # either, and plain text.strip() would raise AttributeError on None.
    text = (text or "").strip()
    if not text:
        return text

    lines = text.splitlines()
    out_lines = []

    for line in lines:
        if not line.strip():
            out_lines.append("")
            continue

        try:
            # Call shape per the InferenceClient translation docs.
            result = client.translation(
                text=line,
                model=HF_MODEL,
            )
            translated = _extract_translation_text(result)
        except Exception as e:
            print("HF translation error:", repr(e))
            # Fallback: keep the original English line.
            translated = line

        out_lines.append(translated)

    return "\n".join(out_lines)
93
 
94
 
95
+ # ======================================================
96
  # 2) SRT PARSER
97
+ # ======================================================
98
 
99
  def parse_srt(path: Path):
100
  """
 
147
  return subs
148
 
149
 
150
+ # ======================================================
151
  # 3) KARAKTER ÇIKARMA + TEXT TEMİZLEME
152
+ # ======================================================
153
 
 
 
 
 
 
154
  speaker_pattern = re.compile(
155
  r'^\s*(?:>{1,3}\s*)?(?:-+\s*)?'
156
  r'(?P<name>(?:[A-Z][A-Z0-9.\']+(?:\s+[A-Z][A-Z0-9.\']+){0,4}))'
 
169
 
170
  lines = block.splitlines()
171
  character = ""
172
+ out_lines = []
173
 
174
  for line in lines:
175
  original = line.strip()
 
185
  if after:
186
  out_lines.append(after)
187
  else:
 
188
  out_lines.append(original)
189
 
190
  out_lines = [ln for ln in out_lines if ln.strip()]
 
194
  def start_time_to_mm_ss(start: str) -> str:
195
  """
196
  'HH:MM:SS,mmm' -> 'MM.SS'
 
197
  """
198
  hms, *_ = start.split(",")
199
  h, m, s = [int(x) for x in hms.split(":")]
 
203
  return f"{total_minutes:02d}.{seconds:02d}"
204
 
205
 
206
+ # ======================================================
207
  # 4) DOCX OLUŞTURMA
208
+ # ======================================================
209
 
210
  def style_header_cell(cell, text: str):
211
  """
 
254
  row = table.add_row()
255
  cells = row.cells
256
 
257
+ # Character -> ÇEVİRME
258
  cells[0].text = character
259
 
260
+ # TC -> MM.SS
261
  cells[1].text = start_time_to_mm_ss(sub["start"])
262
 
263
  # note -> boş
 
277
  return buffer.getvalue(), out_name
278
 
279
 
280
+ # ======================================================
281
+ # 5) GRADIO: ÇOKLU SRT -> ZIP(DOCX)
282
+ # ======================================================
283
 
284
  def process_srt_files(files, translate_to_tr: bool):
285
  """
 
304
  return out_zip_path
305
 
306
 
307
+ # ======================================================
308
  # 6) GRADIO UI
309
+ # ======================================================
310
 
311
  with gr.Blocks() as demo:
312
  gr.Markdown(
313
  """
314
+ # SRT → DOCX (Character / TC / TEXT) + EN→TR Çeviri (HF Inference)
315
 
316
  - Bir veya birden fazla **.srt** yükle.
317
  - Her satır için:
318
  - **Character**: `WOMAN:`, `LEWIS:`, `NURSE:` gibi isimler çıkarılır (**çeviri yok**).
319
+ - **TC**: sadece **MM.SS** (start time).
320
  - **TEXT**: `NAME:` prefix'leri atılmış metin.
321
+ - İstersen TEXT'i **Helsinki-NLP/opus-mt-tc-big-en-tr** ile Türkçe'ye çevir.
322
  - Çıktı: Tüm DOCX'leri içeren tek bir **ZIP**.
323
  """
324
  )
 
332
  )
333
 
334
  translate_chk = gr.Checkbox(
335
+ label="Translate TEXT (EN → TR, only TEXT, not Character)",
336
  value=False,
337
  )
338