Spaces:
Running
Running
translate inference
Browse files
app.py
CHANGED
|
@@ -1,57 +1,121 @@
|
|
|
|
|
| 1 |
import re
|
| 2 |
import io
|
| 3 |
import zipfile
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import Tuple, List
|
| 6 |
|
|
|
|
| 7 |
import gradio as gr
|
| 8 |
from docx import Document
|
| 9 |
from docx.oxml import OxmlElement
|
| 10 |
from docx.oxml.ns import qn
|
| 11 |
-
|
| 12 |
|
| 13 |
# ----------------------------------------------------
|
| 14 |
-
# 1)
|
| 15 |
# ----------------------------------------------------
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
def translate_en_tr(text: str) -> str:
|
| 29 |
"""
|
| 30 |
-
EN->TR çeviri.
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
| 33 |
"""
|
| 34 |
text = (text or "").strip()
|
| 35 |
if not text:
|
| 36 |
return text
|
| 37 |
|
| 38 |
lines = text.splitlines()
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
non_empty_idx: List[int] = [i for i, ln in enumerate(lines) if ln.strip()]
|
| 42 |
-
to_translate: List[str] = [lines[i] for i in non_empty_idx]
|
| 43 |
|
| 44 |
if not to_translate:
|
| 45 |
return text
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
# Çevirilen satırları eski yerlerine koy
|
| 52 |
out_lines = list(lines)
|
| 53 |
-
for j, idx in enumerate(
|
| 54 |
-
out_lines[idx] =
|
| 55 |
|
| 56 |
return "\n".join(out_lines)
|
| 57 |
|
|
@@ -138,7 +202,7 @@ def extract_character_and_clean_text(block: str):
|
|
| 138 |
|
| 139 |
lines = block.splitlines()
|
| 140 |
character = ""
|
| 141 |
-
out_lines = []
|
| 142 |
|
| 143 |
for line in lines:
|
| 144 |
original = line.strip()
|
|
@@ -255,12 +319,10 @@ def srt_to_docx_bytes(srt_path: Path, translate_to_tr: bool) -> Tuple[bytes, str
|
|
| 255 |
def process_srt_files(files, translate_to_tr: bool):
|
| 256 |
"""
|
| 257 |
Çoklu SRT al, hepsini DOCX'e çevir, tek ZIP döndür.
|
| 258 |
-
Gradio output için path döndürüyoruz.
|
| 259 |
"""
|
| 260 |
if not files:
|
| 261 |
return None
|
| 262 |
|
| 263 |
-
# Gr.File(type="filepath") -> string path listesi
|
| 264 |
paths = [Path(p) for p in files]
|
| 265 |
|
| 266 |
zip_buffer = io.BytesIO()
|
|
@@ -270,7 +332,6 @@ def process_srt_files(files, translate_to_tr: bool):
|
|
| 270 |
zf.writestr(doc_name, doc_bytes)
|
| 271 |
|
| 272 |
zip_buffer.seek(0)
|
| 273 |
-
|
| 274 |
out_zip_path = "converted_subtitles.zip"
|
| 275 |
with open(out_zip_path, "wb") as f:
|
| 276 |
f.write(zip_buffer.read())
|
|
@@ -285,14 +346,14 @@ def process_srt_files(files, translate_to_tr: bool):
|
|
| 285 |
with gr.Blocks() as demo:
|
| 286 |
gr.Markdown(
|
| 287 |
"""
|
| 288 |
-
# SRT → DOCX (Character / TC / TEXT) + EN→TR
|
| 289 |
|
| 290 |
- Bir veya birden fazla **.srt** yükle.
|
| 291 |
- Her satır için:
|
| 292 |
- **Character**: `WOMAN:`, `LEWIS:`, `NURSE:` gibi isimler çıkarılır (**çeviri yok**).
|
| 293 |
- **TC**: sadece **MM.SS** (start time'dan).
|
| 294 |
- **TEXT**: `NAME:` prefix'leri atılmış metin.
|
| 295 |
-
- İstersen TEXT'i **EN→TR
|
| 296 |
- Çıktı: Tüm DOCX'leri içeren tek bir **ZIP**.
|
| 297 |
"""
|
| 298 |
)
|
|
@@ -306,7 +367,7 @@ with gr.Blocks() as demo:
|
|
| 306 |
)
|
| 307 |
|
| 308 |
translate_chk = gr.Checkbox(
|
| 309 |
-
label="Translate TEXT (EN → TR,
|
| 310 |
value=False,
|
| 311 |
)
|
| 312 |
|
|
|
|
| 1 |
+
import os
|
| 2 |
import re
|
| 3 |
import io
|
| 4 |
import zipfile
|
| 5 |
from pathlib import Path
|
| 6 |
from typing import Tuple, List
|
| 7 |
|
| 8 |
+
import requests
|
| 9 |
import gradio as gr
|
| 10 |
from docx import Document
|
| 11 |
from docx.oxml import OxmlElement
|
| 12 |
from docx.oxml.ns import qn
|
| 13 |
+
|
| 14 |
|
| 15 |
# ----------------------------------------------------
|
| 16 |
+
# 1) HUGGING FACE INFERENCE API (EN -> TR)
|
| 17 |
# ----------------------------------------------------
|
| 18 |
|
| 19 |
+
# HF Inference API credentials are injected via the environment (Space
# secret); fail fast at import time so the app never starts half-configured.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError(
        "HF_TOKEN environment variable is not set. "
        "Add it in Space Settings → Variables and secrets."
    )

# Small EN→TR translation model served by the hosted Inference API.
MODEL_ID = "Helsinki-NLP/opus-mt-en-tr"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}

MAX_BATCH_SIZE = 16  # send lines to the API in chunks of this size
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _hf_translate_batch(lines: List[str]) -> List[str]:
|
| 35 |
+
"""
|
| 36 |
+
HF Inference API'ye tek batch istek.
|
| 37 |
+
lines: boş olmayan EN string listesi.
|
| 38 |
+
return: TR string listesi (aynı uzunlukta).
|
| 39 |
+
"""
|
| 40 |
+
if not lines:
|
| 41 |
+
return []
|
| 42 |
+
|
| 43 |
+
payload = {"inputs": lines}
|
| 44 |
+
resp = requests.post(API_URL, headers=HEADERS, json=payload, timeout=120)
|
| 45 |
+
resp.raise_for_status()
|
| 46 |
+
data = resp.json()
|
| 47 |
+
|
| 48 |
+
out: List[str] = []
|
| 49 |
+
|
| 50 |
+
# Çıkan JSON bazen:
|
| 51 |
+
# - [[{"translation_text": "..."}], ...]
|
| 52 |
+
# - [{"translation_text": "..."}, ...]
|
| 53 |
+
# - [{"generated_text": "..."}, ...]
|
| 54 |
+
for item in data:
|
| 55 |
+
obj = item
|
| 56 |
+
if isinstance(item, list) and item:
|
| 57 |
+
obj = item[0]
|
| 58 |
+
|
| 59 |
+
if isinstance(obj, dict):
|
| 60 |
+
if "translation_text" in obj:
|
| 61 |
+
out.append(obj["translation_text"])
|
| 62 |
+
elif "generated_text" in obj:
|
| 63 |
+
out.append(obj["generated_text"])
|
| 64 |
+
else:
|
| 65 |
+
out.append("")
|
| 66 |
+
else:
|
| 67 |
+
out.append(str(obj))
|
| 68 |
+
|
| 69 |
+
# Güvenlik için uzunluk eşitle
|
| 70 |
+
if len(out) < len(lines):
|
| 71 |
+
out.extend([""] * (len(lines) - len(out)))
|
| 72 |
+
elif len(out) > len(lines):
|
| 73 |
+
out = out[: len(lines)]
|
| 74 |
+
|
| 75 |
+
return out
|
| 76 |
|
| 77 |
|
| 78 |
def translate_en_tr(text: str) -> str:
    """Translate English text to Turkish, preserving its line structure.

    Splits the text into lines, ships the non-blank ones to the Inference
    API in MAX_BATCH_SIZE chunks, and splices the translations back into
    their original positions.  Blank lines pass through untouched.  On any
    API failure the original text is returned unchanged (best effort).
    """
    text = (text or "").strip()
    if not text:
        return text

    lines = text.splitlines()
    # Remember where the non-blank lines live so blanks survive the splice.
    positions = [i for i, line in enumerate(lines) if line.strip()]
    sources = [lines[i] for i in positions]

    if not sources:
        return text

    translations: List[str] = []
    try:
        # Chunked requests keep each API payload small.
        for offset in range(0, len(sources), MAX_BATCH_SIZE):
            translations.extend(
                _hf_translate_batch(sources[offset : offset + MAX_BATCH_SIZE])
            )
    except Exception as exc:
        # Best effort: if translation fails, fall back to the original text.
        print("HF translation error:", repr(exc))
        return text

    # Normalise length so the splice below cannot go out of range.
    if len(translations) < len(sources):
        translations += [""] * (len(sources) - len(translations))
    else:
        translations = translations[: len(sources)]

    merged = list(lines)
    for translated, position in zip(translations, positions):
        merged[position] = translated
    return "\n".join(merged)
|
| 121 |
|
|
|
|
| 202 |
|
| 203 |
lines = block.splitlines()
|
| 204 |
character = ""
|
| 205 |
+
out_lines: List[str] = []
|
| 206 |
|
| 207 |
for line in lines:
|
| 208 |
original = line.strip()
|
|
|
|
| 319 |
def process_srt_files(files, translate_to_tr: bool):
|
| 320 |
"""
|
| 321 |
Çoklu SRT al, hepsini DOCX'e çevir, tek ZIP döndür.
|
|
|
|
| 322 |
"""
|
| 323 |
if not files:
|
| 324 |
return None
|
| 325 |
|
|
|
|
| 326 |
paths = [Path(p) for p in files]
|
| 327 |
|
| 328 |
zip_buffer = io.BytesIO()
|
|
|
|
| 332 |
zf.writestr(doc_name, doc_bytes)
|
| 333 |
|
| 334 |
zip_buffer.seek(0)
|
|
|
|
| 335 |
out_zip_path = "converted_subtitles.zip"
|
| 336 |
with open(out_zip_path, "wb") as f:
|
| 337 |
f.write(zip_buffer.read())
|
|
|
|
| 346 |
with gr.Blocks() as demo:
|
| 347 |
gr.Markdown(
|
| 348 |
"""
|
| 349 |
+
# SRT → DOCX (Character / TC / TEXT) + EN→TR (HF Inference API)
|
| 350 |
|
| 351 |
- Bir veya birden fazla **.srt** yükle.
|
| 352 |
- Her satır için:
|
| 353 |
- **Character**: `WOMAN:`, `LEWIS:`, `NURSE:` gibi isimler çıkarılır (**çeviri yok**).
|
| 354 |
- **TC**: sadece **MM.SS** (start time'dan).
|
| 355 |
- **TEXT**: `NAME:` prefix'leri atılmış metin.
|
| 356 |
+
- İstersen TEXT'i **Hugging Face Inference API** ile EN→TR çevir.
|
| 357 |
- Çıktı: Tüm DOCX'leri içeren tek bir **ZIP**.
|
| 358 |
"""
|
| 359 |
)
|
|
|
|
| 367 |
)
|
| 368 |
|
| 369 |
translate_chk = gr.Checkbox(
|
| 370 |
+
label="Translate TEXT (EN → TR, via HF Inference API)",
|
| 371 |
value=False,
|
| 372 |
)
|
| 373 |
|