Joyboy-dy committed on
Commit
e2fffa6
·
1 Parent(s): 7bfdd1b

Add /translate endpoint for SRT translation via OpenAI

Browse files
Files changed (3) hide show
  1. diff_output.txt +0 -0
  2. requirements.txt +2 -1
  3. server.py +311 -123
diff_output.txt ADDED
Binary file (40.2 kB). View file
 
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  fastapi
2
  uvicorn[standard]
3
  python-multipart
4
- whisperx
5
  torch
 
 
 
1
  fastapi
2
  uvicorn[standard]
3
  python-multipart
 
4
  torch
5
+ openai-whisper
6
+ openai
server.py CHANGED
@@ -3,78 +3,37 @@ import re
3
  import shutil
4
  import tempfile
5
  from contextlib import asynccontextmanager
6
- from pathlib import Path
7
 
8
- import whisperx
9
- from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
10
  from fastapi.middleware.cors import CORSMiddleware
11
- from fastapi.responses import FileResponse
 
12
 
13
  DEVICE = "cpu"
14
- MODEL_SIZE = "medium"
15
- model = None
16
- model_a = None
17
- metadata = None
18
 
 
19
 
20
- def _configure_torch_safe_loading_for_pyannote() -> None:
21
- # PyTorch 2.6+ defaults torch.load(weights_only=True). Some pyannote checkpoints
22
- # include OmegaConf objects; allowlisting avoids startup crashes.
23
- try:
24
- import torch # noqa: F401
25
- from omegaconf import DictConfig, ListConfig
26
-
27
- import torch.serialization
28
 
29
- torch.serialization.add_safe_globals([DictConfig, ListConfig])
30
- except Exception:
31
- # Best-effort: if deps aren't present, ignore.
32
- return
33
-
34
-
35
- def _load_whisperx_asr_model():
36
- # Prefer silero VAD to avoid pyannote checkpoint issues on some environments.
37
- common_kwargs = {"device": DEVICE, "compute_type": "int8"}
38
- try:
39
- return whisperx.load_model(MODEL_SIZE, vad_method="silero", **common_kwargs)
40
- except TypeError:
41
- # Older WhisperX versions may not support vad_method.
42
- _configure_torch_safe_loading_for_pyannote()
43
- return whisperx.load_model(MODEL_SIZE, **common_kwargs)
44
-
45
-
46
- def _transcribe_with_compat(asr_model, audio_path: str) -> dict:
47
- """
48
- WhisperX versions differ:
49
- - Some expose vad_filter/batch_size on .transcribe()
50
- - Some (FasterWhisperPipeline) don't accept vad_filter
51
- We prefer VAD when supported, but never fail the request because of kwargs.
52
- """
53
- try:
54
- return asr_model.transcribe(audio_path, batch_size=4, vad_filter=True)
55
- except TypeError:
56
- try:
57
- return asr_model.transcribe(audio_path, batch_size=4)
58
- except TypeError:
59
- return asr_model.transcribe(audio_path)
60
-
61
-
62
- def _align_with_compat(segments: list[dict], audio_path: str) -> dict:
63
- # WhisperX align() sometimes expects raw audio array rather than a path.
64
- try:
65
- return whisperx.align(segments, model_a, metadata, audio_path, DEVICE)
66
- except Exception:
67
- audio = whisperx.load_audio(audio_path)
68
- return whisperx.align(segments, model_a, metadata, audio, DEVICE)
69
 
70
 
71
  @asynccontextmanager
72
  async def lifespan(app: FastAPI):
73
- global model, model_a, metadata
74
- print("Server starting up - loading WhisperX models...")
75
- model = _load_whisperx_asr_model()
76
- model_a, metadata = whisperx.load_align_model(language_code="fr", device=DEVICE)
77
- print("WhisperX models ready")
 
 
 
 
78
  yield
79
  print("Server shutting down...")
80
 
@@ -93,7 +52,13 @@ app.add_middleware(
93
  @app.get("/")
94
  @app.head("/")
95
  async def root():
96
- return {"service": "LyricSync Backend", "engine": "whisperx", "status": "operational"}
 
 
 
 
 
 
97
 
98
 
99
  @app.get("/health")
@@ -117,7 +82,8 @@ def _format_srt_time(seconds: float) -> str:
117
  return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
118
 
119
 
120
- def _write_srt_file(segments: list[dict], file_obj) -> None:
 
121
  index = 1
122
  for segment in segments:
123
  text = (segment.get("text") or "").strip()
@@ -126,14 +92,24 @@ def _write_srt_file(segments: list[dict], file_obj) -> None:
126
  if not text or start is None or end is None:
127
  continue
128
 
129
- file_obj.write(f"{index}\n")
130
- file_obj.write(f"{_format_srt_time(start)} --> {_format_srt_time(end)}\n")
131
- file_obj.write(f"{text}\n\n")
 
132
  index += 1
133
 
 
 
 
 
134
 
135
- _STRONG_PUNCT_RE = re.compile(r"[.!?]+$")
136
- _SOFT_PUNCT_RE = re.compile(r"[,;:]+$")
 
 
 
 
 
137
 
138
 
139
  def _cleanup_spacing(text: str) -> str:
@@ -143,40 +119,58 @@ def _cleanup_spacing(text: str) -> str:
143
  return text.strip()
144
 
145
 
146
- def _extract_word_segments(aligned_segments: list[dict]) -> list[dict]:
147
- words: list[dict] = []
148
- for segment in aligned_segments:
149
- for word in segment.get("words") or []:
150
- token = (word.get("word") or word.get("text") or "").strip()
151
- start = word.get("start")
152
- end = word.get("end")
153
- if not token or start is None or end is None:
154
- continue
155
- entry = {"word": token, "start": float(start), "end": float(end)}
156
- score = word.get("score")
157
- if score is None:
158
- score = word.get("probability")
159
- if score is not None:
160
- entry["score"] = float(score)
161
- words.append(entry)
162
 
163
- words.sort(key=lambda w: (w["start"], w["end"]))
164
- return words
165
 
166
-
167
- def _paragraph_segments_from_aligned(aligned_segments: list[dict]) -> list[dict]:
168
  segments: list[dict] = []
169
- for seg in aligned_segments:
170
  text = _cleanup_spacing((seg.get("text") or "").strip())
171
- words = [w for w in (seg.get("words") or []) if w.get("start") is not None and w.get("end") is not None]
172
- if not text or not words:
 
 
 
173
  continue
174
- start = float(words[0]["start"])
175
- end = float(words[-1]["end"])
176
- segments.append({"start": start, "end": end, "text": text})
177
  return segments
178
 
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  def _sentence_segments_from_words(word_segments: list[dict], max_words: int = 8, gap_s: float = 0.4) -> list[dict]:
181
  segments: list[dict] = []
182
  current: list[dict] = []
@@ -217,53 +211,247 @@ def _sentence_segments_from_words(word_segments: list[dict], max_words: int = 8,
217
  return segments
218
 
219
 
220
- @app.post("/align")
221
- async def align_audio(
222
- background_tasks: BackgroundTasks,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  audio_file: UploadFile = File(...),
224
- srt_mode: str = Form("paragraph"),
225
  ):
226
- if model is None or model_a is None or metadata is None:
227
- raise HTTPException(status_code=503, detail="WhisperX models are not ready")
228
 
 
 
 
 
229
  temp_dir = tempfile.mkdtemp(prefix="lyric-sync-")
230
-
231
  try:
232
- if srt_mode not in ("paragraph", "sentence"):
233
- raise HTTPException(status_code=400, detail="Invalid srt_mode (expected 'paragraph' or 'sentence')")
 
 
 
 
234
 
235
  source_name = audio_file.filename or "audio"
236
  audio_path = os.path.join(temp_dir, source_name)
237
  with open(audio_path, "wb") as f:
238
  shutil.copyfileobj(audio_file.file, f)
239
 
240
- result = _transcribe_with_compat(model, audio_path)
241
- result = _align_with_compat(result["segments"], audio_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
- word_segments = _extract_word_segments(result["segments"])
244
- if srt_mode == "sentence":
245
- srt_segments = _sentence_segments_from_words(word_segments)
246
- else:
247
- srt_segments = _paragraph_segments_from_aligned(result["segments"])
248
 
249
- srt_path = os.path.join(temp_dir, f"{Path(source_name).stem}.srt")
250
- with open(srt_path, "w", encoding="utf-8") as srt_file:
251
- _write_srt_file(srt_segments, srt_file)
252
 
253
- background_tasks.add_task(_cleanup_temp_dir, temp_dir)
254
 
255
- return FileResponse(
256
- path=srt_path,
257
- media_type="application/x-subrip",
258
- filename=f"{Path(source_name).stem}.srt",
259
- background=background_tasks,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  )
261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  except Exception as e:
263
- _cleanup_temp_dir(temp_dir)
264
- raise HTTPException(status_code=500, detail=str(e)) from e
265
- finally:
266
- audio_file.file.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
 
269
  if __name__ == "__main__":
 
3
  import shutil
4
  import tempfile
5
  from contextlib import asynccontextmanager
6
+ from typing import Literal
7
 
8
+ import whisper
9
+ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
10
  from fastapi.middleware.cors import CORSMiddleware
11
+ from fastapi.responses import PlainTextResponse
12
+ from pydantic import BaseModel
13
 
14
  DEVICE = "cpu"
15
+ WHISPER_MODEL_NAME = "large-v2"
16
+ whisper_model = None
 
 
17
 
18
+ SrtMode = Literal["lyric", "paragraph"]
19
 
 
 
 
 
 
 
 
 
20
 
21
+ class TranslateRequest(BaseModel):
22
+ srt_content: str
23
+ target_language: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
 
26
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the Whisper model at startup and release it at shutdown.

    The model is loaded directly onto ``DEVICE`` so the device reported by the
    root endpoint is the device actually used. A load failure propagates and
    aborts startup instead of being silently ignored.
    """
    global whisper_model
    print(f"Server starting up - loading Whisper model '{WHISPER_MODEL_NAME}' on {DEVICE}...")
    # Passing the device to load_model avoids loading on an auto-selected
    # device first and then moving the weights in a best-effort step.
    whisper_model = whisper.load_model(WHISPER_MODEL_NAME, device=DEVICE)
    print("Whisper model ready")
    try:
        yield
    finally:
        print("Server shutting down...")
        # Drop the reference so the weights can be reclaimed.
        whisper_model = None
39
 
 
52
@app.get("/")
@app.head("/")
async def root():
    """Describe the service: name, transcription engine, model, device and status."""
    descriptor = {"service": "LyricSync Backend", "engine": "openai-whisper"}
    descriptor["model"] = WHISPER_MODEL_NAME
    descriptor["device"] = DEVICE
    descriptor["status"] = "operational"
    return descriptor
62
 
63
 
64
  @app.get("/health")
 
82
  return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
83
 
84
 
85
+ def _build_srt(segments: list[dict]) -> str:
86
+ lines: list[str] = []
87
  index = 1
88
  for segment in segments:
89
  text = (segment.get("text") or "").strip()
 
92
  if not text or start is None or end is None:
93
  continue
94
 
95
+ lines.append(str(index))
96
+ lines.append(f"{_format_srt_time(float(start))} --> {_format_srt_time(float(end))}")
97
+ lines.append(text)
98
+ lines.append("")
99
  index += 1
100
 
101
+ if not lines:
102
+ return ""
103
+
104
+ return "\n".join(lines).rstrip() + "\n"
105
 
106
+
107
+ _STRONG_PUNCT_RE = re.compile(r"[.!?。!?]+$")
108
+ _SOFT_PUNCT_RE = re.compile(r"[,;:、,;:]+$")
109
+ _INSTRUMENTAL_RE = re.compile(
110
+ r"^\s*(?:\[(?:music|instrumental|applause|silence)\]|\((?:music|instrumental)\)|[♪♫]+)\s*$",
111
+ re.IGNORECASE,
112
+ )
113
 
114
 
115
  def _cleanup_spacing(text: str) -> str:
 
119
  return text.strip()
120
 
121
 
122
def _is_instrumental_text(text: str) -> bool:
    """Return True when *text* is empty/blank or matches ``_INSTRUMENTAL_RE``."""
    stripped = text.strip() if text else ""
    if not stripped:
        # Nothing but whitespace: treated the same as an instrumental marker.
        return True
    return _INSTRUMENTAL_RE.match(stripped) is not None
 
 
 
 
 
 
 
 
 
 
 
127
 
 
 
128
 
129
def _whisper_segments(transcribe_result: dict) -> list[dict]:
    """Extract usable ``{"start", "end", "text"}`` segments from a transcription result.

    Segments lacking a start or end time, and segments whose cleaned text is
    judged instrumental/empty, are dropped.
    """
    kept: list[dict] = []
    for raw in transcribe_result.get("segments") or []:
        body = _cleanup_spacing((raw.get("text") or "").strip())
        begin, finish = raw.get("start"), raw.get("end")
        if begin is None or finish is None or _is_instrumental_text(body):
            continue
        kept.append({"start": float(begin), "end": float(finish), "text": body})
    return kept
141
 
142
 
143
+ def _tokenize_units(text: str) -> list[str]:
144
+ text = (text or "").strip()
145
+ if not text:
146
+ return []
147
+
148
+ if re.search(r"\s", text):
149
+ return [t for t in text.split() if t]
150
+
151
+ # Languages without spaces (CJK, etc.): approximate words by chunking.
152
+ chunk_size = 4
153
+ return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size) if text[i : i + chunk_size].strip()]
154
+
155
+
156
def _pseudo_word_segments_from_whisper(segments: list[dict]) -> list[dict]:
    """Spread each segment's duration evenly over its tokenised units.

    Every unit becomes ``{"word", "start", "end"}`` with equal-length time
    slots inside its segment; the result is ordered by (start, end).
    """
    pseudo: list[dict] = []
    for segment in segments:
        tokens = _tokenize_units(segment["text"])
        if not tokens:
            continue
        seg_start = float(segment["start"])
        seg_end = float(segment["end"])
        # Clamp to a tiny positive duration so zero-length segments still get slots.
        slot = max(0.001, seg_end - seg_start) / len(tokens)
        pseudo.extend(
            {
                "word": token,
                "start": seg_start + (pos * slot),
                "end": seg_start + ((pos + 1) * slot),
            }
            for pos, token in enumerate(tokens)
        )
    return sorted(pseudo, key=lambda w: (w["start"], w["end"]))
172
+
173
+
174
  def _sentence_segments_from_words(word_segments: list[dict], max_words: int = 8, gap_s: float = 0.4) -> list[dict]:
175
  segments: list[dict] = []
176
  current: list[dict] = []
 
211
  return segments
212
 
213
 
214
def _transcribe_audio(audio_path: str) -> dict:
    """Transcribe the file at *audio_path* with the loaded Whisper model.

    Raises:
        HTTPException: 503 when the model has not been loaded.
    """
    model = whisper_model
    if model is None:
        raise HTTPException(status_code=503, detail="Whisper model is not ready")

    options = {
        "fp16": False,
        "verbose": False,
        "condition_on_previous_text": False,
        "no_speech_threshold": 0.7,
    }
    return model.transcribe(audio_path, **options)
225
+
226
+
227
def _segments_for_mode(segments: list[dict], mode: SrtMode) -> list[dict]:
    """Return *segments* unchanged for "paragraph", or re-cut into short lines otherwise."""
    if mode != "paragraph":
        # Lyric mode: rebuild short lines (~8 words) from evenly timed pseudo words,
        # breaking on punctuation and pauses.
        return _sentence_segments_from_words(
            _pseudo_word_segments_from_whisper(segments),
            max_words=8,
            gap_s=0.4,
        )
    return segments
234
+
235
+
236
@app.post("/srt", response_class=PlainTextResponse)
async def generate_srt(
    audio_file: UploadFile = File(...),
    srt_mode: str = Form("lyric"),
):
    """
    Generate SRT from audio using official OpenAI Whisper (large-v2) on CPU.

    srt_mode:
    - lyric (default): short lines for lyric videos
    - paragraph: raw Whisper segments (longer transcript blocks)

    Raises HTTPException 400 for an unknown srt_mode and 500 for any
    unexpected processing failure. The upload and its temporary directory are
    always released before returning.
    """
    temp_dir = tempfile.mkdtemp(prefix="lyric-sync-")
    try:
        mode = srt_mode.strip().lower()
        # Backward-compat with old UI values.
        if mode == "sentence":
            mode = "lyric"
        if mode not in ("lyric", "paragraph"):
            raise HTTPException(status_code=400, detail="Invalid srt_mode (expected 'lyric' or 'paragraph')")

        # The filename is client-controlled: keep only its final component so
        # values like "../../x" or "/etc/x" cannot escape temp_dir. Backslashes
        # are normalised first so Windows-style paths are reduced as well.
        raw_name = (audio_file.filename or "").replace("\\", "/")
        source_name = os.path.basename(raw_name)
        if source_name in ("", ".", ".."):
            source_name = "audio"
        audio_path = os.path.join(temp_dir, source_name)
        with open(audio_path, "wb") as f:
            shutil.copyfileobj(audio_file.file, f)

        transcribe_result = _transcribe_audio(audio_path)
        whisper_segs = _whisper_segments(transcribe_result)
        srt_segments = _segments_for_mode(whisper_segs, mode)  # type: ignore[arg-type]
        srt_content = _build_srt(srt_segments)
        return PlainTextResponse(content=srt_content, media_type="application/x-subrip")
    except HTTPException:
        # Deliberate HTTP errors (400/503) pass through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        try:
            audio_file.file.close()
        finally:
            _cleanup_temp_dir(temp_dir)
276
+
277
+
278
@app.post("/align", response_class=PlainTextResponse)
async def align_audio_compat(
    audio_file: UploadFile = File(...),
    srt_mode: str = Form("lyric"),
):
    """Compatibility route for the old frontend, which still posts to /align."""
    response = await generate_srt(audio_file=audio_file, srt_mode=srt_mode)
    return response
285
 
 
 
 
 
 
286
 
287
+ _SRT_TS_RE = re.compile(
288
+ r"^(?P<start>\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(?P<end>\d{2}:\d{2}:\d{2},\d{3})\s*$"
289
+ )
290
 
 
291
 
292
+ def _parse_srt(srt_content: str) -> list[dict]:
293
+ blocks: list[dict] = []
294
+ lines = (srt_content or "").splitlines()
295
+ i = 0
296
+
297
+ while i < len(lines):
298
+ if not lines[i].strip():
299
+ i += 1
300
+ continue
301
+
302
+ raw_index = lines[i].strip()
303
+ try:
304
+ index = int(raw_index)
305
+ except ValueError:
306
+ index = len(blocks) + 1
307
+ i += 1
308
+ if i >= len(lines):
309
+ break
310
+
311
+ m = _SRT_TS_RE.match(lines[i].strip())
312
+ if not m:
313
+ # Skip malformed block
314
+ i += 1
315
+ continue
316
+ start = m.group("start")
317
+ end = m.group("end")
318
+ i += 1
319
+
320
+ text_lines: list[str] = []
321
+ while i < len(lines) and lines[i].strip():
322
+ text_lines.append(lines[i].rstrip("\n"))
323
+ i += 1
324
+
325
+ blocks.append({"index": index, "start": start, "end": end, "text": "\n".join(text_lines).strip()})
326
+
327
+ return blocks
328
+
329
+
330
+ def _render_srt(blocks: list[dict]) -> str:
331
+ out: list[str] = []
332
+ for idx, block in enumerate(blocks, start=1):
333
+ out.append(str(idx))
334
+ out.append(f"{block['start']} --> {block['end']}")
335
+ out.append((block.get("text") or "").strip())
336
+ out.append("")
337
+ return "\n".join(out).rstrip() + "\n"
338
+
339
+
340
+ _LANGUAGES: dict[str, dict] = {
341
+ "en": {"label": "English", "transliterate": False},
342
+ "fr": {"label": "French", "transliterate": False},
343
+ "es": {"label": "Spanish", "transliterate": False},
344
+ "de": {"label": "German", "transliterate": False},
345
+ "it": {"label": "Italian", "transliterate": False},
346
+ "ja": {"label": "Japanese (Romaji)", "transliterate": True, "scheme": "Romaji"},
347
+ "zh-Hans": {"label": "Chinese (Simplified, Pinyin)", "transliterate": True, "scheme": "Hanyu Pinyin"},
348
+ "zh-Hant": {"label": "Chinese (Traditional, Pinyin)", "transliterate": True, "scheme": "Hanyu Pinyin"},
349
+ "ko": {"label": "Korean (Romanized)", "transliterate": True, "scheme": "Revised Romanization"},
350
+ "th": {"label": "Thai (Romanized)", "transliterate": True, "scheme": "RTGS"},
351
+ "pt": {"label": "Portuguese", "transliterate": False},
352
+ "ru": {"label": "Russian (Transliterated)", "transliterate": True, "scheme": "Latin transliteration"},
353
+ "ar": {"label": "Arabic (Latin phonetic)", "transliterate": True, "scheme": "Latin phonetic"},
354
+ "hi": {"label": "Hindi (Latin transliteration)", "transliterate": True, "scheme": "Latin transliteration"},
355
+ "nl": {"label": "Dutch", "transliterate": False},
356
+ "id": {"label": "Indonesian", "transliterate": False},
357
+ "vi": {"label": "Vietnamese", "transliterate": False},
358
+ "tr": {"label": "Turkish", "transliterate": False},
359
+ "pl": {"label": "Polish", "transliterate": False},
360
+ }
361
+
362
+
363
def _translate_blocks_via_openai(texts: list[str], target_code: str) -> list[str]:
    """Translate subtitle lines into the language identified by *target_code*.

    Args:
        texts: Subtitle texts, one per cue.
        target_code: A key of ``_LANGUAGES``.

    Returns:
        Translated strings, same length and order as *texts*.

    Raises:
        HTTPException: 503 when OPENAI_API_KEY is unset, 400 for an unknown
            target code, 502 when the API call fails or its reply cannot be
            parsed into the expected JSON shape.
    """
    import json

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise HTTPException(status_code=503, detail="OPENAI_API_KEY is not configured on the server")

    language = _LANGUAGES.get(target_code)
    if language is None:
        raise HTTPException(status_code=400, detail=f"Unsupported target_language: {target_code}")

    if not texts:
        # Nothing to translate: skip the network round-trip.
        return []

    label = language["label"]
    transliterate = bool(language.get("transliterate"))
    scheme = language.get("scheme")

    system = (
        "You translate short subtitle lines. Preserve meaning, punctuation, and line breaks. "
        "Return ONLY valid JSON with shape {\"translations\": [..]}. No markdown."
    )

    translit_rule = ""
    if transliterate:
        extra = f" using {scheme}" if scheme else ""
        translit_rule = (
            f"IMPORTANT: Output MUST be Latin-script transliteration{extra}. "
            "Do NOT output any native-script characters (no Kana/Kanji/Hanzi/Hangul/Cyrillic/Arabic/Devanagari/Thai)."
        )

    # Imported lazily so the rest of the server works without the package.
    from openai import OpenAI

    client = OpenAI(api_key=api_key)
    model_name = os.environ.get("OPENAI_TRANSLATE_MODEL", "gpt-4o-mini")

    user = {
        "target_language": label,
        "rule": translit_rule,
        "lines": texts,
    }
    user_json = json.dumps(user, ensure_ascii=False)

    try:
        resp = client.chat.completions.create(
            model=model_name,
            temperature=0,
            messages=[
                {"role": "system", "content": system},
                {
                    "role": "user",
                    "content": (
                        f"Translate each line to {label}. {translit_rule}\n"
                        "Return JSON: {\"translations\": [\"...\", ...]} with the same length and order as input.\n\n"
                        "Input JSON:\n"
                        f"{user_json}"
                    ),
                },
            ],
        )
    except Exception as e:
        # Upstream/network failure: report as a bad-gateway error rather than
        # letting it surface as an unhandled 500.
        raise HTTPException(status_code=502, detail=f"Translation request failed: {e}") from e

    try:
        content = (resp.choices[0].message.content or "").strip()
        # Best-effort: extract the outermost JSON object from the response.
        start = content.find("{")
        end = content.rfind("}")
        payload = json.loads(content[start : end + 1] if start != -1 and end > start else content)
        translations = payload.get("translations") if isinstance(payload, dict) else None
        if not isinstance(translations, list) or len(translations) != len(texts):
            raise ValueError("Invalid translations payload")
    except (ValueError, TypeError, IndexError, AttributeError) as e:
        # json.JSONDecodeError is a ValueError subclass.
        raise HTTPException(status_code=502, detail=f"Translation parsing failed: {e}") from e

    # A null entry becomes an empty line instead of the literal text "None".
    return ["" if t is None else str(t) for t in translations]
432
+
433
+
434
@app.post("/translate", response_class=PlainTextResponse)
async def translate_srt(req: TranslateRequest):
    """Translate the text of every cue in an SRT document, keeping its timings.

    Raises HTTPException 400 when the target language is unsupported or the
    SRT contains no parsable cue; errors raised by the translation helper
    propagate unchanged.
    """
    # The OpenAI client call is synchronous; running it in the thread pool
    # keeps the event loop free to serve other requests meanwhile.
    from fastapi.concurrency import run_in_threadpool

    target = (req.target_language or "").strip()
    if target not in _LANGUAGES:
        raise HTTPException(status_code=400, detail=f"Unsupported target_language (supported: {', '.join(_LANGUAGES)})")

    blocks = _parse_srt(req.srt_content)
    if not blocks:
        raise HTTPException(status_code=400, detail="Empty or invalid SRT content")

    texts = [b["text"] for b in blocks]
    # Chunk to keep prompts manageable.
    translated: list[str] = []
    chunk_size = 60
    for offset in range(0, len(texts), chunk_size):
        batch = texts[offset : offset + chunk_size]
        translated.extend(await run_in_threadpool(_translate_blocks_via_openai, batch, target))

    for block, new_text in zip(blocks, translated, strict=True):
        block["text"] = (new_text or "").strip()

    return PlainTextResponse(content=_render_srt(blocks), media_type="application/x-subrip")
455
 
456
 
457
  if __name__ == "__main__":