Spaces:

Joyboy-dy
/

lyric-sync-api

Sleeping

App Files Files Community

Joyboy-dy committed on Feb 7

Commit

e2fffa6

1 Parent(s): 7bfdd1b

Add /translate endpoint for SRT translation via OpenAI

Browse files

Files changed (3) hide show

diff_output.txt +0 -0
requirements.txt +2 -1
server.py +311 -123

diff_output.txt ADDED Viewed

Binary file (40.2 kB). View file

requirements.txt CHANGED Viewed

@@ -1,5 +1,6 @@
 fastapi
 uvicorn[standard]
 python-multipart
-whisperx
 torch

 fastapi
 uvicorn[standard]
 python-multipart
 torch
+openai-whisper
+openai

server.py CHANGED Viewed

@@ -3,78 +3,37 @@ import re
 import shutil
 import tempfile
 from contextlib import asynccontextmanager
-from pathlib import Path
-import whisperx
-from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import FileResponse
 DEVICE = "cpu"
-MODEL_SIZE = "medium"
-model = None
-model_a = None
-metadata = None
-def _configure_torch_safe_loading_for_pyannote() -> None:
-    # PyTorch 2.6+ defaults torch.load(weights_only=True). Some pyannote checkpoints
-    # include OmegaConf objects; allowlisting avoids startup crashes.
-    try:
-        import torch  # noqa: F401
-        from omegaconf import DictConfig, ListConfig
-        import torch.serialization
-        torch.serialization.add_safe_globals([DictConfig, ListConfig])
-    except Exception:
-        # Best-effort: if deps aren't present, ignore.
-        return
-def _load_whisperx_asr_model():
-    # Prefer silero VAD to avoid pyannote checkpoint issues on some environments.
-    common_kwargs = {"device": DEVICE, "compute_type": "int8"}
-    try:
-        return whisperx.load_model(MODEL_SIZE, vad_method="silero", **common_kwargs)
-    except TypeError:
-        # Older WhisperX versions may not support vad_method.
-        _configure_torch_safe_loading_for_pyannote()
-        return whisperx.load_model(MODEL_SIZE, **common_kwargs)
-def _transcribe_with_compat(asr_model, audio_path: str) -> dict:
-    """
-    WhisperX versions differ:
-    - Some expose vad_filter/batch_size on .transcribe()
-    - Some (FasterWhisperPipeline) don't accept vad_filter
-    We prefer VAD when supported, but never fail the request because of kwargs.
-    """
-    try:
-        return asr_model.transcribe(audio_path, batch_size=4, vad_filter=True)
-    except TypeError:
-        try:
-            return asr_model.transcribe(audio_path, batch_size=4)
-        except TypeError:
-            return asr_model.transcribe(audio_path)
-def _align_with_compat(segments: list[dict], audio_path: str) -> dict:
-    # WhisperX align() sometimes expects raw audio array rather than a path.
-    try:
-        return whisperx.align(segments, model_a, metadata, audio_path, DEVICE)
-    except Exception:
-        audio = whisperx.load_audio(audio_path)
-        return whisperx.align(segments, model_a, metadata, audio, DEVICE)
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    global model, model_a, metadata
-    print("Server starting up - loading WhisperX models...")
-    model = _load_whisperx_asr_model()
-    model_a, metadata = whisperx.load_align_model(language_code="fr", device=DEVICE)
-    print("WhisperX models ready")
     yield
     print("Server shutting down...")
@@ -93,7 +52,13 @@ app.add_middleware(
 @app.get("/")
 @app.head("/")
 async def root():
-    return {"service": "LyricSync Backend", "engine": "whisperx", "status": "operational"}
 @app.get("/health")
@@ -117,7 +82,8 @@ def _format_srt_time(seconds: float) -> str:
     return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
-def _write_srt_file(segments: list[dict], file_obj) -> None:
     index = 1
     for segment in segments:
         text = (segment.get("text") or "").strip()
@@ -126,14 +92,24 @@ def _write_srt_file(segments: list[dict], file_obj) -> None:
         if not text or start is None or end is None:
             continue
-        file_obj.write(f"{index}\n")
-        file_obj.write(f"{_format_srt_time(start)} --> {_format_srt_time(end)}\n")
-        file_obj.write(f"{text}\n\n")
         index += 1
-_STRONG_PUNCT_RE = re.compile(r"[.!?]+$")
-_SOFT_PUNCT_RE = re.compile(r"[,;:]+$")
 def _cleanup_spacing(text: str) -> str:
@@ -143,40 +119,58 @@ def _cleanup_spacing(text: str) -> str:
     return text.strip()
-def _extract_word_segments(aligned_segments: list[dict]) -> list[dict]:
-    words: list[dict] = []
-    for segment in aligned_segments:
-        for word in segment.get("words") or []:
-            token = (word.get("word") or word.get("text") or "").strip()
-            start = word.get("start")
-            end = word.get("end")
-            if not token or start is None or end is None:
-                continue
-            entry = {"word": token, "start": float(start), "end": float(end)}
-            score = word.get("score")
-            if score is None:
-                score = word.get("probability")
-            if score is not None:
-                entry["score"] = float(score)
-            words.append(entry)
-    words.sort(key=lambda w: (w["start"], w["end"]))
-    return words
-def _paragraph_segments_from_aligned(aligned_segments: list[dict]) -> list[dict]:
     segments: list[dict] = []
-    for seg in aligned_segments:
         text = _cleanup_spacing((seg.get("text") or "").strip())
-        words = [w for w in (seg.get("words") or []) if w.get("start") is not None and w.get("end") is not None]
-        if not text or not words:
             continue
-        start = float(words[0]["start"])
-        end = float(words[-1]["end"])
-        segments.append({"start": start, "end": end, "text": text})
     return segments
 def _sentence_segments_from_words(word_segments: list[dict], max_words: int = 8, gap_s: float = 0.4) -> list[dict]:
     segments: list[dict] = []
     current: list[dict] = []
@@ -217,53 +211,247 @@ def _sentence_segments_from_words(word_segments: list[dict], max_words: int = 8,
     return segments
-@app.post("/align")
-async def align_audio(
-    background_tasks: BackgroundTasks,
     audio_file: UploadFile = File(...),
-    srt_mode: str = Form("paragraph"),
 ):
-    if model is None or model_a is None or metadata is None:
-        raise HTTPException(status_code=503, detail="WhisperX models are not ready")
     temp_dir = tempfile.mkdtemp(prefix="lyric-sync-")
     try:
-        if srt_mode not in ("paragraph", "sentence"):
-            raise HTTPException(status_code=400, detail="Invalid srt_mode (expected 'paragraph' or 'sentence')")
         source_name = audio_file.filename or "audio"
         audio_path = os.path.join(temp_dir, source_name)
         with open(audio_path, "wb") as f:
             shutil.copyfileobj(audio_file.file, f)
-        result = _transcribe_with_compat(model, audio_path)
-        result = _align_with_compat(result["segments"], audio_path)
-        word_segments = _extract_word_segments(result["segments"])
-        if srt_mode == "sentence":
-            srt_segments = _sentence_segments_from_words(word_segments)
-        else:
-            srt_segments = _paragraph_segments_from_aligned(result["segments"])
-        srt_path = os.path.join(temp_dir, f"{Path(source_name).stem}.srt")
-        with open(srt_path, "w", encoding="utf-8") as srt_file:
-            _write_srt_file(srt_segments, srt_file)
-        background_tasks.add_task(_cleanup_temp_dir, temp_dir)
-        return FileResponse(
-            path=srt_path,
-            media_type="application/x-subrip",
-            filename=f"{Path(source_name).stem}.srt",
-            background=background_tasks,
         )
     except Exception as e:
-        _cleanup_temp_dir(temp_dir)
-        raise HTTPException(status_code=500, detail=str(e)) from e
-    finally:
-        audio_file.file.close()
 if __name__ == "__main__":

 import shutil
 import tempfile
 from contextlib import asynccontextmanager
+from typing import Literal
+import whisper
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import PlainTextResponse
+from pydantic import BaseModel
 DEVICE = "cpu"  # device the Whisper model is moved to at startup; also reported by the "/" endpoint
+WHISPER_MODEL_NAME = "large-v2"  # checkpoint name handed to whisper.load_model()
+whisper_model = None  # set by lifespan() at startup; stays None until the model has been loaded
+SrtMode = Literal["lyric", "paragraph"]  # output modes understood by _segments_for_mode()
+class TranslateRequest(BaseModel):  # request body accepted by POST /translate
+    srt_content: str  # SRT document text; parsed with _parse_srt()
+    target_language: str  # language code; must be one of the keys of _LANGUAGES
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    global whisper_model
+    print(f"Server starting up - loading Whisper model '{WHISPER_MODEL_NAME}' on {DEVICE}...")
+    whisper_model = whisper.load_model(WHISPER_MODEL_NAME)
+    try:
+        whisper_model.to(DEVICE)
+    except Exception:
+        # Best effort: some whisper builds may not expose .to()
+        pass
+    print("Whisper model ready")
     yield
     print("Server shutting down...")
 @app.get("/")
 @app.head("/")
 async def root():
+    return {
+        "service": "LyricSync Backend",
+        "engine": "openai-whisper",
+        "model": WHISPER_MODEL_NAME,
+        "device": DEVICE,
+        "status": "operational",
+    }
 @app.get("/health")
     return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
+def _build_srt(segments: list[dict]) -> str:
+    lines: list[str] = []
     index = 1
     for segment in segments:
         text = (segment.get("text") or "").strip()
         if not text or start is None or end is None:
             continue
+        lines.append(str(index))
+        lines.append(f"{_format_srt_time(float(start))} --> {_format_srt_time(float(end))}")
+        lines.append(text)
+        lines.append("")
         index += 1
+    if not lines:
+        return ""
+    return "\n".join(lines).rstrip() + "\n"
+_STRONG_PUNCT_RE = re.compile(r"[.!?。！？]+$")  # run of sentence-ending punctuation (ASCII or full-width) at the end of a string
+_SOFT_PUNCT_RE = re.compile(r"[,;:、，；：]+$")  # run of clause-level punctuation (ASCII or full-width) at the end of a string
+_INSTRUMENTAL_RE = re.compile(  # a whole string that is only a marker such as "[music]", "(instrumental)" or note symbols
+    r"^\s*(?:\[(?:music|instrumental|applause|silence)\]|\((?:music|instrumental)\)|[♪♫]+)\s*$",
+    re.IGNORECASE,
+)
 def _cleanup_spacing(text: str) -> str:
     return text.strip()
+def _is_instrumental_text(text: str) -> bool:
+    if not text or not text.strip():
+        return True
+    cleaned = text.strip()
+    return bool(_INSTRUMENTAL_RE.match(cleaned))
+def _whisper_segments(transcribe_result: dict) -> list[dict]:
     segments: list[dict] = []
+    for seg in transcribe_result.get("segments") or []:
         text = _cleanup_spacing((seg.get("text") or "").strip())
+        start = seg.get("start")
+        end = seg.get("end")
+        if start is None or end is None:
+            continue
+        if _is_instrumental_text(text):
             continue
+        segments.append({"start": float(start), "end": float(end), "text": text})
     return segments
+def _tokenize_units(text: str) -> list[str]:
+    text = (text or "").strip()
+    if not text:
+        return []
+    if re.search(r"\s", text):
+        return [t for t in text.split() if t]
+    # Languages without spaces (CJK, etc.): approximate words by chunking.
+    chunk_size = 4
+    return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size) if text[i : i + chunk_size].strip()]
+def _pseudo_word_segments_from_whisper(segments: list[dict]) -> list[dict]:
+    words: list[dict] = []
+    for seg in segments:
+        units = _tokenize_units(seg["text"])
+        if not units:
+            continue
+        start = float(seg["start"])
+        end = float(seg["end"])
+        dur = max(0.001, end - start)
+        step = dur / len(units)
+        for idx, unit in enumerate(units):
+            w_start = start + (idx * step)
+            w_end = start + ((idx + 1) * step)
+            words.append({"word": unit, "start": w_start, "end": w_end})
+    words.sort(key=lambda w: (w["start"], w["end"]))
+    return words
 def _sentence_segments_from_words(word_segments: list[dict], max_words: int = 8, gap_s: float = 0.4) -> list[dict]:
     segments: list[dict] = []
     current: list[dict] = []
     return segments
def _transcribe_audio(audio_path: str) -> dict:
    """Run the loaded Whisper model on the file at *audio_path*.

    Returns:
        Whatever ``whisper_model.transcribe`` returns for the file.

    Raises:
        HTTPException: 503 when the model has not been loaded yet.
    """
    loaded = whisper_model
    if loaded is None:
        raise HTTPException(status_code=503, detail="Whisper model is not ready")
    options = {
        "fp16": False,
        "verbose": False,
        "condition_on_previous_text": False,
        "no_speech_threshold": 0.7,
    }
    return loaded.transcribe(audio_path, **options)
def _segments_for_mode(segments: list[dict], mode: SrtMode) -> list[dict]:
    """Shape *segments* for the requested SRT mode.

    "paragraph" returns the segments untouched; any other mode is treated as
    lyric mode and re-cuts the text into short lines (up to 8 units each) via
    pseudo word timings.
    """
    if mode != "paragraph":
        approximated = _pseudo_word_segments_from_whisper(segments)
        return _sentence_segments_from_words(approximated, max_words=8, gap_s=0.4)
    return segments
@app.post("/srt", response_class=PlainTextResponse)
async def generate_srt(
    audio_file: UploadFile = File(...),
    srt_mode: str = Form("lyric"),
):
    """
    Generate SRT subtitles from an uploaded audio file with the loaded Whisper model.

    srt_mode:
      - lyric (default): short lines for lyric videos
      - paragraph: raw Whisper segments (longer transcript blocks)
      - sentence: legacy value sent by the old UI, treated as "lyric"

    The upload is copied into a per-request temporary directory, which is
    removed again before the handler returns.

    Raises:
        HTTPException: 400 for an unknown srt_mode, 500 wrapping any unexpected
            failure; HTTPExceptions raised by helpers are passed through as-is.
    """
    temp_dir = tempfile.mkdtemp(prefix="lyric-sync-")
    try:
        mode = srt_mode.strip().lower()
        # Backward-compat with old UI values.
        if mode == "sentence":
            mode = "lyric"
        if mode not in ("lyric", "paragraph"):
            raise HTTPException(status_code=400, detail="Invalid srt_mode (expected 'lyric' or 'paragraph')")

        # The filename is chosen by the client. Joining it unchanged would let
        # "../x" or an absolute path place the file outside temp_dir, so keep
        # only the final path component (for both "/" and "\\" separators).
        source_name = os.path.basename((audio_file.filename or "").replace("\\", "/"))
        if source_name in ("", ".", ".."):
            source_name = "audio"
        audio_path = os.path.join(temp_dir, source_name)
        with open(audio_path, "wb") as f:
            shutil.copyfileobj(audio_file.file, f)

        # NOTE: _transcribe_audio is a plain synchronous call made from this
        # coroutine.
        transcribe_result = _transcribe_audio(audio_path)
        whisper_segs = _whisper_segments(transcribe_result)
        srt_segments = _segments_for_mode(whisper_segs, mode)  # type: ignore[arg-type]
        srt_content = _build_srt(srt_segments)
        return PlainTextResponse(content=srt_content, media_type="application/x-subrip")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Always release the upload handle and the temp directory, even on error.
        try:
            audio_file.file.close()
        finally:
            _cleanup_temp_dir(temp_dir)
@app.post("/align", response_class=PlainTextResponse)
async def align_audio_compat(
    audio_file: UploadFile = File(...),
    srt_mode: str = Form("lyric"),
):
    """Legacy alias of the SRT endpoint, kept because the old frontend posts to /align."""
    response = await generate_srt(audio_file=audio_file, srt_mode=srt_mode)
    return response
+_SRT_TS_RE = re.compile(
+    r"^(?P<start>\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(?P<end>\d{2}:\d{2}:\d{2},\d{3})\s*$"
+)
+def _parse_srt(srt_content: str) -> list[dict]:
+    blocks: list[dict] = []
+    lines = (srt_content or "").splitlines()
+    i = 0
+    while i < len(lines):
+        if not lines[i].strip():
+            i += 1
+            continue
+        raw_index = lines[i].strip()
+        try:
+            index = int(raw_index)
+        except ValueError:
+            index = len(blocks) + 1
+        i += 1
+        if i >= len(lines):
+            break
+        m = _SRT_TS_RE.match(lines[i].strip())
+        if not m:
+            # Skip malformed block
+            i += 1
+            continue
+        start = m.group("start")
+        end = m.group("end")
+        i += 1
+        text_lines: list[str] = []
+        while i < len(lines) and lines[i].strip():
+            text_lines.append(lines[i].rstrip("\n"))
+            i += 1
+        blocks.append({"index": index, "start": start, "end": end, "text": "\n".join(text_lines).strip()})
+    return blocks
+def _render_srt(blocks: list[dict]) -> str:
+    out: list[str] = []
+    for idx, block in enumerate(blocks, start=1):
+        out.append(str(idx))
+        out.append(f"{block['start']} --> {block['end']}")
+        out.append((block.get("text") or "").strip())
+        out.append("")
+    return "\n".join(out).rstrip() + "\n"
+_LANGUAGES: dict[str, dict] = {  # accepted /translate target codes -> settings used to build the translation prompt
+    "en": {"label": "English", "transliterate": False},  # "label" is the language name written into the prompt
+    "fr": {"label": "French", "transliterate": False},
+    "es": {"label": "Spanish", "transliterate": False},
+    "de": {"label": "German", "transliterate": False},
+    "it": {"label": "Italian", "transliterate": False},
+    "ja": {"label": "Japanese (Romaji)", "transliterate": True, "scheme": "Romaji"},  # "transliterate": True adds a Latin-script-only rule; "scheme" names the system requested in that rule
+    "zh-Hans": {"label": "Chinese (Simplified, Pinyin)", "transliterate": True, "scheme": "Hanyu Pinyin"},
+    "zh-Hant": {"label": "Chinese (Traditional, Pinyin)", "transliterate": True, "scheme": "Hanyu Pinyin"},
+    "ko": {"label": "Korean (Romanized)", "transliterate": True, "scheme": "Revised Romanization"},
+    "th": {"label": "Thai (Romanized)", "transliterate": True, "scheme": "RTGS"},
+    "pt": {"label": "Portuguese", "transliterate": False},
+    "ru": {"label": "Russian (Transliterated)", "transliterate": True, "scheme": "Latin transliteration"},
+    "ar": {"label": "Arabic (Latin phonetic)", "transliterate": True, "scheme": "Latin phonetic"},
+    "hi": {"label": "Hindi (Latin transliteration)", "transliterate": True, "scheme": "Latin transliteration"},
+    "nl": {"label": "Dutch", "transliterate": False},
+    "id": {"label": "Indonesian", "transliterate": False},
+    "vi": {"label": "Vietnamese", "transliterate": False},
+    "tr": {"label": "Turkish", "transliterate": False},
+    "pl": {"label": "Polish", "transliterate": False},
+}
+def _translate_blocks_via_openai(texts: list[str], target_code: str) -> list[str]:
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        raise HTTPException(status_code=503, detail="OPENAI_API_KEY is not configured on the server")
+    language = _LANGUAGES[target_code]
+    label = language["label"]
+    transliterate = bool(language.get("transliterate"))
+    scheme = language.get("scheme")
+    system = (
+        "You translate short subtitle lines. Preserve meaning, punctuation, and line breaks. "
+        "Return ONLY valid JSON with shape {\"translations\": [..]}. No markdown."
+    )
+    translit_rule = ""
+    if transliterate:
+        extra = f" using {scheme}" if scheme else ""
+        translit_rule = (
+            f"IMPORTANT: Output MUST be Latin-script transliteration{extra}. "
+            "Do NOT output any native-script characters (no Kana/Kanji/Hanzi/Hangul/Cyrillic/Arabic/Devanagari/Thai)."
         )
+    from openai import OpenAI
+    client = OpenAI(api_key=api_key)
+    model_name = os.environ.get("OPENAI_TRANSLATE_MODEL", "gpt-4o-mini")
+    user = {
+        "target_language": label,
+        "rule": translit_rule,
+        "lines": texts,
+    }
+    import json
+    user_json = json.dumps(user, ensure_ascii=False)
+    resp = client.chat.completions.create(
+        model=model_name,
+        temperature=0,
+        messages=[
+            {"role": "system", "content": system},
+            {
+                "role": "user",
+                "content": (
+                    f"Translate each line to {label}. {translit_rule}\n"
+                    "Return JSON: {\"translations\": [\"...\", ...]} with the same length and order as input.\n\n"
+                    "Input JSON:\n"
+                    f"{user_json}"
+                ),
+            },
+        ],
+    )
+    content = (resp.choices[0].message.content or "").strip()
+    try:
+        import json
+        # Best-effort: extract the first JSON object from the response.
+        start = content.find("{")
+        end = content.rfind("}")
+        payload = json.loads(content[start : end + 1] if start != -1 and end != -1 else content)
+        translations = payload.get("translations")
+        if not isinstance(translations, list) or len(translations) != len(texts):
+            raise ValueError("Invalid translations payload")
+        return [str(t) for t in translations]
     except Exception as e:
+        raise HTTPException(status_code=502, detail=f"Translation parsing failed: {e}") from e
@app.post("/translate", response_class=PlainTextResponse)
async def translate_srt(req: TranslateRequest):
    """Translate the text of every cue in an SRT document.

    Timings are kept as they are; cues are renumbered from 1 when the document
    is rendered again.

    Args:
        req: Request body carrying the SRT text and the target language code.

    Returns:
        The translated SRT as ``application/x-subrip`` text.

    Raises:
        HTTPException: 400 for an unsupported language or for SRT content
            without any parsable cue; errors raised by
            ``_translate_blocks_via_openai`` are passed through.
    """
    import asyncio

    target = (req.target_language or "").strip()
    if target not in _LANGUAGES:
        raise HTTPException(status_code=400, detail=f"Unsupported target_language (supported: {', '.join(_LANGUAGES)})")

    blocks = _parse_srt(req.srt_content)
    if not blocks:
        raise HTTPException(status_code=400, detail="Empty or invalid SRT content")

    texts = [b["text"] for b in blocks]
    # Chunk to keep prompts manageable.
    translated: list[str] = []
    chunk_size = 60
    for offset in range(0, len(texts), chunk_size):
        chunk = texts[offset : offset + chunk_size]
        # The translation helper makes a synchronous network call; running it
        # in a worker thread keeps the event loop free for other requests.
        translated.extend(await asyncio.to_thread(_translate_blocks_via_openai, chunk, target))

    for block, new_text in zip(blocks, translated, strict=True):
        block["text"] = (new_text or "").strip()
    return PlainTextResponse(content=_render_srt(blocks), media_type="application/x-subrip")
 if __name__ == "__main__":