Spaces:

Joyboy-dy
/

lyric-sync-api

Sleeping

App Files Files Community

Joyboy-dy committed on Feb 7

Commit

7bfdd1b

1 Parent(s): 6b8c228

Update server.py and requirements.txt

Browse files

Files changed (2) hide show

requirements.txt +0 -1
server.py +102 -3

requirements.txt CHANGED Viewed

@@ -3,4 +3,3 @@ uvicorn[standard]
 python-multipart
 whisperx
 torch
-ffmpeg

 python-multipart
 whisperx
 torch

server.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import os
 import shutil
 import tempfile
 from contextlib import asynccontextmanager
 from pathlib import Path
 import whisperx
-from fastapi import BackgroundTasks, FastAPI, File, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse
@@ -131,14 +132,106 @@ def _write_srt_file(segments: list[dict], file_obj) -> None:
         index += 1
 @app.post("/align")
-async def align_audio(background_tasks: BackgroundTasks, audio_file: UploadFile = File(...)):
     if model is None or model_a is None or metadata is None:
         raise HTTPException(status_code=503, detail="WhisperX models are not ready")
     temp_dir = tempfile.mkdtemp(prefix="lyric-sync-")
     try:
         source_name = audio_file.filename or "audio"
         audio_path = os.path.join(temp_dir, source_name)
         with open(audio_path, "wb") as f:
@@ -147,9 +240,15 @@ async def align_audio(background_tasks: BackgroundTasks, audio_file: UploadFile
         result = _transcribe_with_compat(model, audio_path)
         result = _align_with_compat(result["segments"], audio_path)
         srt_path = os.path.join(temp_dir, f"{Path(source_name).stem}.srt")
         with open(srt_path, "w", encoding="utf-8") as srt_file:
-            _write_srt_file(result["segments"], srt_file)
         background_tasks.add_task(_cleanup_temp_dir, temp_dir)

 import os
+import re
 import shutil
 import tempfile
 from contextlib import asynccontextmanager
 from pathlib import Path
 import whisperx
+from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse
         index += 1
+# Matches a run of sentence-ending punctuation (. ! ?) at the end of a token.
+_STRONG_PUNCT_RE = re.compile(r"[.!?]+$")
+# Matches a run of clause-level punctuation (, ; :) at the end of a token.
+_SOFT_PUNCT_RE = re.compile(r"[,;:]+$")
+def _cleanup_spacing(text: str) -> str:
+    """Tidy whitespace around punctuation and parentheses in *text*.
+
+    Whitespace immediately before any of ``, . ; : ! ?`` is removed, as is
+    whitespace directly after ``(`` and directly before ``)``. The result is
+    returned with leading and trailing whitespace stripped.
+    """
+    # Drop whitespace that precedes a punctuation mark ("word ," -> "word,").
+    text = re.sub(r"\s+([,.;:!?])", r"\1", text)
+    # Drop whitespace just inside an opening parenthesis.
+    text = re.sub(r"\(\s+", "(", text)
+    # Drop whitespace just inside a closing parenthesis.
+    text = re.sub(r"\s+\)", ")", text)
+    return text.strip()
+def _extract_word_segments(aligned_segments: list[dict]) -> list[dict]:
+    """Flatten the per-segment word lists into one time-sorted word list.
+
+    Each returned entry is a dict with ``"word"`` (stripped token text),
+    ``"start"`` and ``"end"`` (as floats), plus ``"score"`` (float) when the
+    source word carries a ``"score"`` or ``"probability"`` value.
+
+    Words with an empty token, or with a missing ``"start"`` or ``"end"``,
+    are skipped. The result is sorted by ``(start, end)``.
+    """
+    words: list[dict] = []
+    for segment in aligned_segments:
+        # A missing or None "words" entry is treated as an empty list.
+        for word in segment.get("words") or []:
+            # Token text is read from "word", falling back to "text".
+            token = (word.get("word") or word.get("text") or "").strip()
+            start = word.get("start")
+            end = word.get("end")
+            # Words without text or without both timestamps cannot be placed
+            # on the timeline, so they are left out.
+            if not token or start is None or end is None:
+                continue
+            entry = {"word": token, "start": float(start), "end": float(end)}
+            # Confidence is read from "score", falling back to "probability";
+            # the key is only added when one of them is present.
+            score = word.get("score")
+            if score is None:
+                score = word.get("probability")
+            if score is not None:
+                entry["score"] = float(score)
+            words.append(entry)
+    # Order across all segments by start time, then end time.
+    words.sort(key=lambda w: (w["start"], w["end"]))
+    return words
+def _paragraph_segments_from_aligned(aligned_segments: list[dict]) -> list[dict]:
+    """Build one ``{"start", "end", "text"}`` entry per aligned segment.
+
+    The text is the segment's ``"text"`` after stripping and
+    ``_cleanup_spacing``. Timing comes from the segment's words that have
+    both ``"start"`` and ``"end"``: the first such word's start and the last
+    such word's end, in list order.
+
+    Segments whose cleaned text is empty, or that have no timed words, are
+    skipped.
+    """
+    segments: list[dict] = []
+    for seg in aligned_segments:
+        text = _cleanup_spacing((seg.get("text") or "").strip())
+        # Keep only words that carry both timestamps.
+        words = [w for w in (seg.get("words") or []) if w.get("start") is not None and w.get("end") is not None]
+        # NOTE(review): any segment-level "start"/"end" values are not
+        # consulted here, so a segment with text but no timed words is
+        # dropped -- confirm this is intended.
+        if not text or not words:
+            continue
+        # Uses list order rather than min/max; presumably the words are
+        # already in chronological order -- verify against the aligner output.
+        start = float(words[0]["start"])
+        end = float(words[-1]["end"])
+        segments.append({"start": start, "end": end, "text": text})
+    return segments
+def _sentence_segments_from_words(word_segments: list[dict], max_words: int = 8, gap_s: float = 0.4) -> list[dict]:
+    """Group timed words into short ``{"start", "end", "text"}`` segments.
+
+    Words are consumed in the order given. A group is closed when any of the
+    following holds:
+
+    * the gap between the previous word's end and the next word's start
+      exceeds ``gap_s`` (the next word then starts a new group);
+    * the word just added ends with strong punctuation (``. ! ?``);
+    * the group has reached ``max_words`` words;
+    * the word just added ends with soft punctuation (``, ; :``) and the
+      group already holds at least 4 words.
+
+    Args:
+        word_segments: Dicts with ``"word"``, ``"start"`` and ``"end"`` keys.
+        max_words: Maximum number of words per emitted segment.
+        gap_s: Gap between consecutive words above which a new segment
+            starts; compared against differences of ``"start"``/``"end"``.
+
+    Returns:
+        The emitted segments, each spanning from its first word's start to
+        its last word's end, with the words joined by spaces and passed
+        through ``_cleanup_spacing``.
+    """
+    segments: list[dict] = []
+    current: list[dict] = []
+    def flush() -> None:
+        # Emit the words collected so far as one segment and reset the buffer.
+        nonlocal current
+        if not current:
+            return
+        text = _cleanup_spacing(" ".join(w["word"] for w in current))
+        # A group whose cleaned text is empty is discarded, not emitted.
+        if text:
+            segments.append({"start": current[0]["start"], "end": current[-1]["end"], "text": text})
+        current = []
+    for word in word_segments:
+        if current:
+            # A long pause closes the running group before this word is added.
+            gap = float(word["start"]) - float(current[-1]["end"])
+            if gap > gap_s:
+                flush()
+        current.append(word)
+        token = word["word"]
+        is_strong = bool(_STRONG_PUNCT_RE.search(token))
+        is_soft = bool(_SOFT_PUNCT_RE.search(token))
+        # Strong punctuation always ends the group, whatever its length.
+        if is_strong:
+            flush()
+            continue
+        # Cap the group length.
+        if len(current) >= max_words:
+            flush()
+            continue
+        # Soft punctuation ends the group only once it has at least 4 words,
+        # so very short fragments are not split off.
+        if is_soft and len(current) >= 4:
+            flush()
+    # Emit whatever is left after the last word.
+    flush()
+    return segments
 @app.post("/align")
+async def align_audio(
+    background_tasks: BackgroundTasks,
+    audio_file: UploadFile = File(...),
+    srt_mode: str = Form("paragraph"),
+):
     if model is None or model_a is None or metadata is None:
         raise HTTPException(status_code=503, detail="WhisperX models are not ready")
     temp_dir = tempfile.mkdtemp(prefix="lyric-sync-")
     try:
+        if srt_mode not in ("paragraph", "sentence"):
+            raise HTTPException(status_code=400, detail="Invalid srt_mode (expected 'paragraph' or 'sentence')")
         source_name = audio_file.filename or "audio"
         audio_path = os.path.join(temp_dir, source_name)
         with open(audio_path, "wb") as f:
         result = _transcribe_with_compat(model, audio_path)
         result = _align_with_compat(result["segments"], audio_path)
+        word_segments = _extract_word_segments(result["segments"])
+        if srt_mode == "sentence":
+            srt_segments = _sentence_segments_from_words(word_segments)
+        else:
+            srt_segments = _paragraph_segments_from_aligned(result["segments"])
         srt_path = os.path.join(temp_dir, f"{Path(source_name).stem}.srt")
         with open(srt_path, "w", encoding="utf-8") as srt_file:
+            _write_srt_file(srt_segments, srt_file)
         background_tasks.add_task(_cleanup_temp_dir, temp_dir)