Commit bbf158e · Parent(s): 75ad4a0
Debug timeout
- app/api/transcribe.py +21 -9
- app/core/asr_engine.py +82 -75
- app/jobs/transcribe_job.py +21 -8
app/api/transcribe.py
CHANGED

@@ -29,6 +29,7 @@ from app.core.asr_engine import (
     load_model,
     transcribe_file,
    transcribe_file_chunks,
+    transcribe_file_unified,
 )
 
 router = APIRouter()
@@ -58,14 +59,26 @@ def _ensure_file_limits(path: str):
        raise HTTPException(413, "Audio duration exceeds limit")
 
 
-def _enqueue_async_job(audio_url: str, note_id: str, user_id: str | None = None):
+def _calculate_job_timeout(duration: float) -> int:
+    """
+    Calculate dynamic job timeout based on audio duration.
+    Formula: max(1800, duration * 3 + 300)
+    - Minimum 30 minutes
+    - ~3x realtime + 5 min buffer for long audio
+    """
+    return max(1800, int(duration * 3) + 300)
+
+
+def _enqueue_async_job(audio_url: str, note_id: str, user_id: str | None = None, duration: float = 0):
     q = Queue("asr", connection=redis_client)
+    job_timeout = _calculate_job_timeout(duration)
+    logger.info("[ASR] Enqueuing job for note=%s, duration=%.2fs, timeout=%ds", note_id, duration, job_timeout)
     return q.enqueue(
         transcribe_job,
         audio_url,
         note_id,
         user_id,
-        job_timeout=
+        job_timeout=job_timeout,
         retry=Retry(max=3, interval=[2, 5, 10]),
     )
 
@@ -74,16 +87,15 @@ def _enqueue_async_job(audio_url: str, note_id: str, user_id: str | None = None)
 async def _run_sync_pipeline(tmp_wav: str, note_id: str, audio_url: str | None = None):
     """
     Sync ASR → update existing note
+    🔥 FIX: Use unified function to avoid double inference
     """
     note_service = NoteServiceClient()
     info = get_audio_info(tmp_wav) or {}
 
     with ASR_DURATION.labels("/transcribe").time():
-        text = await asyncio.to_thread(
-            transcribe_file, ASR_MODEL, tmp_wav, 30.0, 5.0
-        )
-        chunks = await asyncio.to_thread(
-            transcribe_file_chunks, ASR_MODEL, tmp_wav, 30.0, 5.0
+        # 🔥 SINGLE INFERENCE - returns both text and chunks
+        text, chunks = await asyncio.to_thread(
+            transcribe_file_unified, ASR_MODEL, tmp_wav, 30.0, 5.0
         )
 
     chunks = [
@@ -190,7 +202,7 @@ async def transcribe(file: UploadFile = File(...)):
     audio_url = await asyncio.to_thread(upload_temp_audio, tmp_wav)
 
     await _create_placeholder_note(note_id, duration, audio_url)
-    job = _enqueue_async_job(audio_url, note_id)
+    job = _enqueue_async_job(audio_url, note_id, duration=duration)
 
     REQUEST_COUNT.labels(endpoint, "queued").inc()
     return JSONResponse(
@@ -252,7 +264,7 @@ async def transcribe_url(payload: dict):
    # ---------- ASYNC ----------
    if duration > ASYNC_THRESHOLD:
        await _create_placeholder_note(note_id, duration, audio_url)
-       job = _enqueue_async_job(audio_url, note_id, user_id)
+       job = _enqueue_async_job(audio_url, note_id, user_id, duration=duration)
 
        REQUEST_COUNT.labels(endpoint, "queued").inc()
        return JSONResponse(
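The new timeout formula is easy to sanity-check in isolation. A minimal sketch, with the function body copied from the diff above and illustrative durations:

# Copy of _calculate_job_timeout from the diff above, for illustration.
def _calculate_job_timeout(duration: float) -> int:
    # 30-minute floor, otherwise ~3x realtime plus a 5-minute buffer
    return max(1800, int(duration * 3) + 300)

# 5-minute clip: 300 * 3 + 300 = 1200 -> the 1800 s floor wins
assert _calculate_job_timeout(300.0) == 1800
# 90-minute recording: 5400 * 3 + 300 = 16500 s, scaling with duration
assert _calculate_job_timeout(5400.0) == 16500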
app/core/asr_engine.py
CHANGED

@@ -1,12 +1,12 @@
 import logging
-
+import time
+from typing import List, Dict, Tuple
 
 import torch
 from transformers import pipeline
 from transformers import logging as transformers_logging
 import warnings
 import os
-from typing import Tuple
 
 from app.core.chunking import split_audio_to_chunks
 from app.core.audio_utils import get_audio_info
@@ -60,50 +60,105 @@ def load_model(chunk_length_s: float = 30.0):
 # ===============================
 # Transcribe full text
 # ===============================
-def transcribe_file(
+def transcribe_file_unified(
     model,
     wav_path: str,
     chunk_length_s: float = 30.0,
     stride_s: float = 5.0,
-) -> str:
+) -> Tuple[str, List[Dict]]:
     """
-    Return full transcript text.
+    🔥 UNIFIED: Return both full transcript text AND timestamped chunks in ONE inference pass.
+    This avoids the costly double-inference that was causing timeouts.
+
+    Returns:
+        (text, chunks) where chunks = [{"start": float, "end": float, "text": str}, ...]
     """
     if not wav_path:
-        return ""
+        return "", []
+
+    start_time = time.time()
+    logger.info("[ASR] Starting unified transcription for %s", wav_path)
 
     # If audio is long, prefer chunked inference to avoid memory/time issues
     info = get_audio_info(wav_path) or {}
     duration = info.get("duration", 0)
+    logger.info("[ASR] Audio duration: %.2fs", duration)
+
     if duration and duration > chunk_length_s:
         try:
-            text, _ = transcribe_long_audio(
+            text, chunks = transcribe_long_audio(
                 model, wav_path, chunk_length_s=chunk_length_s, overlap_s=stride_s
             )
-            return text
+            elapsed = time.time() - start_time
+            logger.info("[ASR] Long audio transcription completed in %.2fs (%.2fx realtime)", elapsed, elapsed / duration if duration else 0)
+            return text, chunks
         except Exception:
            logger.exception("transcribe_long_audio failed, falling back to pipeline")
 
+    # Short audio: single pipeline call with timestamps
     out = model(
         wav_path,
         chunk_length_s=chunk_length_s,
         stride_length_s=stride_s,
-
+        return_timestamps=True,
     )
 
-    #
+    # Extract text
     text = (out.get("text") or "").strip()
-    if text:
-        return text
-
-    segs = out.get("chunks") or out.get("segments") or []
-    if segs:
-        parts = [ (s.get("text") or "").strip() for s in segs ]
-        joined = " ".join([p for p in parts if p])
-        return joined.strip()
-
-    return ""
+    if not text:
+        segs = out.get("chunks") or out.get("segments") or []
+        if segs:
+            parts = [(s.get("text") or "").strip() for s in segs]
+            text = " ".join([p for p in parts if p]).strip()
+
+    # Extract chunks with timestamps
+    chunks = _extract_chunks_from_output(out)
+
+    elapsed = time.time() - start_time
+    logger.info("[ASR] Short audio transcription completed in %.2fs", elapsed)
+
+    return text, chunks
+
+
+def _extract_chunks_from_output(out: dict) -> List[Dict]:
+    """Extract timestamped chunks from model output."""
+    raw_segments = out.get("chunks") or out.get("segments") or []
+    chunks = []
+
+    for c in raw_segments:
+        start = None
+        end = None
+
+        if isinstance(c.get("timestamp"), (list, tuple)) and len(c.get("timestamp")) >= 2:
+            ts = c.get("timestamp")
+            start, end = ts[0], ts[1]
+        elif c.get("start") is not None and c.get("end") is not None:
+            start, end = c.get("start"), c.get("end")
 
+        text = (c.get("text") or "").strip()
+        if not text or start is None or end is None:
+            continue
 
+        try:
+            chunks.append({"start": float(start), "end": float(end), "text": text})
+        except Exception:
+            continue
+
+    return chunks
+
+
+def transcribe_file(
+    model,
+    wav_path: str,
+    chunk_length_s: float = 30.0,
+    stride_s: float = 5.0,
+) -> str:
+    """
+    Return full transcript text.
+    ⚠️ DEPRECATED: Use transcribe_file_unified() to get both text and chunks in one pass.
+    """
+    text, _ = transcribe_file_unified(model, wav_path, chunk_length_s, stride_s)
+    return text
 
 
 def transcribe_long_audio(
@@ -119,14 +174,17 @@ def transcribe_long_audio(
     if not wav_path:
         return "", []
 
+    split_start = time.time()
+
     # prefer VAD-based splitting if available
     try:
         from app.core.chunking import split_audio_with_vad
         chunk_paths = split_audio_with_vad(wav_path)
-    except Exception:
+        logger.info("[ASR] VAD split into %d chunks in %.2fs", len(chunk_paths), time.time() - split_start)
+    except Exception as e:
+        logger.warning("[ASR] VAD split failed (%s), using fixed windows", e)
         chunk_paths = split_audio_to_chunks(wav_path, chunk_length_s=chunk_length_s, overlap_s=overlap_s)
-
-    logger.debug("transcribe_long_audio: split into %d chunk_paths", len(chunk_paths))
+        logger.info("[ASR] Fixed-window split into %d chunks in %.2fs", len(chunk_paths), time.time() - split_start)
     combined_text_parts = []
     combined_chunks: List[Dict] = []
 
@@ -230,58 +288,7 @@ def transcribe_file_chunks(
     """
     Return list of chunks:
     [{ start, end, text }]
+    ⚠️ DEPRECATED: Use transcribe_file_unified() to get both text and chunks in one pass.
     """
-    if not wav_path:
-        return []
-    # For long audio prefer explicit chunked inference (split + per-chunk inference)
-    info = get_audio_info(wav_path) or {}
-    duration = info.get("duration", 0)
-    if duration and duration > chunk_length_s:
-        try:
-            _, combined = transcribe_long_audio(
-                model, wav_path, chunk_length_s=chunk_length_s, overlap_s=stride_s
-            )
-            return combined
-        except Exception:
-            logger.exception("transcribe_long_audio failed in transcribe_file_chunks, falling back to pipeline")
-
-    out = model(
-        wav_path,
-        chunk_length_s=chunk_length_s,
-        stride_length_s=stride_s,
-        return_timestamps=True,
-    )
-
-    # Pipeline output can vary across transformers versions/models:
-    # - some return `chunks` (with `timestamp` list),
-    # - others return `segments` (with `start`/`end`),
-    # so be permissive and handle both shapes.
-    raw_segments = out.get("chunks") or out.get("segments") or []
-
-    chunks = []
-    for c in raw_segments:
-        # try multiple timestamp shapes
-        start = None
-        end = None
-
-        if isinstance(c.get("timestamp"), (list, tuple)) and len(c.get("timestamp")) >= 2:
-            ts = c.get("timestamp")
-            start, end = ts[0], ts[1]
-        elif c.get("start") is not None and c.get("end") is not None:
-            start, end = c.get("start"), c.get("end")
-
-        text = (c.get("text") or "").strip()
-        if not text:
-            continue
-
-        # If timestamps are missing, skip (we don't want chunks without timing)
-        if start is None or end is None:
-            continue
-
-        try:
-            chunks.append({"start": float(start), "end": float(end), "text": text})
-        except Exception:
-            continue
-
+    _, chunks = transcribe_file_unified(model, wav_path, chunk_length_s, stride_s)
    return chunks
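For reference, _extract_chunks_from_output accepts both output shapes that transformers pipelines can produce. A minimal sketch with made-up sample outputs (the helper itself is defined in the diff above):

# Made-up samples of the two shapes handled by _extract_chunks_from_output:
# Whisper-style "chunks" with a (start, end) timestamp tuple, and
# segment-style dicts with explicit start/end keys.
out_with_chunks = {"chunks": [{"timestamp": (0.0, 4.2), "text": " Hello there."}]}
out_with_segments = {"segments": [{"start": 0.0, "end": 4.2, "text": "Hello there."}]}

for out in (out_with_chunks, out_with_segments):
    print(_extract_chunks_from_output(out))
    # both print: [{'start': 0.0, 'end': 4.2, 'text': 'Hello there.'}]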
app/jobs/transcribe_job.py
CHANGED

@@ -6,11 +6,13 @@ import requests
 import httpx
 import time
 
-from app.core.asr_engine import load_model, transcribe_file, transcribe_file_chunks
+from app.core.asr_engine import load_model, transcribe_file, transcribe_file_chunks, transcribe_file_unified
 from app.services.note_client import NoteServiceClient
 from app.core.audio_utils import get_audio_info
 from app.core.audio_utils import ensure_wav_16k_mono, make_temp_path
 
+logger = logging.getLogger(__name__)
+
 def run_async(coro):
     try:
         loop = asyncio.get_running_loop()
@@ -35,19 +37,28 @@ def download_audio(audio_url: str) -> str:
 
 
 def transcribe_job(audio_url: str, note_id: str, user_id: str | None = None):
+    job_start = time.time()
+    logger.info("[JOB] Starting transcribe_job for note=%s, url=%s", note_id, audio_url)
+
     model = load_model()
     wav_path = None
 
     try:
         # 1️⃣ Download audio
+        download_start = time.time()
         wav_path = download_audio(audio_url)
+        logger.info("[JOB] Downloaded audio in %.2fs", time.time() - download_start)
 
         # Ensure WAV is 16k mono for consistent chunking and ASR behavior
         try:
             info = get_audio_info(wav_path) or {}
+            logger.info("[JOB] Audio info: duration=%.2fs, samplerate=%s, channels=%s",
+                        info.get("duration", 0), info.get("samplerate"), info.get("channels"))
             if info.get("samplerate") != 16000 or info.get("channels") != 1:
+                convert_start = time.time()
                 tmp_wav = make_temp_path(suffix=".wav")
                 ensure_wav_16k_mono(wav_path, tmp_wav)
+                logger.info("[JOB] Converted to 16k mono in %.2fs", time.time() - convert_start)
                 # replace wav_path with converted file and remove original
                 try:
                     os.remove(wav_path)
@@ -55,11 +66,12 @@ def transcribe_job(audio_url: str, note_id: str, user_id: str | None = None):
                     pass
                wav_path = tmp_wav
         except Exception:
-            pass
+            logger.exception("Failed to ensure wav format for %s", wav_path)
 
-        # 2️⃣ ASR
-        text = transcribe_file(model, wav_path, 30.0, 5.0)
-        chunks = transcribe_file_chunks(model, wav_path, 30.0, 5.0)
+        # 2️⃣ ASR - 🔥 SINGLE INFERENCE using unified function
+        asr_start = time.time()
+        text, chunks = transcribe_file_unified(model, wav_path, 30.0, 5.0)
+        logger.info("[JOB] ASR completed in %.2fs", time.time() - asr_start)
 
         # normalize chunks list
         chunks = [
@@ -75,7 +87,7 @@ def transcribe_job(audio_url: str, note_id: str, user_id: str | None = None):
                duration = info.get("duration") or 0.0
                chunks = [{"text": text.strip(), "start": 0.0, "end": float(duration)}]
            except Exception:
-               pass
+               logger.exception("failed to create fallback chunk for note %s", note_id)
 
        # Consider transcribed if we have either timestamped chunks or non-empty text
        note_status = "transcribed" if (chunks or (text and text.strip())) else "error"
@@ -105,7 +117,7 @@ def transcribe_job(audio_url: str, note_id: str, user_id: str | None = None):
        try:
            payload["metadata"]["audio"]["url"] = audio_url
        except Exception:
-           pass
+           logger.exception("Failed to attach audio_url to payload for note %s", note_id)
 
        generate_tasks = (
            ["normalize", "keywords", "summary", "mindmap"]
@@ -128,9 +140,10 @@ def transcribe_job(audio_url: str, note_id: str, user_id: str | None = None):
                },
            )
        )
+       logger.info("[JOB] Completed note=%s in %.2fs, status=%s", note_id, time.time() - job_start, note_status)
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 404:
-           logging.warning(
+           logger.warning(
               "Note not found on update, will retry later note_id=%s",
               note_id,
           )