bichnhan2701 committed on
Commit
580c470
·
1 Parent(s): 7d9eaeb
Files changed (1) hide show
  1. app/core/asr_engine.py +105 -0
app/core/asr_engine.py CHANGED
@@ -3,6 +3,11 @@ from typing import List, Dict
3
 
4
  import torch
5
  from transformers import pipeline
 
 
 
 
 
6
 
7
  logger = logging.getLogger(__name__)
8
 
@@ -56,6 +61,18 @@ def transcribe_file(
56
  if not wav_path:
57
  return ""
58
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  out = model(
60
  wav_path,
61
  chunk_length_s=chunk_length_s,
@@ -78,6 +95,81 @@ def transcribe_file(
78
  return ""
79
 
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  # ===============================
82
  # Transcribe chunks with timestamps
83
  # ===============================
@@ -132,4 +224,17 @@ def transcribe_file_chunks(
132
  # be robust against unexpected types
133
  continue
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  return chunks
 
3
 
4
  import torch
5
  from transformers import pipeline
6
+ import os
7
+ from typing import Tuple
8
+
9
+ from app.core.chunking import split_audio_to_chunks
10
+ from app.core.audio_utils import get_audio_info
11
 
12
  logger = logging.getLogger(__name__)
13
 
 
61
  if not wav_path:
62
  return ""
63
 
64
+ # If audio is long, prefer chunked inference to avoid memory/time issues
65
+ info = get_audio_info(wav_path) or {}
66
+ duration = info.get("duration", 0)
67
+ if duration and duration > chunk_length_s:
68
+ try:
69
+ text, _chunks = transcribe_long_audio(
70
+ model, wav_path, chunk_length_s=chunk_length_s, overlap_s=stride_s
71
+ )
72
+ return text
73
+ except Exception:
74
+ logger.exception("transcribe_long_audio failed, falling back to pipeline")
75
+
76
  out = model(
77
  wav_path,
78
  chunk_length_s=chunk_length_s,
 
95
  return ""
96
 
97
 
98
+ def transcribe_long_audio(
99
+ model,
100
+ wav_path: str,
101
+ chunk_length_s: float = 30.0,
102
+ overlap_s: float = 5.0,
103
+ ) -> Tuple[str, List[Dict]]:
104
+ """
105
+ Split `wav_path` into chunks and run inference on each chunk sequentially.
106
+ Returns (full_text, chunks) where chunks have global start/end timestamps.
107
+ """
108
+ if not wav_path:
109
+ return "", []
110
+
111
+ chunk_paths = split_audio_to_chunks(wav_path, chunk_length_s=chunk_length_s, overlap_s=overlap_s)
112
+ combined_text_parts = []
113
+ combined_chunks: List[Dict] = []
114
+
115
+ step = chunk_length_s - overlap_s
116
+ try:
117
+ for i, cp in enumerate(chunk_paths):
118
+ base_offset = i * step
119
+
120
+ try:
121
+ out = model(
122
+ cp,
123
+ chunk_length_s=chunk_length_s,
124
+ stride_length_s=overlap_s,
125
+ return_timestamps=True,
126
+ )
127
+ except Exception:
128
+ logger.exception("model inference failed for chunk %s", cp)
129
+ continue
130
+
131
+ part_text = (out.get("text") or "").strip()
132
+ if not part_text:
133
+ segs = out.get("chunks") or out.get("segments") or []
134
+ parts = [ (s.get("text") or "").strip() for s in segs ]
135
+ part_text = " ".join([p for p in parts if p]).strip()
136
+
137
+ if part_text:
138
+ combined_text_parts.append(part_text)
139
+
140
+ raw_segs = out.get("chunks") or out.get("segments") or []
141
+ for s in raw_segs:
142
+ start = None
143
+ end = None
144
+ if isinstance(s.get("timestamp"), (list, tuple)) and len(s.get("timestamp")) >= 2:
145
+ ts = s.get("timestamp")
146
+ start, end = ts[0], ts[1]
147
+ elif s.get("start") is not None and s.get("end") is not None:
148
+ start, end = s.get("start"), s.get("end")
149
+
150
+ text = (s.get("text") or "").strip()
151
+ if not text or start is None or end is None:
152
+ continue
153
+
154
+ try:
155
+ combined_chunks.append(
156
+ {"start": float(start) + base_offset, "end": float(end) + base_offset, "text": text}
157
+ )
158
+ except Exception:
159
+ continue
160
+
161
+ finally:
162
+ for p in chunk_paths:
163
+ try:
164
+ if p and os.path.exists(p):
165
+ os.remove(p)
166
+ except Exception:
167
+ logger.debug("Failed to remove chunk file %s", p)
168
+
169
+ full_text = " ".join([p for p in combined_text_parts if p]).strip()
170
+ return full_text, combined_chunks
171
+
172
+
173
  # ===============================
174
  # Transcribe chunks with timestamps
175
  # ===============================
 
224
  # be robust against unexpected types
225
  continue
226
 
227
+ # If no timestamped chunks found and file is long, try chunked inference
228
+ if not chunks:
229
+ info = get_audio_info(wav_path) or {}
230
+ duration = info.get("duration", 0)
231
+ if duration and duration > chunk_length_s:
232
+ try:
233
+ _, combined = transcribe_long_audio(
234
+ model, wav_path, chunk_length_s=chunk_length_s, overlap_s=stride_s
235
+ )
236
+ return combined
237
+ except Exception:
238
+ logger.exception("transcribe_long_audio fallback failed for %s", wav_path)
239
+
240
  return chunks