Spaces:

bichnhan2701
/

PhoWhisperBaseAPI

Sleeping

App Files Files Community

bichnhan2701 committed on Dec 21, 2025

Commit

5399362

1 Parent(s): 643318e

update vad logic for chunks

Browse files

Files changed (3) hide show

app/core/asr_engine.py +23 -1
app/core/chunking.py +103 -0
requirements.txt +2 -1

app/core/asr_engine.py CHANGED Viewed

@@ -108,7 +108,14 @@ def transcribe_long_audio(
     if not wav_path:
         return "", []
-    chunk_paths = split_audio_to_chunks(wav_path, chunk_length_s=chunk_length_s, overlap_s=overlap_s)
     combined_text_parts = []
     combined_chunks: List[Dict] = []
@@ -117,6 +124,14 @@ def transcribe_long_audio(
         for i, cp in enumerate(chunk_paths):
             base_offset = i * step
             try:
                 out = model(
                     cp,
@@ -128,6 +143,13 @@ def transcribe_long_audio(
                 logger.exception("model inference failed for chunk %s", cp)
                 continue
             part_text = (out.get("text") or "").strip()
             if not part_text:
                 segs = out.get("chunks") or out.get("segments") or []

     if not wav_path:
         return "", []
+    # prefer VAD-based splitting if available
+    try:
+        from app.core.chunking import split_audio_with_vad
+        chunk_paths = split_audio_with_vad(wav_path)
+    except Exception:
+        chunk_paths = split_audio_to_chunks(wav_path, chunk_length_s=chunk_length_s, overlap_s=overlap_s)
+    logger.debug("transcribe_long_audio: split into %d chunk_paths", len(chunk_paths))
     combined_text_parts = []
     combined_chunks: List[Dict] = []
         for i, cp in enumerate(chunk_paths):
             base_offset = i * step
+            try:
+                cinfo = get_audio_info(cp) or {}
+                logger.debug(
+                    "chunk[%d]=%s duration=%.3fs samplerate=%s", i, cp, cinfo.get("duration"), cinfo.get("samplerate")
+                )
+            except Exception:
+                logger.debug("chunk[%d]=%s (info unavailable)", i, cp)
             try:
                 out = model(
                     cp,
                 logger.exception("model inference failed for chunk %s", cp)
                 continue
+            # debug: log output shape/keys (only first few chunks to avoid huge logs)
+            try:
+                if i < 5:
+                    logger.debug("model out keys for chunk[%d]: %s", i, list(out.keys()) if isinstance(out, dict) else type(out))
+            except Exception:
+                logger.debug("failed to log model out keys for chunk %d", i)
             part_text = (out.get("text") or "").strip()
             if not part_text:
                 segs = out.get("chunks") or out.get("segments") or []

app/core/chunking.py CHANGED Viewed

@@ -4,6 +4,15 @@ import shlex
 import subprocess
 from typing import List
 from app.core.audio_utils import get_audio_info, make_temp_path
 def ffmpeg_extract_segment(src: str, start: float, duration: float, dst: str):
     """
@@ -34,3 +43,97 @@ def split_audio_to_chunks(src_wav: str, chunk_length_s: float = 30.0, overlap_s:
         ffmpeg_extract_segment(src_wav, s, min(chunk_length_s, duration - s), chunk_path)
         chunks.append(chunk_path)
     return chunks

 import subprocess
 from typing import List
 from app.core.audio_utils import get_audio_info, make_temp_path
+import soundfile as sf
+import numpy as np
+# optional webrtcvad for speech-based splitting
+try:
+    import webrtcvad
+    _HAS_VAD = True
+except Exception:
+    _HAS_VAD = False
 def ffmpeg_extract_segment(src: str, start: float, duration: float, dst: str):
     """
         ffmpeg_extract_segment(src_wav, s, min(chunk_length_s, duration - s), chunk_path)
         chunks.append(chunk_path)
     return chunks
+def split_audio_with_vad(
+    src_wav: str,
+    aggressiveness: int = 2,
+    frame_ms: int = 30,
+    padding_ms: int = 300,
+) -> List[str]:
+    """
+    Split audio into speech-only chunks using webrtcvad.
+
+    The file is classified frame by frame, consecutive speech frames are
+    grouped into segments, neighbouring segments separated by a short
+    non-speech gap are merged, and every merged segment is extracted to its
+    own temp WAV with ``ffmpeg_extract_segment``.
+
+    Args:
+        src_wav: path of the WAV file to split.
+        aggressiveness: mode passed to ``webrtcvad.Vad``.
+        frame_ms: analysis frame length in milliseconds; webrtcvad only
+            accepts 10, 20 or 30.
+        padding_ms: merge threshold. Non-speech gaps shorter than this
+            (rounded down to whole frames) are bridged. No extra audio is
+            added around the resulting segments.
+
+    Returns:
+        Chunk file paths in chronological order. The result of
+        ``split_audio_to_chunks(src_wav)`` is returned instead when webrtcvad
+        is not installed, ``frame_ms`` is unsupported, the audio is not
+        16 kHz mono, or no speech frame is detected.
+
+    Raises:
+        RuntimeError: if ``get_audio_info`` returns nothing for ``src_wav``.
+    """
+    if not _HAS_VAD:
+        return split_audio_to_chunks(src_wav)
+    # webrtcvad rejects every other frame length. Without this guard each
+    # rejected frame was silently counted as non-speech after the whole file
+    # had been read and scanned, and frame_ms == 0 divided by zero.
+    if frame_ms not in (10, 20, 30):
+        return split_audio_to_chunks(src_wav)
+    info = get_audio_info(src_wav)
+    if not info:
+        raise RuntimeError("Cannot read audio info for VAD split")
+    sr = int(info.get("samplerate", 0))
+    channels = int(info.get("channels", 0))
+    if sr != 16000 or channels != 1:
+        # require 16k mono for webrtcvad reliability; fallback
+        return split_audio_to_chunks(src_wav)
+    # read PCM samples as 16-bit integers, the sample format handed to the VAD
+    data, _ = sf.read(src_wav, dtype="int16")
+    if data.ndim > 1:
+        data = data[:, 0]
+    pcm_bytes = data.tobytes()
+    vad = webrtcvad.Vad(aggressiveness)
+    frame_bytes = int(sr * frame_ms / 1000) * 2  # samples per frame * 2 bytes each
+    speech_frames = []
+    for start in range(0, len(pcm_bytes), frame_bytes):
+        # zero-pad the trailing partial frame so it can still be classified
+        frame = pcm_bytes[start:start + frame_bytes].ljust(frame_bytes, b"\x00")
+        try:
+            speech_frames.append(bool(vad.is_speech(frame, sr)))
+        except Exception:
+            # best effort: a frame the VAD cannot process counts as non-speech
+            speech_frames.append(False)
+    # group contiguous speech frames into (first_frame, last_frame) segments
+    segments = []
+    seg_start = None
+    for idx, is_speech in enumerate(speech_frames):
+        if is_speech and seg_start is None:
+            seg_start = idx
+        elif not is_speech and seg_start is not None:
+            segments.append((seg_start, idx - 1))
+            seg_start = None
+    if seg_start is not None:
+        segments.append((seg_start, len(speech_frames) - 1))
+    # If VAD found nothing, fallback to fixed windows
+    if not segments:
+        return split_audio_to_chunks(src_wav)
+    # merge segments whose gap is smaller than padding_ms
+    pad_frames = int(padding_ms / frame_ms)
+    merged = [segments[0]]
+    for seg in segments[1:]:
+        prev = merged[-1]
+        # seg[0] - prev[1] is the gap length plus one, so "<=" means gap < pad_frames
+        if seg[0] - prev[1] <= pad_frames:
+            merged[-1] = (prev[0], seg[1])
+        else:
+            merged.append(seg)
+    # convert frame indices to times and extract with ffmpeg
+    chunks = []
+    for i, (s_idx, e_idx) in enumerate(merged):
+        start_s = s_idx * frame_ms / 1000.0
+        dur = (e_idx - s_idx + 1) * frame_ms / 1000.0
+        chunk_path = make_temp_path(suffix=f"_vad_chunk{i}.wav")
+        ffmpeg_extract_segment(src_wav, start_s, dur, chunk_path)
+        chunks.append(chunk_path)
+    return chunks

requirements.txt CHANGED Viewed

@@ -14,4 +14,5 @@ google-generativeai
 google-genai
 numpy
 pytest
-cloudinary

 google-genai
 numpy
 pytest
+cloudinary
+webrtcvad