bichnhan2701 committed on
Commit
5399362
·
1 Parent(s): 643318e

update vad logic for chunks

Browse files
Files changed (3) hide show
  1. app/core/asr_engine.py +23 -1
  2. app/core/chunking.py +103 -0
  3. requirements.txt +2 -1
app/core/asr_engine.py CHANGED
@@ -108,7 +108,14 @@ def transcribe_long_audio(
108
  if not wav_path:
109
  return "", []
110
 
111
- chunk_paths = split_audio_to_chunks(wav_path, chunk_length_s=chunk_length_s, overlap_s=overlap_s)
 
 
 
 
 
 
 
112
  combined_text_parts = []
113
  combined_chunks: List[Dict] = []
114
 
@@ -117,6 +124,14 @@ def transcribe_long_audio(
117
  for i, cp in enumerate(chunk_paths):
118
  base_offset = i * step
119
 
 
 
 
 
 
 
 
 
120
  try:
121
  out = model(
122
  cp,
@@ -128,6 +143,13 @@ def transcribe_long_audio(
128
  logger.exception("model inference failed for chunk %s", cp)
129
  continue
130
 
 
 
 
 
 
 
 
131
  part_text = (out.get("text") or "").strip()
132
  if not part_text:
133
  segs = out.get("chunks") or out.get("segments") or []
 
108
  if not wav_path:
109
  return "", []
110
 
111
+ # prefer VAD-based splitting if available
112
+ try:
113
+ from app.core.chunking import split_audio_with_vad
114
+ chunk_paths = split_audio_with_vad(wav_path)
115
+ except Exception:
116
+ chunk_paths = split_audio_to_chunks(wav_path, chunk_length_s=chunk_length_s, overlap_s=overlap_s)
117
+
118
+ logger.debug("transcribe_long_audio: split into %d chunk_paths", len(chunk_paths))
119
  combined_text_parts = []
120
  combined_chunks: List[Dict] = []
121
 
 
124
  for i, cp in enumerate(chunk_paths):
125
  base_offset = i * step
126
 
127
+ try:
128
+ cinfo = get_audio_info(cp) or {}
129
+ logger.debug(
130
+ "chunk[%d]=%s duration=%.3fs samplerate=%s", i, cp, cinfo.get("duration"), cinfo.get("samplerate")
131
+ )
132
+ except Exception:
133
+ logger.debug("chunk[%d]=%s (info unavailable)", i, cp)
134
+
135
  try:
136
  out = model(
137
  cp,
 
143
  logger.exception("model inference failed for chunk %s", cp)
144
  continue
145
 
146
+ # debug: log output shape/keys (only first few chunks to avoid huge logs)
147
+ try:
148
+ if i < 5:
149
+ logger.debug("model out keys for chunk[%d]: %s", i, list(out.keys()) if isinstance(out, dict) else type(out))
150
+ except Exception:
151
+ logger.debug("failed to log model out keys for chunk %d", i)
152
+
153
  part_text = (out.get("text") or "").strip()
154
  if not part_text:
155
  segs = out.get("chunks") or out.get("segments") or []
app/core/chunking.py CHANGED
@@ -4,6 +4,15 @@ import shlex
4
  import subprocess
5
  from typing import List
6
  from app.core.audio_utils import get_audio_info, make_temp_path
 
 
 
 
 
 
 
 
 
7
 
8
  def ffmpeg_extract_segment(src: str, start: float, duration: float, dst: str):
9
  """
@@ -34,3 +43,97 @@ def split_audio_to_chunks(src_wav: str, chunk_length_s: float = 30.0, overlap_s:
34
  ffmpeg_extract_segment(src_wav, s, min(chunk_length_s, duration - s), chunk_path)
35
  chunks.append(chunk_path)
36
  return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import subprocess
5
  from typing import List
6
  from app.core.audio_utils import get_audio_info, make_temp_path
7
+ import soundfile as sf
8
+ import numpy as np
9
+
10
+ # optional webrtcvad for speech-based splitting
11
+ try:
12
+ import webrtcvad
13
+ _HAS_VAD = True
14
+ except Exception:
15
+ _HAS_VAD = False
16
 
17
  def ffmpeg_extract_segment(src: str, start: float, duration: float, dst: str):
18
  """
 
43
  ffmpeg_extract_segment(src_wav, s, min(chunk_length_s, duration - s), chunk_path)
44
  chunks.append(chunk_path)
45
  return chunks
46
+
47
+
48
def _group_speech_runs(speech_flags):
    """Collapse per-frame speech flags into inclusive (first, last) frame-index runs."""
    runs = []
    run_start = None
    for idx, is_speech in enumerate(speech_flags):
        if is_speech and run_start is None:
            run_start = idx
        elif not is_speech and run_start is not None:
            runs.append((run_start, idx - 1))
            run_start = None
    if run_start is not None:
        # audio ended while still inside a speech run
        runs.append((run_start, len(speech_flags) - 1))
    return runs


def _merge_close_segments(segments, max_gap_frames):
    """Join consecutive runs when (next start - previous end) <= max_gap_frames."""
    merged = []
    for first, last in segments:
        if merged and first - merged[-1][1] <= max_gap_frames:
            merged[-1] = (merged[-1][0], last)
        else:
            merged.append((first, last))
    return merged


def split_audio_with_vad(
    src_wav: str,
    aggressiveness: int = 2,
    frame_ms: int = 30,
    padding_ms: int = 300,
) -> List[str]:
    """
    Split audio into speech-only chunks using webrtcvad and return the chunk file paths.

    Args:
        src_wav: path of the WAV file to split.
        aggressiveness: mode passed straight to ``webrtcvad.Vad``.
        frame_ms: analysis frame length in milliseconds. webrtcvad only accepts
            10, 20 or 30; any other value uses the fixed-window fallback.
        padding_ms: two speech runs are merged when the distance between them is
            at most ``int(padding_ms / frame_ms)`` frames. Despite the name, no
            padding is added around the extracted segments.

    Returns:
        Paths of temp WAV files, one per merged speech segment, in time order.
        Falls back to ``split_audio_to_chunks(src_wav)`` when webrtcvad is not
        installed, the audio is not 16 kHz mono, ``frame_ms`` is unsupported, or
        no speech is detected.

    Raises:
        RuntimeError: if ``get_audio_info`` returns nothing for ``src_wav``.

    NOTE(review): chunks are variable-length and non-contiguous, and their start
    times are not returned. Callers must not derive offsets from the chunk index.
    """
    if not _HAS_VAD:
        return split_audio_to_chunks(src_wav)

    info = get_audio_info(src_wav)
    if not info:
        raise RuntimeError("Cannot read audio info for VAD split")

    # `or 0` also covers keys that are present but None
    sr = int(info.get("samplerate") or 0)
    channels = int(info.get("channels") or 0)
    if sr != 16000 or channels != 1:
        # require 16k mono for webrtcvad reliability; fallback
        return split_audio_to_chunks(src_wav)

    if frame_ms not in (10, 20, 30):
        # webrtcvad rejects other frame durations; previously every frame raised,
        # was swallowed, and the function fell back only after reading the file
        return split_audio_to_chunks(src_wav)

    # read PCM samples as 16-bit integers (the sample format webrtcvad expects)
    data, _ = sf.read(src_wav, dtype="int16")
    if data.ndim > 1:
        data = data[:, 0]

    pcm_bytes = data.tobytes()
    vad = webrtcvad.Vad(aggressiveness)

    frame_size = int(sr * frame_ms / 1000)  # samples per frame
    frame_bytes = frame_size * 2  # 2 bytes per int16 sample
    total_frames = (len(pcm_bytes) + frame_bytes - 1) // frame_bytes

    speech_frames = []
    for i in range(total_frames):
        start = i * frame_bytes
        frame = pcm_bytes[start:start + frame_bytes]
        if len(frame) < frame_bytes:
            # zero-pad the trailing partial frame so webrtcvad accepts it
            frame = frame.ljust(frame_bytes, b"\x00")
        try:
            is_speech = bool(vad.is_speech(frame, sr))
        except Exception:
            # best-effort: a frame the VAD cannot classify counts as non-speech
            is_speech = False
        speech_frames.append(is_speech)

    # group contiguous speech frames, then merge runs separated by short gaps
    segments = _group_speech_runs(speech_frames)
    merged = _merge_close_segments(segments, int(padding_ms / frame_ms))

    # If VAD found nothing, fallback to fixed windows
    if not merged:
        return split_audio_to_chunks(src_wav)

    # convert frame indices to times and extract with ffmpeg
    chunks = []
    for i, (s_idx, e_idx) in enumerate(merged):
        start_s = s_idx * frame_ms / 1000.0
        dur = (e_idx - s_idx + 1) * frame_ms / 1000.0
        chunk_path = make_temp_path(suffix=f"_vad_chunk{i}.wav")
        ffmpeg_extract_segment(src_wav, start_s, dur, chunk_path)
        chunks.append(chunk_path)

    return chunks
requirements.txt CHANGED
@@ -14,4 +14,5 @@ google-generativeai
14
  google-genai
15
  numpy
16
  pytest
17
- cloudinary
 
 
14
  google-genai
15
  numpy
16
  pytest
17
+ cloudinary
18
+ webrtcvad