Ytgetahun Claude Sonnet 4.6 committed on
Commit
0bae6d8
·
1 Parent(s): 03e67a0

refactor(gaps): replace Whisper with Deepgram Nova-3 + FFmpeg silencedetect

Browse files

Speech detection now uses the Deepgram Nova-3 REST API for accurate word
timestamps (~50ms precision, ~30s for a 90-min film). Gap type classification
(silence vs music_only) is retained via FFmpeg silencedetect — no model
downloads required. Removes the openai-whisper dependency and the --whisper-model
flag. DEEPGRAM_API_KEY is now required in the environment.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (4) hide show
  1. cli/pyproject.toml +0 -1
  2. cli/vn/compliance.py +1 -2
  3. cli/vn/gaps.py +58 -179
  4. cli/vn/main.py +3 -6
cli/pyproject.toml CHANGED
@@ -27,7 +27,6 @@ classifiers = [
27
  dependencies = [
28
  "ffmpeg-python>=0.2.0",
29
  "httpx>=0.27.0",
30
- "openai-whisper>=20231117",
31
  "typer>=0.12.0",
32
  "yt-dlp>=2024.8.6",
33
  ]
 
27
  dependencies = [
28
  "ffmpeg-python>=0.2.0",
29
  "httpx>=0.27.0",
 
30
  "typer>=0.12.0",
31
  "yt-dlp>=2024.8.6",
32
  ]
cli/vn/compliance.py CHANGED
@@ -51,11 +51,10 @@ class ComplianceReport:
51
 
52
  def analyze_compliance(
53
  source: Path,
54
- whisper_model: str = "base",
55
  min_gap: float = 2.0,
56
  ) -> ComplianceReport:
57
  """Score accessibility compliance using narration gaps from detect_gaps()."""
58
- gaps = detect_gaps(source, whisper_model=whisper_model, min_gap=min_gap)
59
  duration, _has_audio = probe_media(source.expanduser().resolve())
60
  coverage_percent = _coverage_percent(gaps, duration)
61
  max_unbroken_speech_sec = _max_unbroken_speech_stretch(gaps, duration)
 
51
 
52
  def analyze_compliance(
53
  source: Path,
 
54
  min_gap: float = 2.0,
55
  ) -> ComplianceReport:
56
  """Score accessibility compliance using narration gaps from detect_gaps()."""
57
+ gaps = detect_gaps(source, min_gap=min_gap)
58
  duration, _has_audio = probe_media(source.expanduser().resolve())
59
  coverage_percent = _coverage_percent(gaps, duration)
60
  max_unbroken_speech_sec = _max_unbroken_speech_stretch(gaps, duration)
cli/vn/gaps.py CHANGED
@@ -1,8 +1,7 @@
1
  from __future__ import annotations
2
 
3
- import os
4
  import json
5
- import importlib
6
  import re
7
  import shutil
8
  import subprocess
@@ -11,12 +10,16 @@ from dataclasses import dataclass
11
  from pathlib import Path
12
  from typing import Any, Iterable
13
 
 
 
14
  from .output import GapResult
15
 
16
 
17
  SILENCE_START_RE = re.compile(r"silence_start:\s*(?P<seconds>\d+(?:\.\d+)?)")
18
  SILENCE_END_RE = re.compile(r"silence_end:\s*(?P<seconds>\d+(?:\.\d+)?)")
19
 
 
 
20
 
21
  class GapDetectionError(RuntimeError):
22
  """Raised when narration gaps cannot be detected."""
@@ -32,7 +35,7 @@ class Interval:
32
  return max(0.0, self.end - self.start)
33
 
34
 
35
- def detect_gaps(source: Path, whisper_model: str = "base", min_gap: float = 2.0) -> list[GapResult]:
36
  source = source.expanduser().resolve()
37
  if min_gap <= 0:
38
  raise GapDetectionError("--min-gap must be greater than 0")
@@ -55,10 +58,8 @@ def detect_gaps(source: Path, whisper_model: str = "base", min_gap: float = 2.0)
55
  audio_path = tmp_path / "audio.wav"
56
  _extract_audio(source, audio_path)
57
  silences = _detect_silences(source, duration, min_gap)
58
- transcription = _transcribe_audio(audio_path, whisper_model, _whisper_model_dir())
59
 
60
- words = _collect_words(transcription)
61
- segments = _collect_segments(transcription)
62
  candidates = _build_candidates(words, duration)
63
  if not candidates and duration >= min_gap:
64
  candidates = [Interval(0.0, duration)]
@@ -67,7 +68,7 @@ def detect_gaps(source: Path, whisper_model: str = "base", min_gap: float = 2.0)
67
  for candidate in candidates:
68
  if candidate.duration < min_gap:
69
  continue
70
- gap_type = _classify_gap(candidate, silences, segments)
71
  gaps.append(
72
  GapResult(
73
  start_sec=candidate.start,
@@ -90,10 +91,8 @@ def _probe_media(source: Path) -> tuple[float, bool]:
90
  completed = subprocess.run(
91
  [
92
  "ffprobe",
93
- "-v",
94
- "error",
95
- "-print_format",
96
- "json",
97
  "-show_format",
98
  "-show_streams",
99
  str(source),
@@ -139,19 +138,9 @@ def _extract_audio(source: Path, output_path: Path) -> None:
139
  try:
140
  subprocess.run(
141
  [
142
- "ffmpeg",
143
- "-hide_banner",
144
- "-loglevel",
145
- "error",
146
- "-i",
147
- str(source),
148
- "-vn",
149
- "-ac",
150
- "1",
151
- "-ar",
152
- "16000",
153
- "-acodec",
154
- "pcm_s16le",
155
  str(output_path),
156
  ],
157
  check=True,
@@ -170,16 +159,10 @@ def _detect_silences(source: Path, duration: float, min_gap: float) -> list[Inte
170
  silence_floor = "30dB"
171
  silence_duration = max(0.25, min(0.75, min_gap / 2))
172
  command = [
173
- "ffmpeg",
174
- "-hide_banner",
175
- "-nostats",
176
- "-i",
177
- str(source),
178
- "-af",
179
- f"silencedetect=noise=-{silence_floor}:d={silence_duration}",
180
- "-f",
181
- "null",
182
- "-",
183
  ]
184
 
185
  try:
@@ -206,145 +189,56 @@ def _detect_silences(source: Path, duration: float, min_gap: float) -> list[Inte
206
  return _merge_intervals(intervals)
207
 
208
 
209
- def _whisper_model_dir() -> Path:
210
- model_dir = Path(
211
- os.getenv("VN_WHISPER_MODEL_DIR") or Path(tempfile.gettempdir()) / "vn-whisper-models"
212
- ).expanduser()
213
- model_dir.mkdir(parents=True, exist_ok=True)
214
- return model_dir
215
-
216
-
217
- def _transcribe_audio(audio_path: Path, whisper_model: str, model_dir: Path) -> dict[str, Any]:
218
- try:
219
- whisper = importlib.import_module("whisper")
220
- except ImportError as exc:
221
- return _transcribe_with_cli(audio_path, whisper_model, model_dir)
222
-
223
- try:
224
- model = whisper.load_model(whisper_model, download_root=str(model_dir))
225
- except Exception as exc: # noqa: BLE001
226
- raise GapDetectionError(f"failed to load Whisper model '{whisper_model}': {exc}") from exc
227
-
228
- try:
229
- import io
230
- import sys as _sys
231
- _old_stdout = _sys.stdout
232
- _sys.stdout = io.StringIO()
233
- try:
234
- result = model.transcribe(str(audio_path), word_timestamps=True, verbose=False)
235
- finally:
236
- _sys.stdout = _old_stdout
237
- return result
238
- except Exception as exc: # noqa: BLE001
239
- raise GapDetectionError(f"Whisper transcription failed: {exc}") from exc
240
-
241
-
242
- def _transcribe_with_cli(audio_path: Path, whisper_model: str, model_dir: Path) -> dict[str, Any]:
243
- whisper_bin = shutil.which("whisper")
244
- if whisper_bin is None:
245
  raise GapDetectionError(
246
- "Whisper is required for gap detection. Install openai-whisper or make the `whisper` CLI available."
247
  )
248
 
249
- with tempfile.TemporaryDirectory(prefix="vn-whisper-") as output_dir:
250
- try:
251
- subprocess.run(
252
- [
253
- whisper_bin,
254
- str(audio_path),
255
- "--model",
256
- whisper_model,
257
- "--output_format",
258
- "json",
259
- "--output_dir",
260
- output_dir,
261
- "--model_dir",
262
- str(model_dir),
263
- "--word_timestamps",
264
- "True",
265
- "--fp16",
266
- "False",
267
- "--verbose",
268
- "False",
269
- ],
270
- check=True,
271
- capture_output=True,
272
- text=True,
273
- )
274
- except subprocess.CalledProcessError as exc:
275
- stderr = (exc.stderr or exc.stdout or "").strip()
276
- raise GapDetectionError(f"Whisper CLI transcription failed: {stderr}") from exc
277
-
278
- json_path = Path(output_dir) / f"{audio_path.stem}.json"
279
- if not json_path.exists():
280
- raise GapDetectionError("Whisper CLI completed but did not produce a JSON transcript")
281
-
282
- try:
283
- return json.loads(json_path.read_text())
284
- except json.JSONDecodeError as exc:
285
- raise GapDetectionError(f"Whisper CLI produced invalid JSON: {exc}") from exc
286
-
287
-
288
- # Segments with no_speech_prob above this threshold are likely hallucinated
289
- # (gunshots, music, etc.) and are excluded from speech word collection.
290
- _NO_SPEECH_PROB_THRESHOLD = 0.35
291
-
292
- # Words with probability below this threshold inside a valid speech segment
293
- # are treated as hallucinated and excluded from candidate building.
294
- _WORD_PROB_THRESHOLD = 0.30
295
-
296
 
297
- def _collect_words(transcription: dict[str, Any]) -> list[Interval]:
298
  words: list[Interval] = []
299
- for segment in transcription.get("segments", []):
300
- # Skip segments Whisper itself flagged as likely non-speech
301
- no_speech_prob = segment.get("no_speech_prob")
302
- if no_speech_prob is not None:
303
- try:
304
- if float(no_speech_prob) > _NO_SPEECH_PROB_THRESHOLD:
305
- continue
306
- except (TypeError, ValueError):
307
- pass
308
- for word in segment.get("words", []) or []:
309
- start = word.get("start")
310
- end = word.get("end")
311
- prob = word.get("probability")
312
- if start is None or end is None:
313
- continue
314
- # Skip low-confidence words (hallucinations from non-speech audio)
315
- if prob is not None:
316
- try:
317
- if float(prob) < _WORD_PROB_THRESHOLD:
318
- continue
319
- except (TypeError, ValueError):
320
- pass
321
- try:
322
- words.append(Interval(start=float(start), end=float(end)))
323
- except (TypeError, ValueError):
324
- continue
325
- return sorted(words, key=lambda item: (item.start, item.end))
326
-
327
-
328
- def _collect_segments(transcription: dict[str, Any]) -> list[Interval]:
329
- segments: list[Interval] = []
330
- for segment in transcription.get("segments", []):
331
- # Exclude segments Whisper flagged as likely non-speech
332
- no_speech_prob = segment.get("no_speech_prob")
333
- if no_speech_prob is not None:
334
- try:
335
- if float(no_speech_prob) > _NO_SPEECH_PROB_THRESHOLD:
336
- continue
337
- except (TypeError, ValueError):
338
- pass
339
- start = segment.get("start")
340
- end = segment.get("end")
341
  if start is None or end is None:
342
  continue
343
  try:
344
- segments.append(Interval(start=float(start), end=float(end)))
345
  except (TypeError, ValueError):
346
  continue
347
- return sorted(segments, key=lambda item: (item.start, item.end))
348
 
349
 
350
  def _build_candidates(words: list[Interval], duration: float) -> list[Interval]:
@@ -367,18 +261,12 @@ def _build_candidates(words: list[Interval], duration: float) -> list[Interval]:
367
  return _merge_intervals(candidates)
368
 
369
 
370
- def _classify_gap(candidate: Interval, silences: list[Interval], segments: list[Interval]) -> str:
371
  if candidate.duration <= 0:
372
  return "silence"
373
-
374
  silence_overlap = _coverage(candidate, silences)
375
  if silence_overlap / candidate.duration >= 0.8:
376
  return "silence"
377
-
378
- for segment in segments:
379
- if candidate.start >= segment.start and candidate.end <= segment.end:
380
- return "speech"
381
-
382
  return "music_only"
383
 
384
 
@@ -405,12 +293,3 @@ def _merge_intervals(intervals: list[Interval]) -> list[Interval]:
405
  else:
406
  merged.append(interval)
407
  return merged
408
-
409
-
410
- def _decode_ffmpeg_error(exc: Exception) -> str:
411
- stderr = getattr(exc, "stderr", b"")
412
- stdout = getattr(exc, "stdout", b"")
413
- payload = stderr or stdout or b""
414
- if isinstance(payload, bytes):
415
- return payload.decode("utf-8", errors="replace").strip()
416
- return str(payload).strip()
 
1
  from __future__ import annotations
2
 
 
3
  import json
4
+ import os
5
  import re
6
  import shutil
7
  import subprocess
 
10
  from pathlib import Path
11
  from typing import Any, Iterable
12
 
13
+ import httpx
14
+
15
  from .output import GapResult
16
 
17
 
18
  SILENCE_START_RE = re.compile(r"silence_start:\s*(?P<seconds>\d+(?:\.\d+)?)")
19
  SILENCE_END_RE = re.compile(r"silence_end:\s*(?P<seconds>\d+(?:\.\d+)?)")
20
 
21
+ DEEPGRAM_URL = "https://api.deepgram.com/v1/listen"
22
+
23
 
24
  class GapDetectionError(RuntimeError):
25
  """Raised when narration gaps cannot be detected."""
 
35
  return max(0.0, self.end - self.start)
36
 
37
 
38
+ def detect_gaps(source: Path, min_gap: float = 2.0) -> list[GapResult]:
39
  source = source.expanduser().resolve()
40
  if min_gap <= 0:
41
  raise GapDetectionError("--min-gap must be greater than 0")
 
58
  audio_path = tmp_path / "audio.wav"
59
  _extract_audio(source, audio_path)
60
  silences = _detect_silences(source, duration, min_gap)
61
+ words = _transcribe_with_deepgram(audio_path)
62
 
 
 
63
  candidates = _build_candidates(words, duration)
64
  if not candidates and duration >= min_gap:
65
  candidates = [Interval(0.0, duration)]
 
68
  for candidate in candidates:
69
  if candidate.duration < min_gap:
70
  continue
71
+ gap_type = _classify_gap(candidate, silences)
72
  gaps.append(
73
  GapResult(
74
  start_sec=candidate.start,
 
91
  completed = subprocess.run(
92
  [
93
  "ffprobe",
94
+ "-v", "error",
95
+ "-print_format", "json",
 
 
96
  "-show_format",
97
  "-show_streams",
98
  str(source),
 
138
  try:
139
  subprocess.run(
140
  [
141
+ "ffmpeg", "-hide_banner", "-loglevel", "error",
142
+ "-i", str(source),
143
+ "-vn", "-ac", "1", "-ar", "16000", "-acodec", "pcm_s16le",
 
 
 
 
 
 
 
 
 
 
144
  str(output_path),
145
  ],
146
  check=True,
 
159
  silence_floor = "30dB"
160
  silence_duration = max(0.25, min(0.75, min_gap / 2))
161
  command = [
162
+ "ffmpeg", "-hide_banner", "-nostats",
163
+ "-i", str(source),
164
+ "-af", f"silencedetect=noise=-{silence_floor}:d={silence_duration}",
165
+ "-f", "null", "-",
 
 
 
 
 
 
166
  ]
167
 
168
  try:
 
189
  return _merge_intervals(intervals)
190
 
191
 
192
+ def _transcribe_with_deepgram(audio_path: Path) -> list[Interval]:
193
+ api_key = os.getenv("DEEPGRAM_API_KEY")
194
+ if not api_key:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  raise GapDetectionError(
196
+ "DEEPGRAM_API_KEY is not set. Get a free key at console.deepgram.com"
197
  )
198
 
199
+ try:
200
+ response = httpx.post(
201
+ DEEPGRAM_URL,
202
+ headers={
203
+ "Authorization": f"Token {api_key}",
204
+ "Content-Type": "audio/wav",
205
+ },
206
+ params={
207
+ "model": "nova-3",
208
+ "words": "true",
209
+ "punctuate": "false",
210
+ "smart_format": "false",
211
+ },
212
+ content=audio_path.read_bytes(),
213
+ timeout=60.0,
214
+ )
215
+ response.raise_for_status()
216
+ except httpx.HTTPStatusError as exc:
217
+ raise GapDetectionError(
218
+ f"Deepgram API error {exc.response.status_code}: {exc.response.text}"
219
+ ) from exc
220
+ except httpx.RequestError as exc:
221
+ raise GapDetectionError(f"Deepgram request failed: {exc}") from exc
222
+
223
+ words_raw = (
224
+ response.json()
225
+ .get("results", {})
226
+ .get("channels", [{}])[0]
227
+ .get("alternatives", [{}])[0]
228
+ .get("words", [])
229
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
 
231
  words: list[Interval] = []
232
+ for w in words_raw:
233
+ start = w.get("start")
234
+ end = w.get("end")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  if start is None or end is None:
236
  continue
237
  try:
238
+ words.append(Interval(start=float(start), end=float(end)))
239
  except (TypeError, ValueError):
240
  continue
241
+ return sorted(words, key=lambda item: (item.start, item.end))
242
 
243
 
244
  def _build_candidates(words: list[Interval], duration: float) -> list[Interval]:
 
261
  return _merge_intervals(candidates)
262
 
263
 
264
+ def _classify_gap(candidate: Interval, silences: list[Interval]) -> str:
265
  if candidate.duration <= 0:
266
  return "silence"
 
267
  silence_overlap = _coverage(candidate, silences)
268
  if silence_overlap / candidate.duration >= 0.8:
269
  return "silence"
 
 
 
 
 
270
  return "music_only"
271
 
272
 
 
293
  else:
294
  merged.append(interval)
295
  return merged
 
 
 
 
 
 
 
 
 
cli/vn/main.py CHANGED
@@ -22,7 +22,6 @@ app.add_typer(keys_app, name="keys")
22
 
23
  OutputFormat = typer.Option("json", "--format", "-f", help="Output format: json, srt, or text.")
24
  ApiUrl = typer.Option(DEFAULT_API_URL, "--api-url", help="Visual Narrator API base URL.")
25
- WhisperModel = typer.Option("base", "--whisper-model", help="Whisper model to use for gap detection.")
26
 
27
 
28
  @app.command()
@@ -75,16 +74,15 @@ def gaps(
75
  source: str = typer.Argument(..., help="Local video file or YouTube URL."),
76
  output_format: str = OutputFormat,
77
  min_gap: float = typer.Option(2.0, "--min-gap", min=0.001, help="Filter out gaps shorter than this many seconds."),
78
- whisper_model: str = WhisperModel,
79
  ) -> None:
80
- """Detect narration-friendly dialogue gaps with Whisper."""
81
  output_format = _normalize_format(output_format)
82
 
83
  with tempfile.TemporaryDirectory(prefix="vn-cli-") as tmp:
84
  tmp_path = Path(tmp)
85
  try:
86
  media_path = _resolve_source(source, tmp_path / "download")
87
- gaps = detect_gaps(media_path, whisper_model=whisper_model, min_gap=min_gap)
88
  except (GapDetectionError, YouTubeDownloadError) as exc:
89
  _fail(str(exc))
90
 
@@ -96,7 +94,6 @@ def compliance(
96
  source: str = typer.Argument(..., help="Local video file or YouTube URL."),
97
  output_format: str = typer.Option("json", "--format", "-f", help="Output format: json or text."),
98
  min_gap: float = typer.Option(2.0, "--min-gap", min=0.001, help="Filter out gaps shorter than this many seconds."),
99
- whisper_model: str = WhisperModel,
100
  ) -> None:
101
  """Generate a WCAG/CVAA compliance report from detected narration gaps."""
102
  output_format = _normalize_compliance_format(output_format)
@@ -105,7 +102,7 @@ def compliance(
105
  tmp_path = Path(tmp)
106
  try:
107
  media_path = _resolve_source(source, tmp_path / "download")
108
- report = analyze_compliance(media_path, whisper_model=whisper_model, min_gap=min_gap)
109
  except (GapDetectionError, YouTubeDownloadError) as exc:
110
  _fail(str(exc))
111
 
 
22
 
23
  OutputFormat = typer.Option("json", "--format", "-f", help="Output format: json, srt, or text.")
24
  ApiUrl = typer.Option(DEFAULT_API_URL, "--api-url", help="Visual Narrator API base URL.")
 
25
 
26
 
27
  @app.command()
 
74
  source: str = typer.Argument(..., help="Local video file or YouTube URL."),
75
  output_format: str = OutputFormat,
76
  min_gap: float = typer.Option(2.0, "--min-gap", min=0.001, help="Filter out gaps shorter than this many seconds."),
 
77
  ) -> None:
78
+ """Detect narration-friendly dialogue gaps with Deepgram Nova-3."""
79
  output_format = _normalize_format(output_format)
80
 
81
  with tempfile.TemporaryDirectory(prefix="vn-cli-") as tmp:
82
  tmp_path = Path(tmp)
83
  try:
84
  media_path = _resolve_source(source, tmp_path / "download")
85
+ gaps = detect_gaps(media_path, min_gap=min_gap)
86
  except (GapDetectionError, YouTubeDownloadError) as exc:
87
  _fail(str(exc))
88
 
 
94
  source: str = typer.Argument(..., help="Local video file or YouTube URL."),
95
  output_format: str = typer.Option("json", "--format", "-f", help="Output format: json or text."),
96
  min_gap: float = typer.Option(2.0, "--min-gap", min=0.001, help="Filter out gaps shorter than this many seconds."),
 
97
  ) -> None:
98
  """Generate a WCAG/CVAA compliance report from detected narration gaps."""
99
  output_format = _normalize_compliance_format(output_format)
 
102
  tmp_path = Path(tmp)
103
  try:
104
  media_path = _resolve_source(source, tmp_path / "download")
105
+ report = analyze_compliance(media_path, min_gap=min_gap)
106
  except (GapDetectionError, YouTubeDownloadError) as exc:
107
  _fail(str(exc))
108