liuyang committed on
Commit
5411f5d
·
1 Parent(s): 9f7c374

Refine VAD parameters and transcription options in WhisperTranscriber for improved audio processing. Adjust max speech duration, min speech duration, and silence duration, and set chunk length to 12 seconds.

Browse files
Files changed (1) hide show
  1. app.py +6 -3
app.py CHANGED
@@ -371,14 +371,16 @@ class WhisperTranscriber:
371
  beam_size=5,
372
  vad_filter=False, # VAD is enabled by default for batched transcription
373
  vad_parameters=VadOptions(
374
- max_speech_duration_s=whisper.feature_extractor.chunk_length,
375
- min_speech_duration_ms=100,
376
- speech_pad_ms=100,
377
  threshold=0.25,
378
  neg_threshold=0.2,
 
379
  ),
380
  word_timestamps=True,
381
  initial_prompt=prompt,
 
382
  language_detection_segments=1,
383
  task="translate" if translate else "transcribe",
384
  )
@@ -387,6 +389,7 @@ class WhisperTranscriber:
387
  segments, transcript_info = batched_whisper.transcribe(
388
  audio_path,
389
  batch_size=batch_size,
 
390
  **options
391
  )
392
  segments = list(segments)
 
371
  beam_size=5,
372
  vad_filter=False, # VAD is enabled by default for batched transcription
373
  vad_parameters=VadOptions(
374
+ max_speech_duration_s=12,
375
+ min_speech_duration_ms=150, # ignore ultra-short blips
376
+ min_silence_duration_ms=150, # split on short Mandarin pauses (if supported) speech_pad_ms=100,
377
  threshold=0.25,
378
  neg_threshold=0.2,
379
+
380
  ),
381
  word_timestamps=True,
382
  initial_prompt=prompt,
383
+ condition_on_previous_text=False, # avoid runaway context
384
  language_detection_segments=1,
385
  task="translate" if translate else "transcribe",
386
  )
 
389
  segments, transcript_info = batched_whisper.transcribe(
390
  audio_path,
391
  batch_size=batch_size,
392
+ chunk_length=12, # ↓ from 30s default; try 10–15
393
  **options
394
  )
395
  segments = list(segments)