Spaces:

nsfwalex
/

whisper-transcribe-new

Runtime error

liuyang committed on Sep 5

Commit

5411f5d

1 Parent(s): 9f7c374

Refine VAD parameters and transcription options in WhisperTranscriber for improved audio processing. Adjust max speech duration, min speech duration, and silence duration, and set chunk length to 12 seconds.

Files changed (1) hide show

app.py CHANGED Viewed

@@ -371,14 +371,16 @@ class WhisperTranscriber:
             beam_size=5,
             vad_filter=False,  # VAD is enabled by default for batched transcription
             vad_parameters=VadOptions(
-                max_speech_duration_s=whisper.feature_extractor.chunk_length,
-                min_speech_duration_ms=100,
-                speech_pad_ms=100,
                 threshold=0.25,
                 neg_threshold=0.2,
             ),
             word_timestamps=True,
             initial_prompt=prompt,
             language_detection_segments=1,
             task="translate" if translate else "transcribe",
         )
@@ -387,6 +389,7 @@ class WhisperTranscriber:
         segments, transcript_info = batched_whisper.transcribe(
             audio_path,
             batch_size=batch_size,
             **options
         )
         segments = list(segments)

             beam_size=5,
             vad_filter=False,  # VAD is enabled by default for batched transcription
             vad_parameters=VadOptions(
+                max_speech_duration_s=12,
+                min_speech_duration_ms=150,   # ignore ultra-short blips
+                min_silence_duration_ms=150,  # split on short Mandarin pauses (if supported)
+                speech_pad_ms=100,
                 threshold=0.25,
                 neg_threshold=0.2,
             ),
             word_timestamps=True,
             initial_prompt=prompt,
+            condition_on_previous_text=False,  # avoid runaway context
             language_detection_segments=1,
             task="translate" if translate else "transcribe",
         )
         segments, transcript_info = batched_whisper.transcribe(
             audio_path,
             batch_size=batch_size,
+            chunk_length=12,   # ↓ from 30s default; try 10–15
             **options
         )
         segments = list(segments)