Update modules/whisper/whisper_base.py
Browse files
modules/whisper/whisper_base.py
CHANGED
|
@@ -9,6 +9,7 @@ import numpy as np
|
|
| 9 |
from datetime import datetime
|
| 10 |
from faster_whisper.vad import VadOptions
|
| 11 |
from dataclasses import astuple
|
|
|
|
| 12 |
|
| 13 |
from modules.uvr.music_separator import MusicSeparator
|
| 14 |
from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
|
|
@@ -99,13 +100,10 @@ class WhisperBase(ABC):
|
|
| 99 |
elapsed_time: float
|
| 100 |
elapsed time for running
|
| 101 |
"""
|
|
|
|
|
|
|
| 102 |
params = WhisperParameters.as_value(*whisper_params)
|
| 103 |
|
| 104 |
-
self.cache_parameters(
|
| 105 |
-
whisper_params=params,
|
| 106 |
-
add_timestamp=add_timestamp
|
| 107 |
-
)
|
| 108 |
-
|
| 109 |
if params.lang is None:
|
| 110 |
pass
|
| 111 |
elif params.lang == "Automatic Detection":
|
|
@@ -134,12 +132,16 @@ class WhisperBase(ABC):
|
|
| 134 |
|
| 135 |
if params.uvr_enable_offload:
|
| 136 |
self.music_separator.offload()
|
|
|
|
| 137 |
|
|
|
|
|
|
|
| 138 |
if params.vad_filter:
|
| 139 |
# Explicit value set for float('inf') from gr.Number()
|
| 140 |
if params.max_speech_duration_s is None or params.max_speech_duration_s >= 9999:
|
| 141 |
params.max_speech_duration_s = float('inf')
|
| 142 |
|
|
|
|
| 143 |
vad_options = VadOptions(
|
| 144 |
threshold=params.threshold,
|
| 145 |
min_speech_duration_ms=params.min_speech_duration_ms,
|
|
@@ -148,31 +150,57 @@ class WhisperBase(ABC):
|
|
| 148 |
speech_pad_ms=params.speech_pad_ms
|
| 149 |
)
|
| 150 |
|
| 151 |
-
|
| 152 |
audio=audio,
|
| 153 |
vad_parameters=vad_options,
|
| 154 |
progress=progress
|
| 155 |
)
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
result, elapsed_time = self.transcribe(
|
| 158 |
audio,
|
| 159 |
progress,
|
| 160 |
*astuple(params)
|
| 161 |
)
|
|
|
|
|
|
|
| 162 |
|
| 163 |
if params.vad_filter:
|
| 164 |
-
|
| 165 |
segments=result,
|
| 166 |
speech_chunks=speech_chunks,
|
| 167 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
if params.is_diarize:
|
|
|
|
| 170 |
result, elapsed_time_diarization = self.diarizer.run(
|
| 171 |
-
audio=
|
| 172 |
use_auth_token=params.hf_token,
|
| 173 |
transcribed_result=result,
|
|
|
|
| 174 |
)
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
return result, elapsed_time
|
| 177 |
|
| 178 |
def transcribe_file(self,
|
|
|
|
| 9 |
from datetime import datetime
|
| 10 |
from faster_whisper.vad import VadOptions
|
| 11 |
from dataclasses import astuple
|
| 12 |
+
from copy import deepcopy
|
| 13 |
|
| 14 |
from modules.uvr.music_separator import MusicSeparator
|
| 15 |
from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
|
|
|
|
| 100 |
elapsed_time: float
|
| 101 |
elapsed time for running
|
| 102 |
"""
|
| 103 |
+
|
| 104 |
+
start_time = time.time()
|
| 105 |
params = WhisperParameters.as_value(*whisper_params)
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
if params.lang is None:
|
| 108 |
pass
|
| 109 |
elif params.lang == "Automatic Detection":
|
|
|
|
| 132 |
|
| 133 |
if params.uvr_enable_offload:
|
| 134 |
self.music_separator.offload()
|
| 135 |
+
elapsed_time_bgm_sep = time.time() - start_time
|
| 136 |
|
| 137 |
+
origin_audio = deepcopy(audio)
|
| 138 |
+
|
| 139 |
if params.vad_filter:
|
| 140 |
# Explicit value set for float('inf') from gr.Number()
|
| 141 |
if params.max_speech_duration_s is None or params.max_speech_duration_s >= 9999:
|
| 142 |
params.max_speech_duration_s = float('inf')
|
| 143 |
|
| 144 |
+
progress(0, desc="Filtering silent parts from audio...")
|
| 145 |
vad_options = VadOptions(
|
| 146 |
threshold=params.threshold,
|
| 147 |
min_speech_duration_ms=params.min_speech_duration_ms,
|
|
|
|
| 150 |
speech_pad_ms=params.speech_pad_ms
|
| 151 |
)
|
| 152 |
|
| 153 |
+
vad_processed, speech_chunks = self.vad.run(
|
| 154 |
audio=audio,
|
| 155 |
vad_parameters=vad_options,
|
| 156 |
progress=progress
|
| 157 |
)
|
| 158 |
|
| 159 |
+
if vad_processed.size > 0:
|
| 160 |
+
audio = vad_processed
|
| 161 |
+
else:
|
| 162 |
+
vad_params.vad_filter = False
|
| 163 |
+
|
| 164 |
result, elapsed_time = self.transcribe(
|
| 165 |
audio,
|
| 166 |
progress,
|
| 167 |
*astuple(params)
|
| 168 |
)
|
| 169 |
+
if params.whisper_enable_offload:
|
| 170 |
+
self.offload()
|
| 171 |
|
| 172 |
if params.vad_filter:
|
| 173 |
+
restored_result = self.vad.restore_speech_timestamps(
|
| 174 |
segments=result,
|
| 175 |
speech_chunks=speech_chunks,
|
| 176 |
)
|
| 177 |
+
if restored_result:
|
| 178 |
+
result = restored_result
|
| 179 |
+
else:
|
| 180 |
+
print("VAD detected no speech segments in the audio.")
|
| 181 |
|
| 182 |
if params.is_diarize:
|
| 183 |
+
progress(0.99, desc="Diarizing speakers...")
|
| 184 |
result, elapsed_time_diarization = self.diarizer.run(
|
| 185 |
+
audio=origin_audio,
|
| 186 |
use_auth_token=params.hf_token,
|
| 187 |
transcribed_result=result,
|
| 188 |
+
device=params.diarization_device
|
| 189 |
)
|
| 190 |
+
if params.diarization_enable_offload:
|
| 191 |
+
self.diarizer.offload()
|
| 192 |
+
|
| 193 |
+
self.cache_parameters(
|
| 194 |
+
whisper_params=params,
|
| 195 |
+
add_timestamp=add_timestamp
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
if not result:
|
| 199 |
+
print(f"Whisper did not detected any speech segments in the audio.")
|
| 200 |
+
result = list()
|
| 201 |
+
|
| 202 |
+
progress(1.0, desc="Processing done!")
|
| 203 |
+
total_elapsed_time = time.time() - start_time
|
| 204 |
return result, elapsed_time
|
| 205 |
|
| 206 |
def transcribe_file(self,
|