switch to whisperx
- app.py +41 -14
- requirements.txt +2 -1

app.py
CHANGED
@@ -11,7 +11,8 @@ from PIL import Image, ImageOps
 import numpy as np
 from simple_lama_inpainting import SimpleLama
 from contextlib import contextmanager
-
+import whisperx
+import gc
 
 @contextmanager
 def float32_high_matmul_precision():
@@ -173,23 +174,49 @@ def erase(image=None, mask=None):
     return simple_lama(image, mask)
 
 
-# Initialize Whisper model
-whisper = pipeline(
-    task="automatic-speech-recognition",
-    model="openai/whisper-large-v3",
-    chunk_length_s=30,
-    device="cuda" if torch.cuda.is_available() else "cpu",
-)
-
-
 def transcribe(audio, task="transcribe"):
     if audio is None:
         raise gr.Error("No audio file submitted!")
 
-
-
-
-
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    compute_type = "float16"  # can be changed to "int8" if low on GPU memory
+    batch_size = 8  # reduced batch size to be conservative with memory
+
+    # 1. Load model and transcribe
+    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+    audio_input = whisperx.load_audio(audio)
+    result = model.transcribe(audio_input, batch_size=batch_size)
+
+    # Clear GPU memory
+    del model
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # 2. Align whisper output
+    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+    result = whisperx.align(result["segments"], model_a, metadata, audio_input, device, return_char_alignments=False)
+
+    # Clear GPU memory
+    del model_a
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # 3. Assign speaker labels
+    diarize_model = whisperx.DiarizationPipeline(device=device)
+    diarize_segments = diarize_model(audio_input)
+
+    # Combine transcription with speaker diarization
+    result = whisperx.assign_word_speakers(diarize_segments, result)
+
+    # Format output with speaker labels and timestamps
+    formatted_text = ""
+    for segment in result["segments"]:
+        speaker = f"[Speaker {segment['speaker']}]" if "speaker" in segment else ""
+        start_time = f"{segment.get('start', 0):.2f}"
+        end_time = f"{segment.get('end', 0):.2f}"
+        formatted_text += f"[{start_time}s - {end_time}s] {speaker}: {segment['text']}\n"
+
+    return formatted_text
 
 
 @spaces.GPU(duration=120)
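
With this change transcribe returns a plain formatted string rather than the HF pipeline's output dict. A hedged sketch of the shape the loop above produces — the times, speaker indices, and utterances are placeholders, not real output, and "sample.wav" is a hypothetical local file:

# Each aligned segment becomes one line; segments diarization could not
# label get an empty speaker tag before the colon.
text = transcribe("sample.wav")
# [0.00s - 3.20s] [Speaker SPEAKER_00]: first utterance
# [3.20s - 7.85s] [Speaker SPEAKER_01]: second utterance
print(text)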
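One caveat, hedged because it depends on the whisperX revision the build pulls in: whisperx.DiarizationPipeline wraps pyannote's gated speaker-diarization models, which generally require accepting their license and supplying a Hugging Face token. Without a token in the Space's environment, step 3 may fail and segments will simply lack a "speaker" key. A minimal sketch of a guarded variant, assuming the use_auth_token parameter recent whisperX versions expose and a hypothetical HF_TOKEN secret:

import os

import torch
import whisperx

device = "cuda" if torch.cuda.is_available() else "cpu"
# HF_TOKEN is a hypothetical Space secret; None falls back to any cached login.
diarize_model = whisperx.DiarizationPipeline(
    use_auth_token=os.environ.get("HF_TOKEN"),
    device=device,
)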
requirements.txt
CHANGED
@@ -21,4 +21,5 @@ sentencepiece
 einops
 # git+https://github.com/facebookresearch/sam2.git
 matplotlib
-simple-lama-inpainting
+simple-lama-inpainting
+git+https://github.com/m-bain/whisperX.git
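
Since whisperX isn't taken from a PyPI release here, pip resolves the git URL at build time; the equivalent local install is:

pip install git+https://github.com/m-bain/whisperX.git

Note this tracks the repository head, so builds are not reproducible unless the URL is pinned to a specific commit (e.g. whisperX.git@<sha>, with the SHA left to the maintainer).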
|