switch to whisperx
- app.py +41 -14
- requirements.txt +2 -1

app.py
CHANGED
@@ -11,7 +11,8 @@ from PIL import Image, ImageOps
 import numpy as np
 from simple_lama_inpainting import SimpleLama
 from contextlib import contextmanager
-
+import whisperx
+import gc
 
 @contextmanager
 def float32_high_matmul_precision():
@@ -173,23 +174,49 @@ def erase(image=None, mask=None):
     return simple_lama(image, mask)
 
 
-# Initialize Whisper model
-whisper = pipeline(
-    task="automatic-speech-recognition",
-    model="openai/whisper-large-v3",
-    chunk_length_s=30,
-    device="cuda" if torch.cuda.is_available() else "cpu",
-)
-
-
 def transcribe(audio, task="transcribe"):
     if audio is None:
         raise gr.Error("No audio file submitted!")
 
-
-
-
-
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    compute_type = "float16"  # can be changed to "int8" if low on GPU memory
+    batch_size = 8  # reduced batch size to be conservative with memory
+
+    # 1. Load model and transcribe
+    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
+    audio_input = whisperx.load_audio(audio)
+    result = model.transcribe(audio_input, batch_size=batch_size)
+
+    # Clear GPU memory
+    del model
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # 2. Align whisper output
+    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+    result = whisperx.align(result["segments"], model_a, metadata, audio_input, device, return_char_alignments=False)
+
+    # Clear GPU memory
+    del model_a
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # 3. Assign speaker labels
+    diarize_model = whisperx.DiarizationPipeline(device=device)
+    diarize_segments = diarize_model(audio_input)
+
+    # Combine transcription with speaker diarization
+    result = whisperx.assign_word_speakers(diarize_segments, result)
+
+    # Format output with speaker labels and timestamps
+    formatted_text = ""
+    for segment in result["segments"]:
+        speaker = f"[Speaker {segment['speaker']}]" if "speaker" in segment else ""
+        start_time = f"{segment.get('start', 0):.2f}"
+        end_time = f"{segment.get('end', 0):.2f}"
+        formatted_text += f"[{start_time}s - {end_time}s] {speaker}: {segment['text']}\n"
+
+    return formatted_text
 
 
 @spaces.GPU(duration=120)
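
With this change transcribe returns a plain formatted string rather than the HF pipeline's output dict. A hedged sketch of the shape the loop above produces — the times, speaker indices, and utterances are placeholders, not real output, and "sample.wav" is a hypothetical local file:

# Each aligned segment becomes one line; segments diarization could not
# label get an empty speaker tag before the colon.
text = transcribe("sample.wav")
# [0.00s - 3.20s] [Speaker SPEAKER_00]: first utterance
# [3.20s - 7.85s] [Speaker SPEAKER_01]: second utterance
print(text)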
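One caveat, hedged because it depends on the whisperX revision the build pulls in: whisperx.DiarizationPipeline wraps pyannote's gated speaker-diarization models, which generally require accepting their license and supplying a Hugging Face token. Without a token in the Space's environment, step 3 may fail and segments will simply lack a "speaker" key. A minimal sketch of a guarded variant, assuming the use_auth_token parameter recent whisperX versions expose and a hypothetical HF_TOKEN secret:

import os

import torch
import whisperx

device = "cuda" if torch.cuda.is_available() else "cpu"
# HF_TOKEN is a hypothetical Space secret; None falls back to any cached login.
diarize_model = whisperx.DiarizationPipeline(
    use_auth_token=os.environ.get("HF_TOKEN"),
    device=device,
)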
requirements.txt
CHANGED
@@ -21,4 +21,5 @@ sentencepiece
 einops
 # git+https://github.com/facebookresearch/sam2.git
 matplotlib
-simple-lama-inpainting
+simple-lama-inpainting
+git+https://github.com/m-bain/whisperX.git
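
Since whisperX isn't taken from a PyPI release here, pip resolves the git URL at build time; the equivalent local install is:

pip install git+https://github.com/m-bain/whisperX.git

Note this tracks the repository head, so builds are not reproducible unless the URL is pinned to a specific commit (e.g. whisperX.git@<sha>, with the SHA left to the maintainer).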
|