liuyang committed
Commit 28a7e7e
1 Parent(s): 0cb30bb

Refactor model loading in app.py to return both Whisper and diarization models, enhancing GPU utilization during transcription processes.
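For context, here is a minimal, self-contained sketch of the loading pattern this commit moves to, assuming a ZeroGPU Space (the `spaces` package), `faster-whisper`, and `pyannote.audio`. The model names and the `_load_models()` interface come from the diff below; the standalone `transcribe()` caller, and folding the diarizer load into `_load_models()` rather than creating it at import time as app.py does, are illustrative simplifications, not the app's exact code.

import os
import spaces
import torch
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline

# Lazy global holders: filled the first time _load_models() runs inside a
# GPU worker, then reused by later calls in that worker.
_whisper = None
_diarizer = None


@spaces.GPU  # a GPU is only guaranteed to exist inside decorated calls
def _load_models():
    """Load the models once per worker and hand them back to the caller."""
    global _whisper, _diarizer
    if _whisper is None:
        _whisper = WhisperModel("large-v3-turbo",
                                device="cuda",
                                compute_type="float16")
    if _diarizer is None and os.getenv("HF_TOKEN"):
        try:
            _diarizer = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=os.getenv("HF_TOKEN"),
            ).to(torch.device("cuda"))
        except Exception as e:
            print(f"Could not load diarization model: {e}")
    # Returning the models lets each @spaces.GPU-decorated caller unpack
    # them directly instead of reading globals primed by a separate call.
    return _whisper, _diarizer


@spaces.GPU  # each call gets its own GPU slice
def transcribe(audio_path):
    whisper, _ = _load_models()
    segments, _info = whisper.transcribe(audio_path)
    return " ".join(segment.text for segment in segments)

Under ZeroGPU, each @spaces.GPU call may run in a fresh worker process, so having every decorated method call _load_models() itself, rather than depending on a one-shot warm_models() priming module globals, keeps the models available wherever the call actually executes.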

Files changed (1)
  1. app.py +11 -31
app.py CHANGED
@@ -33,7 +33,6 @@ from faster_whisper.vad import VadOptions
  import requests
  import base64
  from pyannote.audio import Pipeline
- from huggingface_hub import snapshot_download
 
  import os, sys, importlib.util, pathlib, ctypes, tempfile, wave, math
  spec = importlib.util.find_spec("nvidia.cudnn")
@@ -49,8 +48,6 @@ try:
  except OSError as e:
      sys.exit(f"❌ Could not load {cnn_so} : {e}")
 
- model_cache_path = "large-v3-turbo" # fallback to model name
-
  # Lazy global holder ----------------------------------------------------------
  _whisper = None
  _diarizer = None
@@ -74,30 +71,18 @@ except Exception as e:
      print(f"Could not load diarization model: {e}")
      _diarizer = None
 
- # ---------------------------------------------------------------------
- # Leave _load_models() UNdecorated
+ @spaces.GPU  # GPU is guaranteed to exist *inside* this function
  def _load_models():
      global _whisper, _diarizer
      if _whisper is None:
-         _whisper = WhisperModel(model_cache_path,
-                                 device="cuda",
-                                 compute_type="float16")
-     if _diarizer is None:
-         _diarizer = (
-             Pipeline.from_pretrained(
-                 "pyannote/speaker-diarization-3.1",
-                 use_auth_token=os.getenv("HF_TOKEN"),
-             ).to(torch.device("cuda"))
+         print("Loading Whisper model...")
+         _whisper = WhisperModel(
+             "large-v3-turbo",
+             device="cuda",
+             compute_type="float16",
          )
-     # do NOT return the models
-     return None
- # ---------------------------------------------------------------------
-
- # One‐shot GPU warming function
- @spaces.GPU
- def warm_models():
-     _load_models()  # runs in the GPU worker, models stay there
-     return "ready"  # <-- picklable
+         print("Whisper model loaded successfully")
+     return _whisper, _diarizer
 
  # -----------------------------------------------------------------------------
  class WhisperTranscriber:
@@ -124,8 +109,7 @@ class WhisperTranscriber:
      @spaces.GPU  # each call gets a GPU slice
      def transcribe_full_audio(self, audio_path, language=None, translate=False, prompt=None):
          """Transcribe the entire audio file without speaker diarization"""
-         #whisper, _ = _load_models()  # models live on the GPU
-         whisper = _whisper
+         whisper, _ = _load_models()  # models live on the GPU
 
          print("Transcribing full audio...")
          start_time = time.time()
@@ -218,8 +202,7 @@ class WhisperTranscriber:
      @spaces.GPU  # each call gets a GPU slice
      def transcribe_audio_segments(self, audio_segments, language=None, translate=False, prompt=None):
          """Transcribe multiple audio segments using faster_whisper"""
-         #whisper, diarizer = _load_models()  # models live on the GPU
-         whisper = _whisper
+         whisper, diarizer = _load_models()  # models live on the GPU
 
          print(f"Transcribing {len(audio_segments)} audio segments...")
          start_time = time.time()
@@ -293,9 +276,7 @@ class WhisperTranscriber:
      @spaces.GPU  # each call gets a GPU slice
      def perform_diarization(self, audio_path, num_speakers=None):
          """Perform speaker diarization"""
-         #whisper, diarizer = _load_models()  # models live on the GPU
-         whisper = _whisper
-         diarizer = _diarizer
+         whisper, diarizer = _load_models()  # models live on the GPU
 
          if diarizer is None:
              print("Diarization model not available, creating single speaker segment")
@@ -634,5 +615,4 @@ with demo:
      """)
 
  if __name__ == "__main__":
-     warm_models()  # prime the GPU worker once at startup
      demo.launch(debug=True)