Spaces: Running on Zero

Update gradio_mix.py

Files changed: gradio_mix.py (+99 -29)

gradio_mix.py CHANGED
@@ -68,7 +68,15 @@ def _pick_device():
     return "cuda" if torch.cuda.is_available() else "cpu"
 
 device = _pick_device()
-
+# For WhisperX ASR:
+# - On Spaces we always construct the pipeline lazily inside @spaces.GPU
+#   functions, so keep the default "cpu" here to avoid touching CUDA in
+#   the main process.
+# - Elsewhere prefer CUDA if available.
+if IS_SPACES:
+    ASR_DEVICE = "cpu"
+else:
+    ASR_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 whisper_model, align_model = None, None
 tts_edit_model = None
 
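Note on the pattern above: ZeroGPU Spaces run the main process without CUDA; the GPU exists only inside functions decorated with @spaces.GPU, which is why ASR_DEVICE stays "cpu" on Spaces and the real device choice is deferred. A minimal, self-contained sketch of that pattern (the model here is a stand-in, not this app's checkpoint):

    import spaces
    import torch

    _model = None  # built lazily, on CPU, so the main process never touches CUDA

    def _get_model():
        global _model
        if _model is None:
            _model = torch.nn.Linear(4, 4)  # stand-in for a real checkpoint load
        return _model

    @spaces.GPU
    def infer(x: torch.Tensor) -> torch.Tensor:
        # Only inside this worker is it safe to initialize CUDA.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = _get_model().to(device)
        with torch.inference_mode():
            return model(x.to(device)).cpu()
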
@@ -97,51 +105,68 @@ class UVR5:
 
     def __init__(self, model_dir):
         # Code directory is always the local `uvr5` folder in this repo
-        code_dir = os.path.join(os.path.dirname(__file__), "uvr5")
-        self.
+        self.code_dir = os.path.join(os.path.dirname(__file__), "uvr5")
+        self.model_dir = model_dir
+        self.model = None
+        self.device = "cpu"
 
-    def load_model(self,
+    def load_model(self, device="cpu"):
         import sys, json, os, torch
-        if code_dir not in sys.path:
-            sys.path.append(code_dir)
+        if self.code_dir not in sys.path:
+            sys.path.append(self.code_dir)
+
+        # Reuse an already-loaded model if it matches the requested device.
+        if self.model is not None and self.device == device:
+            return self.model
+
         from multiprocess_cuda_infer import ModelData, Inference
         # In the minimal LEMAS-TTS layout, UVR5 weights live under:
-        model_path = os.path.join(model_dir, "Kim_Vocal_1.onnx")
-        config_path = os.path.join(model_dir, "MDX-Net-Kim-Vocal1.json")
+        model_path = os.path.join(self.model_dir, "Kim_Vocal_1.onnx")
+        config_path = os.path.join(self.model_dir, "MDX-Net-Kim-Vocal1.json")
         with open(config_path, "r", encoding="utf-8") as f:
             configs = json.load(f)
         model_data = ModelData(
             model_path=model_path,
-            audio_path=model_dir,
-            result_path=model_dir,
-            device=
+            audio_path=self.model_dir,
+            result_path=self.model_dir,
+            device=device,
             process_method="MDX-Net",
             # Keep base_dir and model_dir the same so all UVR5 metadata
             # (model_data.json, model_name_mapper.json, etc.) are resolved
             # under `pretrained_models/uvr5`, matching LEMAS-TTS inference.
-            base_dir=model_dir,
+            base_dir=self.model_dir,
             **configs,
         )
 
-        uvr5_model = Inference(model_data,
+        uvr5_model = Inference(model_data, device)
         # On HF Spaces with stateless GPU, we must not initialize CUDA in the
-        # main process. UVR5
-        #
-        #
-
-
-
+        # main process. The heavy UVR5 loading happens lazily inside
+        # @spaces.GPU functions; this guard is kept only for the CPU path to
+        # avoid any accidental CUDA init.
+        if IS_SPACES and device == "cpu":
+            orig_is_available = torch.cuda.is_available
+            torch.cuda.is_available = lambda: False
+            try:
+                uvr5_model.load_model(model_path, 1)
+            finally:
+                torch.cuda.is_available = orig_is_available
+        else:
             uvr5_model.load_model(model_path, 1)
-
-
-
+
+        self.model = uvr5_model
+        self.device = device
+        return self.model
 
     def denoise(self, audio_info):
+        # Prefer GPU if available; on Spaces this runs inside @spaces.GPU so
+        # CUDA can be safely initialized here.
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        model = self.load_model(device=device)
         input_audio = load_wav(audio_info, sr=44100, channel=2)
-        output_audio =
+        output_audio = model.demix_base({0: input_audio.squeeze()}, is_match_mix=False, device=device)
         # transform = torchaudio.transforms.Resample(44100, 16000)
         # output_audio = transform(output_audio)
-        return output_audio.squeeze().T.numpy(), 44100
+        return output_audio.squeeze().T.cpu().numpy(), 44100
 
 
 class DeepFilterNet:
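The load_model guard above temporarily monkey-patches torch.cuda.is_available so that UVR5's loader cannot initialize CUDA when a CPU load is requested in the Spaces main process. If that trick is needed in more than one place, it can be packaged as a context manager; a sketch (cuda_hidden is a hypothetical name, not part of the diff):

    import contextlib
    import torch

    @contextlib.contextmanager
    def cuda_hidden():
        # Make torch report "no CUDA" for the duration of the block, so that
        # third-party loaders calling torch.cuda.is_available() stay on CPU.
        orig_is_available = torch.cuda.is_available
        torch.cuda.is_available = lambda: False
        try:
            yield
        finally:
            torch.cuda.is_available = orig_is_available

    # Equivalent to the diff's guarded load:
    # with cuda_hidden():
    #     uvr5_model.load_model(model_path, 1)
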
@@ -424,14 +449,31 @@ class MMSAlignModel:
 
 class WhisperxModel:
     def __init__(self, model_name):
+        # Lazily construct the WhisperX pipeline so that on Spaces we only
+        # touch CUDA inside @spaces.GPU workers.
+        self.model_name = model_name
+        self.model = None
+
+    def _ensure_model(self):
+        if self.model is not None:
+            return
         from whisperx import load_model
+
         prompt = None  # "This might be a blend of Simplified Chinese and English speech, do not translate, only transcription be allowed."
 
+        # On Spaces, this will be called from within @spaces.GPU so we can
+        # safely move the ASR to CUDA if available. Locally we respect the
+        # ASR_DEVICE hint.
+        if IS_SPACES:
+            asr_device = "cuda" if torch.cuda.is_available() else "cpu"
+        else:
+            asr_device = ASR_DEVICE
+
         # Use the lighter Silero VAD backend to avoid pyannote checkpoints
         # and their PyTorch 2.6 `weights_only` pickling issues.
         self.model = load_model(
-            model_name,
-
+            self.model_name,
+            asr_device,
             compute_type="float32",
             asr_options={
                 "suppress_numerals": False,
@@ -447,6 +489,9 @@ class WhisperxModel:
         )
 
     def transcribe(self, audio_info, lang=None):
+        # Lazily init the underlying WhisperX pipeline.
+        self._ensure_model()
+
         audio = load_wav(audio_info).numpy()
         if lang is None:
             lang = self.model.detect_language(audio)
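With this change, WhisperxModel.__init__ only records configuration and every heavy step happens in _ensure_model(), called at the top of transcribe(). The same lazy-singleton shape, generalized (LazyPipeline and build_pipeline are illustrative names; the lock is an extra safeguard beyond the diff, relevant only if transcriptions can run concurrently):

    import threading
    import torch

    def build_pipeline(model_name: str, device: str):
        # Stand-in for whisperx.load_model(...): any callable works here.
        return lambda audio: f"{model_name}@{device}: {len(audio)} samples"

    class LazyPipeline:
        def __init__(self, model_name: str):
            # Record configuration only; no CUDA, no weights yet.
            self.model_name = model_name
            self.model = None
            self._lock = threading.Lock()

        def _ensure_model(self):
            with self._lock:
                if self.model is None:
                    device = "cuda" if torch.cuda.is_available() else "cpu"
                    self.model = build_pipeline(self.model_name, device)

        def transcribe(self, audio):
            self._ensure_model()
            return self.model(audio)
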
@@ -541,7 +586,8 @@ def get_audio_slice(audio, words_info, start_time, end_time, max_len=10, sr=1600
 def load_models(lemas_model_name, whisper_model_name, alignment_model_name, denoise_model_name):  # , audiosr_name):
 
     global transcribe_model, align_model, denoise_model, text_norm, tts_edit_model
-
+    if not IS_SPACES:
+        torch.cuda.empty_cache()
     gc.collect()
 
     if denoise_model_name == "UVR5":
@@ -701,9 +747,16 @@ def align(transcript, audio_info, state):
     ]
 
 
+@spaces.GPU
+@torch.no_grad()
+@torch.inference_mode()
 def denoise(audio_info):
+    # Denoiser can be relatively heavy (especially UVR5), so schedule it on
+    # GPU workers when running on HF Spaces.
+    if denoise_model is None:
+        return audio_info
     denoised_audio, sr = denoise_model.denoise(audio_info)
-    denoised_audio = denoised_audio
+    denoised_audio = denoised_audio  # already numpy
     return (sr, denoised_audio)
 
 def cancel_denoise(audio_info):
@@ -742,6 +795,7 @@ def replace_numbers_with_words(sentence, lang="en"):
         return num  # In case num2words fails (unlikely with digits but just to be safe)
     return re.sub(r'\b\d+\b', replace_with_words, sentence)  # Regular expression that matches numbers
 
+
 @spaces.GPU
 @torch.no_grad()
 @torch.inference_mode()
@@ -754,6 +808,22 @@ def run(seed, nfe_step, speed, cfg_strength, sway_sampling_coef, ref_ratio,
     if smart_transcript and (transcribe_state is None):
         raise gr.Error("Can't use smart transcript: whisper transcript not found")
 
+    # On HF Spaces, keep CUDA usage inside this GPU worker: move the edit
+    # model and vocoder to GPU here (the weights were loaded on CPU).
+    if IS_SPACES and torch.cuda.is_available():
+        try:
+            if getattr(tts_edit_model, "device", "cpu") != "cuda":
+                if hasattr(tts_edit_model, "ema_model"):
+                    tts_edit_model.ema_model.to("cuda")
+                if hasattr(tts_edit_model, "vocoder"):
+                    try:
+                        tts_edit_model.vocoder.to("cuda")
+                    except Exception:
+                        pass
+                tts_edit_model.device = "cuda"
+        except Exception as e:
+            logging.warning("Failed to move LEMAS-TTS model to CUDA: %s", e)
+
     # if mode == "Rerun":
     #     colon_position = selected_sentence.find(':')
     #     selected_sentence_idx = int(selected_sentence[:colon_position])
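The block above moves the edit model's parts to CUDA attribute by attribute because tts_edit_model is a plain wrapper object rather than a single nn.Module. Factored into a helper, the same logic might read as follows (move_tts_model_to is hypothetical; ema_model and vocoder are the attribute names the diff relies on):

    import logging
    import torch

    def move_tts_model_to(model, device: str) -> None:
        # Move each known submodule, tolerating ones without a .to() method.
        for attr in ("ema_model", "vocoder"):
            sub = getattr(model, attr, None)
            if sub is None:
                continue
            try:
                sub.to(device)
            except Exception:
                logging.warning("Could not move %s to %s", attr, device)
        model.device = device

    # Inside a @spaces.GPU worker:
    # if torch.cuda.is_available():
    #     move_tts_model_to(tts_edit_model, "cuda")
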
@@ -1259,4 +1329,4 @@ if __name__ == "__main__":
     MODELS_PATH = args.models_path
 
     app = get_app()
-    app.queue().launch(share=args.share, server_name=args.server_name, server_port=args.port)
+    app.queue().launch(share=args.share, server_name=args.server_name, server_port=args.port)
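The entry point keeps app.queue() ahead of launch(); on ZeroGPU Spaces, calls into @spaces.GPU functions are dispatched through Gradio's queue, so the queue should stay enabled. A stripped-down equivalent of the entry point, with a hypothetical get_app builder:

    import gradio as gr

    def get_app() -> gr.Blocks:
        # Hypothetical minimal stand-in for the real UI builder in gradio_mix.py.
        with gr.Blocks() as demo:
            gr.Markdown("LEMAS-TTS demo")
        return demo

    if __name__ == "__main__":
        app = get_app()
        app.queue().launch(server_name="0.0.0.0", server_port=7860)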