Spaces:

LEMAS-Project
/

LEMAS-TTS

Running on Zero

App Files Community

Approximetal committed on Jan 4

Commit

9f66cd3

verified ·

1 Parent(s): 661ef4d

Update inference_gradio.py

Browse files

Files changed (1) hide show

inference_gradio.py +42 -25

inference_gradio.py CHANGED Viewed

@@ -42,26 +42,24 @@ os.environ["ESPEAKNG_DATA_PATH"] = str(ESPEAK_DATA_DIR)
 class UVR5:
     """Small wrapper around the bundled uvr5 implementation for denoising."""
-    def __init__(self, model_dir: Path, code_dir: Path):
-        # Keep paths as strings; actual model is loaded lazily.
-        self.model_dir = str(model_dir)
-        self.code_dir = str(code_dir)
         self.model = None
         self.device = "cpu"
-    def load_model(self, device: str = "cpu"):
-        import sys
-        import json
-        import torch as _torch
         if self.code_dir not in sys.path:
             sys.path.append(self.code_dir)
-        if self.model is not None:
             return self.model
-        from multiprocess_cuda_infer import ModelData, Inference
         model_path = os.path.join(self.model_dir, "Kim_Vocal_1.onnx")
         config_path = os.path.join(self.model_dir, "MDX-Net-Kim-Vocal1.json")
         with open(config_path, "r", encoding="utf-8") as f:
@@ -70,24 +68,43 @@ class UVR5:
             model_path=model_path,
             audio_path=self.model_dir,
             result_path=self.model_dir,
-            device="cpu",
             process_method="MDX-Net",
-            # keep base_dir and model_dir the same (paths under `pretrained_models`)
             base_dir=self.model_dir,
             **configs,
         )
-        uvr5_model = Inference(model_data, "cpu")
-        self.model = uvr5_model.load_model(model_path, 1, device="cpu")
-        self.device = "cpu"
         return self.model
     def denoise(self, audio_info):
-        print("denoise UVR5: ", audio_info)
-        # # On Spaces, force CPU; locally prefer CUDA if available.
-        self.model = self.load_model()
         input_audio = load_wav(audio_info, sr=44100, channel=2)
-        output_audio = self.model.demix_base({0: input_audio.squeeze()}, is_match_mix=False, device="cpu")
         return output_audio.squeeze().T.cpu().numpy(), 44100
@@ -207,7 +224,7 @@ def get_available_projects():
 def infer(
     project, file_checkpoint, exp_name, ref_text, ref_audio, denoise_audio, gen_text, nfe_step, use_ema, separate_langs, frontend, speed, cfg_strength, use_acc_grl, ref_ratio, no_ref_audio, sway_sampling_coef, use_prosody_encoder, seed
 ):
-    global tts_api, last_ema
     # Resolve checkpoint path (local or HF URL)
     ckpt_path = file_checkpoint

 class UVR5:
     """Small wrapper around the bundled uvr5 implementation for denoising."""
+    def __init__(self, model_dir):
+        # Code directory is always the local `uvr5` folder in this repo
+        self.code_dir = os.path.join(os.path.dirname(__file__), "uvr5")
+        self.model_dir = model_dir
         self.model = None
         self.device = "cpu"
+    def load_model(self, device="cpu"):
+        import sys, json, os, torch
         if self.code_dir not in sys.path:
             sys.path.append(self.code_dir)
+        # Reuse an already-loaded model if it matches the requested device.
+        if self.model is not None and self.device == device:
             return self.model
+        from multiprocess_cuda_infer import ModelData, Inference
+        # In the minimal LEMAS-TTS layout, UVR5 weights live under:
         model_path = os.path.join(self.model_dir, "Kim_Vocal_1.onnx")
         config_path = os.path.join(self.model_dir, "MDX-Net-Kim-Vocal1.json")
         with open(config_path, "r", encoding="utf-8") as f:
             model_path=model_path,
             audio_path=self.model_dir,
             result_path=self.model_dir,
+            device=device,
             process_method="MDX-Net",
+            # Keep base_dir and model_dir the same so all UVR5 metadata
+            # (model_data.json, model_name_mapper.json, etc.) are resolved
+            # under `pretrained_models/uvr5`, matching LEMAS-TTS inference.
             base_dir=self.model_dir,
             **configs,
         )
+        uvr5_model = Inference(model_data, device)
+        # On HF Spaces with stateless GPU, we must not initialize CUDA in the
+        # main process. The heavy UVR5 loading happens lazily inside
+        # @spaces.GPU functions; this guard is kept only for the CPU path to
+        # avoid any accidental CUDA init.
+        if IS_SPACES and device == "cpu":
+            orig_is_available = torch.cuda.is_available
+            torch.cuda.is_available = lambda: False
+            try:
+                uvr5_model.load_model(model_path, 1)
+            finally:
+                torch.cuda.is_available = orig_is_available
+        else:
+            uvr5_model.load_model(model_path, 1)
+        self.model = uvr5_model
+        self.device = device
         return self.model
     def denoise(self, audio_info):
+        # Prefer GPU if available; on Spaces this runs inside @spaces.GPU so
+        # CUDA can be safely initialized here.
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        model = self.load_model(device=device)
         input_audio = load_wav(audio_info, sr=44100, channel=2)
+        output_audio = model.demix_base({0:input_audio.squeeze()}, is_match_mix=False, device=device)
+        # transform = torchaudio.transforms.Resample(44100, 16000)
+        # output_audio = transform(output_audio)
         return output_audio.squeeze().T.cpu().numpy(), 44100
 def infer(
     project, file_checkpoint, exp_name, ref_text, ref_audio, denoise_audio, gen_text, nfe_step, use_ema, separate_langs, frontend, speed, cfg_strength, use_acc_grl, ref_ratio, no_ref_audio, sway_sampling_coef, use_prosody_encoder, seed
 ):
+    global tts_api
     # Resolve checkpoint path (local or HF URL)
     ckpt_path = file_checkpoint