Spaces:
Running
on
Zero
Running
on
Zero
Update inference_gradio.py
Browse files- inference_gradio.py +42 -25
inference_gradio.py
CHANGED
|
@@ -42,26 +42,24 @@ os.environ["ESPEAKNG_DATA_PATH"] = str(ESPEAK_DATA_DIR)
|
|
| 42 |
class UVR5:
|
| 43 |
"""Small wrapper around the bundled uvr5 implementation for denoising."""
|
| 44 |
|
| 45 |
-
def __init__(self, model_dir
|
| 46 |
-
#
|
| 47 |
-
self.
|
| 48 |
-
self.
|
| 49 |
self.model = None
|
| 50 |
self.device = "cpu"
|
| 51 |
-
|
| 52 |
-
def load_model(self, device
|
| 53 |
-
import sys
|
| 54 |
-
import json
|
| 55 |
-
import torch as _torch
|
| 56 |
-
|
| 57 |
if self.code_dir not in sys.path:
|
| 58 |
sys.path.append(self.code_dir)
|
| 59 |
-
|
| 60 |
-
|
|
|
|
| 61 |
return self.model
|
| 62 |
-
|
| 63 |
-
from multiprocess_cuda_infer import ModelData, Inference
|
| 64 |
|
|
|
|
|
|
|
| 65 |
model_path = os.path.join(self.model_dir, "Kim_Vocal_1.onnx")
|
| 66 |
config_path = os.path.join(self.model_dir, "MDX-Net-Kim-Vocal1.json")
|
| 67 |
with open(config_path, "r", encoding="utf-8") as f:
|
|
@@ -70,24 +68,43 @@ class UVR5:
|
|
| 70 |
model_path=model_path,
|
| 71 |
audio_path=self.model_dir,
|
| 72 |
result_path=self.model_dir,
|
| 73 |
-
device=
|
| 74 |
process_method="MDX-Net",
|
| 75 |
-
#
|
|
|
|
|
|
|
| 76 |
base_dir=self.model_dir,
|
| 77 |
**configs,
|
| 78 |
)
|
| 79 |
|
| 80 |
-
uvr5_model = Inference(model_data,
|
| 81 |
-
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
return self.model
|
| 84 |
-
|
| 85 |
def denoise(self, audio_info):
|
| 86 |
-
|
| 87 |
-
#
|
| 88 |
-
|
|
|
|
| 89 |
input_audio = load_wav(audio_info, sr=44100, channel=2)
|
| 90 |
-
output_audio =
|
|
|
|
|
|
|
| 91 |
return output_audio.squeeze().T.cpu().numpy(), 44100
|
| 92 |
|
| 93 |
|
|
@@ -207,7 +224,7 @@ def get_available_projects():
|
|
| 207 |
def infer(
|
| 208 |
project, file_checkpoint, exp_name, ref_text, ref_audio, denoise_audio, gen_text, nfe_step, use_ema, separate_langs, frontend, speed, cfg_strength, use_acc_grl, ref_ratio, no_ref_audio, sway_sampling_coef, use_prosody_encoder, seed
|
| 209 |
):
|
| 210 |
-
global tts_api
|
| 211 |
|
| 212 |
# Resolve checkpoint path (local or HF URL)
|
| 213 |
ckpt_path = file_checkpoint
|
|
|
|
| 42 |
class UVR5:
|
| 43 |
"""Small wrapper around the bundled uvr5 implementation for denoising."""
|
| 44 |
|
| 45 |
+
def __init__(self, model_dir):
|
| 46 |
+
# Code directory is always the local `uvr5` folder in this repo
|
| 47 |
+
self.code_dir = os.path.join(os.path.dirname(__file__), "uvr5")
|
| 48 |
+
self.model_dir = model_dir
|
| 49 |
self.model = None
|
| 50 |
self.device = "cpu"
|
| 51 |
+
|
| 52 |
+
def load_model(self, device="cpu"):
|
| 53 |
+
import sys, json, os, torch
|
|
|
|
|
|
|
|
|
|
| 54 |
if self.code_dir not in sys.path:
|
| 55 |
sys.path.append(self.code_dir)
|
| 56 |
+
|
| 57 |
+
# Reuse an already-loaded model if it matches the requested device.
|
| 58 |
+
if self.model is not None and self.device == device:
|
| 59 |
return self.model
|
|
|
|
|
|
|
| 60 |
|
| 61 |
+
from multiprocess_cuda_infer import ModelData, Inference
|
| 62 |
+
# In the minimal LEMAS-TTS layout, the UVR5 weights and their JSON config live directly under `self.model_dir`.
|
| 63 |
model_path = os.path.join(self.model_dir, "Kim_Vocal_1.onnx")
|
| 64 |
config_path = os.path.join(self.model_dir, "MDX-Net-Kim-Vocal1.json")
|
| 65 |
with open(config_path, "r", encoding="utf-8") as f:
|
|
|
|
| 68 |
model_path=model_path,
|
| 69 |
audio_path=self.model_dir,
|
| 70 |
result_path=self.model_dir,
|
| 71 |
+
device=device,
|
| 72 |
process_method="MDX-Net",
|
| 73 |
+
# Keep base_dir and model_dir the same so all UVR5 metadata
|
| 74 |
+
# (model_data.json, model_name_mapper.json, etc.) are resolved
|
| 75 |
+
# under `pretrained_models/uvr5`, matching LEMAS-TTS inference.
|
| 76 |
base_dir=self.model_dir,
|
| 77 |
**configs,
|
| 78 |
)
|
| 79 |
|
| 80 |
+
uvr5_model = Inference(model_data, device)
|
| 81 |
+
# On HF Spaces with stateless GPU, we must not initialize CUDA in the
|
| 82 |
+
# main process. The heavy UVR5 loading happens lazily inside
|
| 83 |
+
# @spaces.GPU functions; this guard is kept only for the CPU path to
|
| 84 |
+
# avoid any accidental CUDA init.
|
| 85 |
+
if IS_SPACES and device == "cpu":
|
| 86 |
+
orig_is_available = torch.cuda.is_available
|
| 87 |
+
torch.cuda.is_available = lambda: False
|
| 88 |
+
try:
|
| 89 |
+
uvr5_model.load_model(model_path, 1)
|
| 90 |
+
finally:
|
| 91 |
+
torch.cuda.is_available = orig_is_available
|
| 92 |
+
else:
|
| 93 |
+
uvr5_model.load_model(model_path, 1)
|
| 94 |
+
|
| 95 |
+
self.model = uvr5_model
|
| 96 |
+
self.device = device
|
| 97 |
return self.model
|
| 98 |
+
|
| 99 |
def denoise(self, audio_info):
    """Separate `audio_info` with the UVR5 model and return `(samples, 44100)`.

    The device is chosen at call time: "cuda" when `torch.cuda.is_available()`
    reports True, otherwise "cpu". The model is obtained through
    `self.load_model`, so a previously loaded instance for the same device is
    reused. The input is read via `load_wav(..., sr=44100, channel=2)` and the
    result of `demix_base` is squeezed, transposed, moved to CPU and converted
    to a NumPy array. No resampling is applied; the second tuple element is
    the constant 44100.

    NOTE(review): on HF Spaces this is expected to run inside an
    `@spaces.GPU` function so CUDA initialisation is safe here -- confirm
    against the caller.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    separator = self.load_model(device=target_device)
    stereo_mix = load_wav(audio_info, sr=44100, channel=2)

    # `demix_base` takes a dict keyed by chunk index; the whole clip is
    # passed as the single entry 0.
    chunks = {0: stereo_mix.squeeze()}
    separated = separator.demix_base(
        chunks,
        is_match_mix=False,
        device=target_device,
    )

    samples = separated.squeeze().T.cpu().numpy()
    return samples, 44100
|
| 109 |
|
| 110 |
|
|
|
|
| 224 |
def infer(
|
| 225 |
project, file_checkpoint, exp_name, ref_text, ref_audio, denoise_audio, gen_text, nfe_step, use_ema, separate_langs, frontend, speed, cfg_strength, use_acc_grl, ref_ratio, no_ref_audio, sway_sampling_coef, use_prosody_encoder, seed
|
| 226 |
):
|
| 227 |
+
global tts_api
|
| 228 |
|
| 229 |
# Resolve checkpoint path (local or HF URL)
|
| 230 |
ckpt_path = file_checkpoint
|