Spaces:
Running on Zero
Running on Zero
- app.py +37 -7
- requirements.txt +8 -1
app.py
CHANGED
|
@@ -14,6 +14,30 @@ import torch
|
|
| 14 |
import torchaudio
|
| 15 |
from initialization import download_files
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# ---------------------------------------------------------------------------
|
| 18 |
# Model loading (lazy, singleton) / 模型懒加载(单例)
|
| 19 |
# ---------------------------------------------------------------------------
|
|
@@ -21,18 +45,22 @@ _model = None
|
|
| 21 |
_separator = None
|
| 22 |
|
| 23 |
|
| 24 |
-
|
|
|
|
| 25 |
"""加载 YingMusicSinger 模型 / Load YingMusicSinger model."""
|
| 26 |
download_files(task="infer")
|
| 27 |
global _model
|
| 28 |
if _model is None:
|
| 29 |
from src.YingMusicSinger.infer.YingMusicSinger import YingMusicSinger
|
| 30 |
|
| 31 |
-
_model = YingMusicSinger(
|
|
|
|
|
|
|
| 32 |
return _model
|
| 33 |
|
| 34 |
|
| 35 |
-
|
|
|
|
| 36 |
"""
|
| 37 |
加载 MelBandRoformer 分离模型 / Load MelBandRoformer separator.
|
| 38 |
Returns a Separator instance ready for inference.
|
|
@@ -47,7 +75,7 @@ def get_separator(device: str = "cuda:0"):
|
|
| 47 |
_separator = Separator(
|
| 48 |
config_path="ckpts/config_vocals_mel_band_roformer_kj.yaml",
|
| 49 |
checkpoint_path="ckpts/MelBandRoformer.ckpt",
|
| 50 |
-
device=device,
|
| 51 |
)
|
| 52 |
return _separator
|
| 53 |
|
|
@@ -55,6 +83,7 @@ def get_separator(device: str = "cuda:0"):
|
|
| 55 |
# ---------------------------------------------------------------------------
|
| 56 |
# Vocal separation utilities / 人声分离工具
|
| 57 |
# ---------------------------------------------------------------------------
|
|
|
|
| 58 |
def separate_vocals(
|
| 59 |
audio_path: str,
|
| 60 |
device: str = "cuda:0",
|
|
@@ -66,7 +95,7 @@ def separate_vocals(
|
|
| 66 |
Returns:
|
| 67 |
(vocals_path, accompaniment_path)
|
| 68 |
"""
|
| 69 |
-
separator = get_separator(
|
| 70 |
|
| 71 |
wav, sr = torchaudio.load(audio_path)
|
| 72 |
vocal_wav, inst_wav, out_sr = separator.separate(wav, sr)
|
|
@@ -122,6 +151,7 @@ def mix_vocal_and_accompaniment(
|
|
| 122 |
# ---------------------------------------------------------------------------
|
| 123 |
# Inference wrapper / 推理入口
|
| 124 |
# ---------------------------------------------------------------------------
|
|
|
|
| 125 |
def synthesize(
|
| 126 |
ref_audio,
|
| 127 |
melody_audio,
|
|
@@ -186,7 +216,7 @@ def synthesize(
|
|
| 186 |
actual_melody_path = melody_vocals_path
|
| 187 |
|
| 188 |
# ---- Step 2: 模型推理 / Model inference ----------------------------------
|
| 189 |
-
model = get_model(
|
| 190 |
|
| 191 |
audio_tensor, sr = model(
|
| 192 |
ref_audio_path=actual_ref_path,
|
|
@@ -203,7 +233,7 @@ def synthesize(
|
|
| 203 |
|
| 204 |
# 先保存纯人声合成结果 / Save raw vocal synthesis result
|
| 205 |
vocal_out_path = os.path.join(tempfile.mkdtemp(), "vocal_output.wav")
|
| 206 |
-
torchaudio.save(vocal_out_path, audio_tensor, sample_rate=sr)
|
| 207 |
|
| 208 |
# ---- Step 3: 混合伴奏 / Mix accompaniment (optional) ---------------------
|
| 209 |
if (
|
|
|
|
| 14 |
import torchaudio
|
| 15 |
from initialization import download_files
|
| 16 |
|
| 17 |
+
|
| 18 |
+
# True when the SPACE_ID environment variable is present
# (presumably set by the Hugging Face Spaces runtime -- TODO confirm).
IS_HF_SPACE = "SPACE_ID" in os.environ

# Switch consulted by gpu_decorator; spaces.GPU is only applied when this is true.
HF_ENABLE = False

# Device used by local_move2gpu outside of Spaces: CUDA when available, else CPU.
LOCAL_DEVICE = "cpu"
if torch.cuda.is_available():
    LOCAL_DEVICE = "cuda"

# `spaces` is optional; fall back to None when it is not installed.
try:
    import spaces
except ImportError:
    spaces = None
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def gpu_decorator(fn):
    """Wrap ``fn`` with ``spaces.GPU`` when that is enabled, else return ``fn``.

    The wrapper is applied only when all three conditions hold:
    ``IS_HF_SPACE`` is true, ``HF_ENABLE`` is true, and the optional
    ``spaces`` module was imported successfully (is not ``None``).
    In every other case ``fn`` is returned unchanged.
    """
    use_spaces_gpu = IS_HF_SPACE and HF_ENABLE and spaces is not None
    return spaces.GPU(fn) if use_spaces_gpu else fn
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def local_move2gpu(x):
    """Move ``x`` to ``LOCAL_DEVICE`` unless running on Hugging Face Spaces.

    When ``IS_HF_SPACE`` is true, ``x`` is returned untouched (the original
    note states that ZeroGPU handles device placement there). Otherwise the
    result of ``x.to(LOCAL_DEVICE)`` is returned.
    """
    return x if IS_HF_SPACE else x.to(LOCAL_DEVICE)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
# ---------------------------------------------------------------------------
|
| 42 |
# Model loading (lazy, singleton) / 模型懒加载(单例)
|
| 43 |
# ---------------------------------------------------------------------------
|
|
|
|
| 45 |
_separator = None
|
| 46 |
|
| 47 |
|
| 48 |
+
@gpu_decorator
def get_model():
    """Return the shared YingMusicSinger model, building it on first use.

    ``download_files(task="infer")`` is invoked on every call. The model
    itself is created only while the module-level ``_model`` cache is still
    ``None``: it is loaded with ``from_pretrained``, passed through
    ``local_move2gpu``, and switched to eval mode. Later calls return the
    cached instance.
    """
    global _model

    download_files(task="infer")
    if _model is not None:
        return _model

    # Imported lazily, only when the model actually has to be constructed.
    from src.YingMusicSinger.infer.YingMusicSinger import YingMusicSinger

    _model = YingMusicSinger.from_pretrained("ASLP-lab/YingMusic-Singer")
    _model = local_move2gpu(_model)
    _model.eval()
    return _model
|
| 60 |
|
| 61 |
|
| 62 |
+
@gpu_decorator
|
| 63 |
+
def get_separator():
|
| 64 |
"""
|
| 65 |
加载 MelBandRoformer 分离模型 / Load MelBandRoformer separator.
|
| 66 |
Returns a Separator instance ready for inference.
|
|
|
|
| 75 |
_separator = Separator(
|
| 76 |
config_path="ckpts/config_vocals_mel_band_roformer_kj.yaml",
|
| 77 |
checkpoint_path="ckpts/MelBandRoformer.ckpt",
|
| 78 |
+
# device=device,
|
| 79 |
)
|
| 80 |
return _separator
|
| 81 |
|
|
|
|
| 83 |
# ---------------------------------------------------------------------------
|
| 84 |
# Vocal separation utilities / 人声分离工具
|
| 85 |
# ---------------------------------------------------------------------------
|
| 86 |
+
@gpu_decorator
|
| 87 |
def separate_vocals(
|
| 88 |
audio_path: str,
|
| 89 |
device: str = "cuda:0",
|
|
|
|
| 95 |
Returns:
|
| 96 |
(vocals_path, accompaniment_path)
|
| 97 |
"""
|
| 98 |
+
separator = get_separator()
|
| 99 |
|
| 100 |
wav, sr = torchaudio.load(audio_path)
|
| 101 |
vocal_wav, inst_wav, out_sr = separator.separate(wav, sr)
|
|
|
|
| 151 |
# ---------------------------------------------------------------------------
|
| 152 |
# Inference wrapper / 推理入口
|
| 153 |
# ---------------------------------------------------------------------------
|
| 154 |
+
@gpu_decorator
|
| 155 |
def synthesize(
|
| 156 |
ref_audio,
|
| 157 |
melody_audio,
|
|
|
|
| 216 |
actual_melody_path = melody_vocals_path
|
| 217 |
|
| 218 |
# ---- Step 2: 模型推理 / Model inference ----------------------------------
|
| 219 |
+
model = get_model()
|
| 220 |
|
| 221 |
audio_tensor, sr = model(
|
| 222 |
ref_audio_path=actual_ref_path,
|
|
|
|
| 233 |
|
| 234 |
# 先保存纯人声合成结果 / Save raw vocal synthesis result
|
| 235 |
vocal_out_path = os.path.join(tempfile.mkdtemp(), "vocal_output.wav")
|
| 236 |
+
torchaudio.save(vocal_out_path, audio_tensor.to("cpu"), sample_rate=sr)
|
| 237 |
|
| 238 |
# ---- Step 3: 混合伴奏 / Mix accompaniment (optional) ---------------------
|
| 239 |
if (
|
requirements.txt
CHANGED
|
@@ -189,4 +189,11 @@ xxhash==3.6.0
|
|
| 189 |
yarl==1.20.1
|
| 190 |
zhconv==1.4.3
|
| 191 |
zhon==2.1.1
|
| 192 |
-
zipp==3.23.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
yarl==1.20.1
|
| 190 |
zhconv==1.4.3
|
| 191 |
zhon==2.1.1
|
| 192 |
+
zipp==3.23.0
|
| 193 |
+
jieba==0.42.1
|
| 194 |
+
pypinyin==0.55.0
|
| 195 |
+
descript-audio-codec==1.0.0
|
| 196 |
+
cn2an==0.5.23
|
| 197 |
+
onnxruntime==1.23.2
|
| 198 |
+
phonemizer==3.3.0
|
| 199 |
+
py3langid==0.3.0
|