xjsc0 committed on
Commit
a6f2de2
·
1 Parent(s): 99cf7e1
Files changed (2) hide show
  1. app.py +37 -7
  2. requirements.txt +8 -1
app.py CHANGED
@@ -14,6 +14,30 @@ import torch
14
  import torchaudio
15
  from initialization import download_files
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # ---------------------------------------------------------------------------
18
  # Model loading (lazy, singleton) / 模型懒加载(单例)
19
  # ---------------------------------------------------------------------------
@@ -21,18 +45,22 @@ _model = None
21
  _separator = None
22
 
23
 
24
- def get_model(device: str = "cuda:0"):
 
25
  """加载 YingMusicSinger 模型 / Load YingMusicSinger model."""
26
  download_files(task="infer")
27
  global _model
28
  if _model is None:
29
  from src.YingMusicSinger.infer.YingMusicSinger import YingMusicSinger
30
 
31
- _model = YingMusicSinger(device=device)
 
 
32
  return _model
33
 
34
 
35
- def get_separator(device: str = "cuda:0"):
 
36
  """
37
  加载 MelBandRoformer 分离模型 / Load MelBandRoformer separator.
38
  Returns a Separator instance ready for inference.
@@ -47,7 +75,7 @@ def get_separator(device: str = "cuda:0"):
47
  _separator = Separator(
48
  config_path="ckpts/config_vocals_mel_band_roformer_kj.yaml",
49
  checkpoint_path="ckpts/MelBandRoformer.ckpt",
50
- device=device,
51
  )
52
  return _separator
53
 
@@ -55,6 +83,7 @@ def get_separator(device: str = "cuda:0"):
55
  # ---------------------------------------------------------------------------
56
  # Vocal separation utilities / 人声分离工具
57
  # ---------------------------------------------------------------------------
 
58
  def separate_vocals(
59
  audio_path: str,
60
  device: str = "cuda:0",
@@ -66,7 +95,7 @@ def separate_vocals(
66
  Returns:
67
  (vocals_path, accompaniment_path)
68
  """
69
- separator = get_separator(device=device)
70
 
71
  wav, sr = torchaudio.load(audio_path)
72
  vocal_wav, inst_wav, out_sr = separator.separate(wav, sr)
@@ -122,6 +151,7 @@ def mix_vocal_and_accompaniment(
122
  # ---------------------------------------------------------------------------
123
  # Inference wrapper / 推理入口
124
  # ---------------------------------------------------------------------------
 
125
  def synthesize(
126
  ref_audio,
127
  melody_audio,
@@ -186,7 +216,7 @@ def synthesize(
186
  actual_melody_path = melody_vocals_path
187
 
188
  # ---- Step 2: 模型推理 / Model inference ----------------------------------
189
- model = get_model(device=device)
190
 
191
  audio_tensor, sr = model(
192
  ref_audio_path=actual_ref_path,
@@ -203,7 +233,7 @@ def synthesize(
203
 
204
  # 先保存纯人声合成结果 / Save raw vocal synthesis result
205
  vocal_out_path = os.path.join(tempfile.mkdtemp(), "vocal_output.wav")
206
- torchaudio.save(vocal_out_path, audio_tensor, sample_rate=sr)
207
 
208
  # ---- Step 3: 混合伴奏 / Mix accompaniment (optional) ---------------------
209
  if (
 
14
  import torchaudio
15
  from initialization import download_files
16
 
17
+
18
+ IS_HF_SPACE = os.environ.get("SPACE_ID") is not None
19
+ HF_ENABLE = False
20
+ LOCAL_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
21
+
22
+ try:
23
+ import spaces
24
+ except ImportError:
25
+ spaces = None
26
+
27
+
28
+ def gpu_decorator(fn):
29
+ if IS_HF_SPACE and HF_ENABLE and spaces is not None:
30
+ return spaces.GPU(fn)
31
+ return fn
32
+
33
+
34
+ def local_move2gpu(x):
35
+ """Move models to GPU on local environment. No-op on HuggingFace Spaces (ZeroGPU handles it)."""
36
+ if IS_HF_SPACE:
37
+ return x
38
+ return x.to(LOCAL_DEVICE)
39
+
40
+
41
  # ---------------------------------------------------------------------------
42
  # Model loading (lazy, singleton) / 模型懒加载(单例)
43
  # ---------------------------------------------------------------------------
 
45
  _separator = None
46
 
47
 
48
+ @gpu_decorator
49
+ def get_model():
50
  """加载 YingMusicSinger 模型 / Load YingMusicSinger model."""
51
  download_files(task="infer")
52
  global _model
53
  if _model is None:
54
  from src.YingMusicSinger.infer.YingMusicSinger import YingMusicSinger
55
 
56
+ _model = YingMusicSinger.from_pretrained("ASLP-lab/YingMusic-Singer")
57
+ _model = local_move2gpu(_model)
58
+ _model.eval()
59
  return _model
60
 
61
 
62
+ @gpu_decorator
63
+ def get_separator():
64
  """
65
  加载 MelBandRoformer 分离模型 / Load MelBandRoformer separator.
66
  Returns a Separator instance ready for inference.
 
75
  _separator = Separator(
76
  config_path="ckpts/config_vocals_mel_band_roformer_kj.yaml",
77
  checkpoint_path="ckpts/MelBandRoformer.ckpt",
78
+ # device=device,
79
  )
80
  return _separator
81
 
 
83
  # ---------------------------------------------------------------------------
84
  # Vocal separation utilities / 人声分离工具
85
  # ---------------------------------------------------------------------------
86
+ @gpu_decorator
87
  def separate_vocals(
88
  audio_path: str,
89
  device: str = "cuda:0",
 
95
  Returns:
96
  (vocals_path, accompaniment_path)
97
  """
98
+ separator = get_separator()
99
 
100
  wav, sr = torchaudio.load(audio_path)
101
  vocal_wav, inst_wav, out_sr = separator.separate(wav, sr)
 
151
  # ---------------------------------------------------------------------------
152
  # Inference wrapper / 推理入口
153
  # ---------------------------------------------------------------------------
154
+ @gpu_decorator
155
  def synthesize(
156
  ref_audio,
157
  melody_audio,
 
216
  actual_melody_path = melody_vocals_path
217
 
218
  # ---- Step 2: 模型推理 / Model inference ----------------------------------
219
+ model = get_model()
220
 
221
  audio_tensor, sr = model(
222
  ref_audio_path=actual_ref_path,
 
233
 
234
  # 先保存纯人声合成结果 / Save raw vocal synthesis result
235
  vocal_out_path = os.path.join(tempfile.mkdtemp(), "vocal_output.wav")
236
+ torchaudio.save(vocal_out_path, audio_tensor.to("cpu"), sample_rate=sr)
237
 
238
  # ---- Step 3: 混合伴奏 / Mix accompaniment (optional) ---------------------
239
  if (
requirements.txt CHANGED
@@ -189,4 +189,11 @@ xxhash==3.6.0
189
  yarl==1.20.1
190
  zhconv==1.4.3
191
  zhon==2.1.1
192
- zipp==3.23.0
 
 
 
 
 
 
 
 
189
  yarl==1.20.1
190
  zhconv==1.4.3
191
  zhon==2.1.1
192
+ zipp==3.23.0
193
+ jieba==0.42.1
194
+ pypinyin==0.55.0
195
+ descript-audio-codec==1.0.0
196
+ cn2an==0.5.23
197
+ onnxruntime==1.23.2
198
+ phonemizer==3.3.0
199
+ py3langid==0.3.0