xjsc0 committed on
Commit
dae4a7c
·
0 Parent(s):
Files changed (3) hide show
  1. README.md +41 -0
  2. app.py +478 -0
  3. requirements.txt +202 -0
README.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: YingMusic-Singer
3
+ emoji: 🎤
4
+ colorFrom: "#FE9EC7"
5
+ colorTo: "#89D4FF"
6
+ sdk: gradio
7
+ python_version: "3.10"
8
+ app_file: app.py
9
+ tags:
10
+ - singing-voice-synthesis
11
+ - lyric-editing
12
+ - diffusion-model
13
+ - reinforcement-learning
14
+ short_description: Melody-controllable singing voice synthesis with flexible lyric editing
15
+ fullWidth: true
16
+ ---
17
+
18
+ # YingMusic-Singer
19
+ YingMusic-Singer: Controllable Singing Voice Synthesis with Flexible Lyric Manipulation and Annotation-free Melody Guidance
20
+
21
+ ## Environment Setup
22
+
23
+ ### 1. Install from Scratch
24
+ ```bash
25
+ conda create -n YingMusic-Singer python=3.10
26
+ conda activate YingMusic-Singer
27
+
28
+ # uv is much quicker
29
+ pip install uv
30
+ uv pip install -r requirements.txt
31
+ ```
32
+
33
+ ### 2. Pre-built Conda Environment for One-Click Deployment (Nvidia / AMD CPU Only)
34
+
35
+ Coming soon
36
+
37
+ ## License
38
+
39
+ The code and model weights in this project are licensed under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/), except for the following components:
40
+
41
+ The VAE model weights and inference code (in `src/YingMusic-Singer/utils/stable-audio-tools`) are derived from [Stable Audio Open](https://huggingface.co/stabilityai/stable-audio-open-1.0) by Stability AI, and are licensed under the [Stability AI Community License](./LICENSE-STABILITY).
app.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ YingMusic Singer - Gradio Web Interface
3
+ ========================================
4
+ 基于参考音色与旋律音频的歌声合成系统,支持自动分离人声与伴奏。
5
+ A singing voice synthesis system powered by YingMusicSinger,
6
+ with built-in vocal/accompaniment separation via MelBandRoformer.
7
+ """
8
+
9
+ import gradio as gr
10
+ import torch
11
+ import torchaudio
12
+ import tempfile
13
+ import os
14
+ import numpy as np
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Model loading (lazy, singleton) / 模型懒加载(单例)
18
+ # ---------------------------------------------------------------------------
19
+ _model = None
20
+ _separator = None
21
+
22
+
23
+ def get_model(device: str = "cuda:0"):
24
+ """加载 YingMusicSinger 模型 / Load YingMusicSinger model."""
25
+ global _model
26
+ if _model is None:
27
+ from src.YingMusicSinger.infer.YingMusicSinger import YingMusicSinger
28
+
29
+ _model = YingMusicSinger(device=device)
30
+ return _model
31
+
32
+
33
+ def get_separator(device: str = "cuda:0"):
34
+ """
35
+ 加载 MelBandRoformer 分离模型 / Load MelBandRoformer separator.
36
+ Returns a Separator instance ready for inference.
37
+ """
38
+ global _separator
39
+ if _separator is None:
40
+ from src.third_party.MusicSourceSeparationTraining.inference_api import (
41
+ Separator,
42
+ )
43
+
44
+ _separator = Separator(
45
+ config_path="ckpts/config_vocals_mel_band_roformer_kj.yaml",
46
+ checkpoint_path="ckpts/MelBandRoformer.ckpt",
47
+ device=device,
48
+ )
49
+ return _separator
50
+
51
+
52
+ # ---------------------------------------------------------------------------
53
+ # Vocal separation utilities / 人声分离工具
54
+ # ---------------------------------------------------------------------------
55
def separate_vocals(
    audio_path: str,
    device: str = "cuda:0",
) -> tuple:
    """Split an audio file into vocal and accompaniment stems.

    Uses the lazily-loaded MelBandRoformer separator and writes both stems
    to a fresh temporary directory so they can be served as files.

    Args:
        audio_path: Path to the input audio file.
        device: Torch device string used when the separator is first loaded.

    Returns:
        (vocals_path, accompaniment_path) — paths to the two saved stems.
    """
    separator = get_separator(device=device)

    wav, sr = torchaudio.load(audio_path)
    vocal_wav, inst_wav, out_sr = separator.separate(wav, sr)

    def _to_2d(stem):
        # torchaudio.save expects a (channels, time) tensor. Accept either
        # numpy arrays or tensors, mono (1-D) or multi-channel (2-D), from
        # the separator instead of assuming a 2-D numpy array.
        t = torch.as_tensor(stem)
        return t.unsqueeze(0) if t.dim() == 1 else t

    # NOTE: the temp directory is intentionally left on disk so the web UI
    # can stream the files after this function returns.
    tmp_dir = tempfile.mkdtemp()
    vocals_path = os.path.join(tmp_dir, "vocals.wav")
    accomp_path = os.path.join(tmp_dir, "accompaniment.wav")
    torchaudio.save(vocals_path, _to_2d(vocal_wav), out_sr)
    torchaudio.save(accomp_path, _to_2d(inst_wav), out_sr)

    return vocals_path, accomp_path
78
+
79
+
80
def mix_vocal_and_accompaniment(
    vocal_path: str,
    accomp_path: str,
    vocal_gain: float = 1.0,
) -> str:
    """Mix synthesised vocals with an accompaniment into a final audio file.

    The accompaniment is resampled to the vocal sample rate, channel counts
    are reconciled, both signals are truncated to the shorter duration, and
    the sum is peak-normalised only if it would clip.

    Args:
        vocal_path: Path to the vocal audio file.
        accomp_path: Path to the accompaniment audio file.
        vocal_gain: Linear gain applied to the vocals before summing.

    Returns:
        Path to the mixed WAV file in a fresh temporary directory.
    """
    vocal_wav, vocal_sr = torchaudio.load(vocal_path)
    accomp_wav, accomp_sr = torchaudio.load(accomp_path)

    # Resample accompaniment to match the vocal sample rate.
    if accomp_sr != vocal_sr:
        accomp_wav = torchaudio.functional.resample(accomp_wav, accomp_sr, vocal_sr)

    # Reconcile channel counts. Tensor.expand only broadcasts from a size-1
    # dim, so broadcast whichever side is mono; if both are multi-channel
    # but mismatched, trim to the common channel count instead of crashing.
    if vocal_wav.shape[0] != accomp_wav.shape[0]:
        if vocal_wav.shape[0] == 1:
            vocal_wav = vocal_wav.expand(accomp_wav.shape[0], -1)
        elif accomp_wav.shape[0] == 1:
            accomp_wav = accomp_wav.expand(vocal_wav.shape[0], -1)
        else:
            common = min(vocal_wav.shape[0], accomp_wav.shape[0])
            vocal_wav = vocal_wav[:common]
            accomp_wav = accomp_wav[:common]

    # Align lengths to the shorter signal.
    min_len = min(vocal_wav.shape[1], accomp_wav.shape[1])
    vocal_wav = vocal_wav[:, :min_len]
    accomp_wav = accomp_wav[:, :min_len]

    mixed = vocal_wav * vocal_gain + accomp_wav
    # Normalise only when the mix would clip.
    peak = mixed.abs().max()
    if peak > 1.0:
        mixed = mixed / peak

    out_path = os.path.join(tempfile.mkdtemp(), "mixed_output.wav")
    torchaudio.save(out_path, mixed, sample_rate=vocal_sr)
    return out_path
117
+
118
+
119
+ # ---------------------------------------------------------------------------
120
+ # Inference wrapper / 推理入口
121
+ # ---------------------------------------------------------------------------
122
def synthesize(
    ref_audio,
    melody_audio,
    ref_text,
    target_text,
    separate_vocals_flag,
    mix_accompaniment_flag,
    sil_len_to_end,
    t_shift,
    nfe_step,
    cfg_strength,
    seed,
):
    """End-to-end synthesis pipeline behind the "Synthesize" button.

    Steps:
      1. Optionally isolate vocals from both the reference and the melody
         audio with MelBandRoformer.
      2. Run YingMusicSinger on the (possibly separated) vocals.
      3. Optionally mix the synthesised vocals back with the melody
         accompaniment.

    Returns:
        Path to the generated audio file (pure vocals, or vocals mixed
        with accompaniment).
    """
    import random

    # -- Input validation ---------------------------------------------------
    if ref_audio is None:
        raise gr.Error("请上传参考音频 / Please upload Reference Audio")
    if melody_audio is None:
        raise gr.Error("请上传旋律音频 / Please upload Melody Audio")
    if not ref_text.strip():
        raise gr.Error("请输入参考音频对应的歌词 / Please enter Reference Text")
    if not target_text.strip():
        raise gr.Error("请输入目标合成歌词 / Please enter Target Text")

    def _as_path(value):
        # Gradio may hand back a plain filepath or a (path, ...) tuple.
        return value if isinstance(value, str) else value[0]

    ref_path = _as_path(ref_audio)
    melody_path = _as_path(melody_audio)

    # A negative seed means "pick one at random".
    actual_seed = int(seed)
    if actual_seed < 0:
        actual_seed = random.randint(0, 2**31 - 1)

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # -- Step 1: optional vocal separation for both inputs ------------------
    melody_accomp_path = None
    if separate_vocals_flag:
        # Only the vocal stem of the reference audio is kept.
        ref_path, _ = separate_vocals(ref_path, device=device)
        # The melody accompaniment is retained for the optional final mix.
        melody_path, melody_accomp_path = separate_vocals(
            melody_path, device=device
        )

    # -- Step 2: model inference --------------------------------------------
    model = get_model(device=device)

    audio_tensor, sr = model(
        ref_audio_path=ref_path,
        melody_audio_path=melody_path,
        ref_text=ref_text.strip(),
        target_text=target_text.strip(),
        lrc_align_mode="sentence_level",
        sil_len_to_end=float(sil_len_to_end),
        t_shift=float(t_shift),
        nfe_step=int(nfe_step),
        cfg_strength=float(cfg_strength),
        seed=actual_seed,
    )

    # Persist the raw vocal result so it can be returned or mixed.
    vocal_out_path = os.path.join(tempfile.mkdtemp(), "vocal_output.wav")
    torchaudio.save(vocal_out_path, audio_tensor, sample_rate=sr)

    # -- Step 3: optional accompaniment mix ---------------------------------
    wants_mix = separate_vocals_flag and mix_accompaniment_flag
    if wants_mix and melody_accomp_path is not None:
        return mix_vocal_and_accompaniment(vocal_out_path, melody_accomp_path)
    return vocal_out_path
215
+
216
+
217
+ # ---------------------------------------------------------------------------
218
+ # Custom CSS / 自定义样式
219
+ # ---------------------------------------------------------------------------
220
+ CUSTOM_CSS = """
221
+ @import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,500;0,9..40,700;1,9..40,400&family=Playfair+Display:wght@600;800&display=swap');
222
+
223
+ :root {
224
+ --primary: #e85d04;
225
+ --primary-light: #f48c06;
226
+ --bg-dark: #0d1117;
227
+ --surface: #161b22;
228
+ --surface-light: #21262d;
229
+ --text: #f0f6fc;
230
+ --text-muted: #8b949e;
231
+ --accent-glow: rgba(232, 93, 4, 0.15);
232
+ --border: #30363d;
233
+ }
234
+
235
+ .gradio-container {
236
+ font-family: 'DM Sans', sans-serif !important;
237
+ max-width: 1100px !important;
238
+ margin: auto !important;
239
+ }
240
+
241
+ /* ---------- Header / 头部 ---------- */
242
+ #app-header {
243
+ text-align: center;
244
+ padding: 2.5rem 1rem 1.5rem;
245
+ }
246
+ #app-header h1 {
247
+ font-family: 'Playfair Display', serif !important;
248
+ font-size: 2.6rem !important;
249
+ font-weight: 800 !important;
250
+ background: linear-gradient(135deg, #f48c06, #e85d04, #dc2f02);
251
+ -webkit-background-clip: text;
252
+ -webkit-text-fill-color: transparent;
253
+ margin-bottom: 0.3rem !important;
254
+ letter-spacing: -0.02em;
255
+ }
256
+ #app-header p {
257
+ color: var(--text-muted);
258
+ font-size: 1.05rem;
259
+ margin-top: 0;
260
+ }
261
+
262
+ /* ---------- Section labels / 分区标题 ---------- */
263
+ .section-title {
264
+ font-family: 'DM Sans', sans-serif !important;
265
+ font-weight: 700 !important;
266
+ font-size: 1rem !important;
267
+ letter-spacing: 0.06em;
268
+ text-transform: uppercase;
269
+ color: var(--primary-light) !important;
270
+ border-bottom: 2px solid var(--primary);
271
+ padding-bottom: 6px;
272
+ margin-bottom: 12px !important;
273
+ }
274
+
275
+ /* ---------- Run button / 合成按钮 ---------- */
276
+ #run-btn {
277
+ background: linear-gradient(135deg, #e85d04, #dc2f02) !important;
278
+ border: none !important;
279
+ color: #fff !important;
280
+ font-weight: 700 !important;
281
+ font-size: 1.1rem !important;
282
+ letter-spacing: 0.04em;
283
+ padding: 12px 0 !important;
284
+ border-radius: 10px !important;
285
+ transition: transform 0.15s, box-shadow 0.25s !important;
286
+ box-shadow: 0 4px 20px rgba(232, 93, 4, 0.35) !important;
287
+ }
288
+ #run-btn:hover {
289
+ transform: translateY(-1px) !important;
290
+ box-shadow: 0 6px 28px rgba(232, 93, 4, 0.5) !important;
291
+ }
292
+
293
+ /* ---------- Output audio / 输出音频 ---------- */
294
+ #output-audio {
295
+ border: 2px solid var(--primary) !important;
296
+ border-radius: 12px !important;
297
+ background: var(--accent-glow) !important;
298
+ }
299
+ """
300
+
301
+
302
+ # ---------------------------------------------------------------------------
303
+ # Build the Gradio UI / 构建界面
304
+ # ---------------------------------------------------------------------------
305
def build_ui():
    """Construct and return the Gradio Blocks application.

    Layout: header, audio inputs and lyrics side by side, vocal-separation
    options, a collapsible advanced-parameter panel, the run button, and the
    output player. Also wires the separation checkbox to enable/disable the
    accompaniment-mix checkbox and binds the run button to `synthesize`.
    """
    with gr.Blocks(
        css=CUSTOM_CSS, title="YingMusic Singer", theme=gr.themes.Base()
    ) as demo:
        # ---- Header ----
        gr.HTML(
            """
            <div id="app-header">
                <h1>♫ YingMusic Singer</h1>
                <p>基于参考音色与旋律音频的歌声合成系统 &nbsp;·&nbsp; Singing Voice Synthesis</p>
            </div>
            """
        )

        # ================================================================
        # ROW 1 – Audio inputs + lyrics (side by side)
        # ================================================================
        with gr.Row(equal_height=True):
            # ---- Left column: audio uploads ----
            with gr.Column(scale=1):
                gr.Markdown(
                    "#### 🎙️ 音频输入 / Audio Inputs", elem_classes="section-title"
                )
                ref_audio = gr.Audio(
                    label="参考音频 / Reference Audio(提供音色 / provides timbre)",
                    type="filepath",
                )
                melody_audio = gr.Audio(
                    label="旋律音频 / Melody Audio(提供旋律与时长 / provides melody & duration)",
                    type="filepath",
                )

            # ---- Right column: lyrics ----
            with gr.Column(scale=1):
                gr.Markdown("#### ✏️ 歌词 / Lyrics", elem_classes="section-title")
                ref_text = gr.Textbox(
                    label="参考音频歌词 / Reference Text",
                    placeholder="例如 / e.g.:该体谅的不执着|如果那天我",
                    lines=5,
                )
                target_text = gr.Textbox(
                    label="目标合成歌词 / Target Text",
                    placeholder="例如 / e.g.:好多天|看不完你",
                    lines=5,
                )

        # ================================================================
        # ROW 2 – Vocal separation options
        # ================================================================
        gr.Markdown("#### 🎚️ 伴奏分离 / Vocal Separation", elem_classes="section-title")
        with gr.Row():
            separate_vocals_flag = gr.Checkbox(
                value=True,
                label="分离人声后过模型 / Separate vocals before synthesis",
                info=(
                    "从参考音频和旋律音频中分离人声,仅用人声送入模型 / "
                    "Extract vocals from both reference and melody audio before feeding into the model"
                ),
            )
            mix_accompaniment_flag = gr.Checkbox(
                value=False,
                interactive=True,
                label="输出时混入伴奏 / Mix accompaniment into output",
                info=(
                    "将合成人声与分离出的伴奏混合输出(需先开启人声分离)/ "
                    "Mix synthesised vocals with separated accompaniment (requires vocal separation)"
                ),
            )

        # ================================================================
        # ROW 3 – Advanced parameters (collapsible)
        # ================================================================
        with gr.Accordion("⚙️ 高级参数 / Advanced Parameters", open=False):
            with gr.Row():
                nfe_step = gr.Slider(
                    minimum=4,
                    maximum=128,
                    value=32,
                    step=1,
                    label="采样步数 / NFE Steps",
                    info="更多步数 = 更高质量,但更慢 / More steps = higher quality, but slower",
                )
                cfg_strength = gr.Slider(
                    minimum=0.0,
                    maximum=10.0,
                    value=3.0,
                    step=0.1,
                    label="CFG 强度 / CFG Strength",
                    info="Classifier‑Free Guidance 强度 / Classifier‑Free Guidance strength",
                )
                t_shift = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.01,
                    label="采样时间偏移 / t‑shift",
                )
            with gr.Row():
                sil_len_to_end = gr.Slider(
                    minimum=0.0,
                    maximum=3.0,
                    value=0.5,
                    step=0.1,
                    label="末尾静音时长 (秒) / Silence Padding (s)",
                    info="参考音频末尾追加的静音 / Silence appended after reference audio",
                )
                seed = gr.Number(
                    value=-1,
                    precision=0,
                    label="随机种子 / Random Seed",
                    info="-1 表示随机 / -1 means random",
                )

        # ================================================================
        # ROW 4 – Run button & output
        # ================================================================
        run_btn = gr.Button("🎤 开始合成 / Synthesize", elem_id="run-btn", size="lg")

        output_audio = gr.Audio(
            label="合成结果 / Generated Audio",
            type="filepath",
            elem_id="output-audio",
        )

        # ---- Disable the mix checkbox when separation is off ----
        # BUGFIX: the previous lambda always set value=False (the conditional
        # `False if not sep else False` was dead), so merely re-enabling
        # separation also cleared the user's mix choice. Now the value is
        # only cleared when separation is turned OFF.
        separate_vocals_flag.change(
            fn=lambda sep: (
                gr.update(interactive=True)
                if sep
                else gr.update(interactive=False, value=False)
            ),
            inputs=[separate_vocals_flag],
            outputs=[mix_accompaniment_flag],
        )

        # ---- Wire up the synthesis callback ----
        run_btn.click(
            fn=synthesize,
            inputs=[
                ref_audio,
                melody_audio,
                ref_text,
                target_text,
                separate_vocals_flag,
                mix_accompaniment_flag,
                sil_len_to_end,
                t_shift,
                nfe_step,
                cfg_strength,
                seed,
            ],
            outputs=output_audio,
        )

        # ---- Footer ----
        gr.Markdown(
            """
            ---
            <center style="color:#8b949e; font-size:0.85rem;">
            YingMusic Singer &nbsp;·&nbsp; 基于 Flow Matching + VAE / Powered by Flow Matching + VAE &nbsp;·&nbsp;
            用 <code>|</code> 分隔歌词中的乐句 / Use <code>|</code> to separate phrases in lyrics
            </center>
            """,
        )

    return demo
470
+
471
+
472
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo = build_ui()
    # Queueing processes long-running synthesis requests sequentially
    # instead of tying up the HTTP worker per request.
    demo.queue()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
requirements.txt ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.3.1
2
+ accelerate==1.12.0
3
+ aiofiles==24.1.0
4
+ aiohappyeyeballs==2.6.1
5
+ aiohttp==3.12.15
6
+ aiosignal==1.4.0
7
+ alias-free-torch==0.0.6
8
+ annotated-types==0.7.0
9
+ antlr4-python3-runtime==4.9.3
10
+ anyio==4.11.0
11
+ argbind==0.3.9
12
+ argparse==1.4.0
13
+ async-timeout==5.0.1
14
+ attrs==25.3.0
15
+ audiobox-aesthetics==0.0.4
16
+ audioread==3.0.1
17
+ av==16.0.1
18
+ beartype==0.22.2
19
+ bitsandbytes==0.48.1
20
+ brotli==1.1.0
21
+ certifi==2025.8.3
22
+ cffi==2.0.0
23
+ chardet==5.2.0
24
+ charset-normalizer==3.4.3
25
+ click==8.3.0
26
+ cloudpickle==3.1.1
27
+ colorama==0.4.6
28
+ coloredlogs==15.0.1
29
+ configparser==7.2.0
30
+ cryptography==46.0.3
31
+ datasets==2.21.0
32
+ dctorch==0.1.2
33
+ decorator==5.2.1
34
+ deprecated==1.2.18
35
+ descript-audio-codec==1.0.0
36
+ descript-audiotools==0.7.2
37
+ diffusers==0.36.0
38
+ dill==0.3.8
39
+ distro==1.9.0
40
+ dlinfo==2.0.0
41
+ docstring-parser==0.17.0
42
+ easydict==1.13
43
+ einops==0.8.1
44
+ einops-exts==0.0.4
45
+ einx==0.3.0
46
+ ema-pytorch==0.7.7
47
+ encodec==0.1.1
48
+ exceptiongroup==1.3.0
49
+ fastapi==0.118.0
50
+ ffmpy==0.6.1
51
+ filelock==3.19.1
52
+ fire==0.7.1
53
+ flash-attn==2.8.3
54
+ flatten-dict==0.4.2
55
+ frozendict==2.4.6
56
+ frozenlist==1.7.0
57
+ fsspec==2024.6.1
58
+ ftfy==6.3.1
59
+ future==1.0.0
60
+ gin-config==0.5.0
61
+ gradio==5.49.0
62
+ gradio-client==1.13.3
63
+ grpcio==1.75.1
64
+ h11==0.16.0
65
+ hf-xet==1.1.10
66
+ httpcore==1.0.9
67
+ httpx==0.28.1
68
+ huggingface-hub==0.35.3
69
+ humanfriendly==10.0
70
+ hydra-core==1.3.2
71
+ idna==3.10
72
+ importlib-metadata==8.7.1
73
+ importlib-resources==6.5.2
74
+ jinja2==3.1.6
75
+ jiter==0.12.0
76
+ joblib==1.5.2
77
+ jsonmerge==1.9.2
78
+ jsonschema==4.25.1
79
+ jsonschema-specifications==2025.9.1
80
+ julius==0.2.7
81
+ k-diffusion==0.1.1.post1
82
+ lameenc==1.8.1
83
+ lazy-loader==0.4
84
+ librosa==0.10.2.post1
85
+ lightning==2.6.1
86
+ lightning-utilities==0.15.2
87
+ llvmlite==0.45.1
88
+ loguru==0.7.3
89
+ loralib==0.1.2
90
+ markdown==3.9
91
+ markdown-it-py==4.0.0
92
+ markdown2==2.5.4
93
+ markupsafe==3.0.3
94
+ mdurl==0.1.2
95
+ midi2audio==0.1.1
96
+ mido==1.3.3
97
+ ml-collections==1.1.0
98
+ modelscope==1.32.0
99
+ more-itertools==10.8.0
100
+ mpmath==1.3.0
101
+ msgpack==1.1.1
102
+ multidict==6.6.4
103
+ multiprocess==0.70.16
104
+ munch==4.0.0
105
+ muq==0.1.0
106
+ music21==9.9.1
107
+ mutagen==1.47.0
108
+ networkx==3.4.2
109
+ nnaudio==0.3.3
110
+ numba==0.62.1
111
+ numpy==2.2.6
112
+ omegaconf==2.3.0
113
+ orjson==3.11.3
114
+ packaging==25.0
115
+ pandas==2.2.3
116
+ peft==0.11.1
117
+ platformdirs==4.4.0
118
+ pooch==1.8.2
119
+ prefigure==0.0.10
120
+ pretty-midi==0.2.11
121
+ propcache==0.3.2
122
+ protobuf==6.33.2
123
+ psutil==7.1.0
124
+ pycparser==2.23
125
+ pydantic==2.11.9
126
+ pydantic-core==2.33.2
127
+ pydub==0.25.1
128
+ pylance==0.23.2
129
+ pyloudnorm==0.1.1
130
+ pylrc==0.1.2
131
+ pyparsing==3.2.5
132
+ python-dateutil==2.9.0.post0
133
+ python-multipart==0.0.20
134
+ pytorch-lightning==2.5.5
135
+ pytz==2025.2
136
+ pyyaml==6.0.3
137
+ qwen-omni-utils==0.0.8
138
+ rapidfuzz==2.13.7
139
+ referencing==0.36.2
140
+ regex==2025.9.18
141
+ requests==2.32.5
142
+ resampy==0.4.3
143
+ retrying==1.4.2
144
+ rfc3986==1.5.0
145
+ rich==14.1.0
146
+ rotary-embedding-torch==0.8.9
147
+ rpds-py==0.27.1
148
+ ruamel-yaml==0.19.1
149
+ safehttpx==0.1.6
150
+ safetensors==0.6.2
151
+ scikit-learn==1.7.2
152
+ scipy==1.15.3
153
+ segments==2.3.0
154
+ semantic-version==2.10.0
155
+ sentencepiece==0.2.1
156
+ setuptools==80.9.0
157
+ shellingham==1.5.4
158
+ six==1.17.0
159
+ sniffio==1.3.1
160
+ soundfile==0.13.1
161
+ sox==1.5.0
162
+ soxr==1.0.0
163
+ starlette==0.48.0
164
+ sympy==1.13.1
165
+ tensorboard==2.20.0
166
+ tensorboard-data-server==0.7.2
167
+ tensorboardx==2.6.4
168
+ termcolor==3.1.0
169
+ threadpoolctl==3.6.0
170
+ tokenizers==0.22.2
171
+ tomlkit==0.13.3
172
+ torch==2.6.0
173
+ torch-complex==0.4.4
174
+ torchaudio==2.6.0
175
+ torchcodec==0.10.0
176
+ torchcrepe==0.0.24
177
+ torchdiffeq==0.2.5
178
+ torchmetrics==1.8.2
179
+ torchsde==0.2.6
180
+ tqdm==4.67.1
181
+ trampoline==0.1.2
182
+ transformers==4.57.6
183
+ triton==3.2.0
184
+ typeguard==4.4.4
185
+ typer==0.19.2
186
+ typing-extensions==4.15.0
187
+ typing-inspection==0.4.2
188
+ tzdata==2025.2
189
+ unidecode==1.3.8
190
+ urllib3==2.5.0
191
+ uvicorn==0.37.0
192
+ vector-quantize-pytorch==1.23.2
193
+ vocos==0.1.0
194
+ wandb==0.22.1
195
+ websockets==15.0.1
196
+ wrapt==1.17.3
197
+ x-transformers==2.1.2
198
+ xxhash==3.6.0
199
+ yarl==1.20.1
200
+ zhconv==1.4.3
201
+ zhon==2.1.1
202
+ zipp==3.23.0