ayf3 committed on
Commit
966d861
·
verified ·
1 Parent(s): c77cc0b

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +53 -74
app.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  """
3
- NumberBlocks One Voice Cloning Space - VoxCPM V3 (v2)
4
- Fixed: audio preprocessing to ensure correct format for VoxCPM2
5
  """
6
 
7
  import os
@@ -9,13 +9,9 @@ import gradio as gr
9
  import tempfile
10
  import soundfile as sf
11
  import traceback
12
- import librosa
13
- import numpy as np
14
  from pathlib import Path
15
 
16
- # Target sample rate for VoxCPM2
17
- TARGET_SR = 24000
18
-
19
  HF_TOKEN = os.environ.get("HF_TOKEN", os.environ.get("HUGGINGFACE_TOKEN"))
20
 
21
  def load_model():
@@ -29,7 +25,26 @@ def load_model():
29
  print(f"PyTorch version: {torch.__version__}")
30
  print(f"CUDA available: {torch.cuda.is_available()}")
31
 
 
32
  model = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False, optimize=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  print("Model loaded successfully!")
34
  return model, device, None
35
  except Exception as e:
@@ -37,7 +52,7 @@ def load_model():
37
  traceback.print_exc()
38
  return None, "cpu", str(e)
39
 
40
- # Global model state
41
  MODEL_STATE = {
42
  "model": None,
43
  "device": "cpu",
@@ -61,45 +76,13 @@ def ensure_model():
61
  MODEL_STATE["loading"] = False
62
  return MODEL_STATE["model"], MODEL_STATE["device"], MODEL_STATE["error"]
63
 
64
- def preprocess_audio(audio_path):
65
- """Preprocess audio to ensure correct format for VoxCPM2.
66
-
67
- VoxCPM2 expects:
68
- - Sample rate: 24kHz (model's _encode_sample_rate)
69
- - Mono channel
70
- - Float32 WAV format
71
-
72
- Returns path to preprocessed temp WAV file.
73
- """
74
- print(f"Preprocessing audio: {audio_path}")
75
-
76
- # Load with librosa (handles resampling automatically)
77
- audio, sr = librosa.load(audio_path, sr=TARGET_SR, mono=True)
78
-
79
- # Ensure float32
80
- audio = audio.astype(np.float32)
81
-
82
- # Normalize amplitude
83
- max_val = np.abs(audio).max()
84
- if max_val > 0:
85
- audio = audio / max_val * 0.95
86
-
87
- # Ensure minimum length (at least 1 second)
88
- min_samples = TARGET_SR # 1 second
89
- if len(audio) < min_samples:
90
- audio = np.pad(audio, (0, min_samples - len(audio)))
91
-
92
- # Save to temp file
93
- tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
94
- sf.write(tmp.name, audio, TARGET_SR)
95
- print(f"Preprocessed: {len(audio)/TARGET_SR:.2f}s at {TARGET_SR}Hz, saved to {tmp.name}")
96
-
97
- return tmp.name
98
-
99
  def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
100
  """生成音频"""
101
- if not text or not text.strip():
102
- return None, "❌ 请输入文本"
 
 
 
103
 
104
  try:
105
  model, device, error = ensure_model()
@@ -108,21 +91,22 @@ def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
108
  if model is None:
109
  return None, "❌ 模型正在加载中,请稍候..."
110
 
111
- # Preprocess reference audio if provided
112
- temp_files = []
113
- if reference_audio:
114
- try:
115
- ref_path = preprocess_audio(reference_audio)
116
- temp_files.append(ref_path)
117
- print(f"Using preprocessed reference audio")
118
- except Exception as e:
119
- return None, f"❌ 参考音频预处理失败: {e}"
120
- else:
121
- return None, "❌ 请上传参考音频"
122
 
123
  print(f"Generating with text: {text[:50]}...")
 
124
 
125
- # Generate audio
126
  import time
127
  t0 = time.time()
128
  wav = model.generate(
@@ -133,23 +117,17 @@ def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
133
  )
134
  elapsed = time.time() - t0
135
 
136
- # Get sample rate from model
137
  sample_rate = model.tts_model.sample_rate
138
-
139
- # Save output
140
  output_path = "/tmp/voxcpm_output.wav"
141
  sf.write(output_path, wav, sample_rate)
142
 
143
  duration = len(wav) / sample_rate
144
- msg = f"✅ 生成成功! 时长: {duration:.2f}s, 耗时: {elapsed:.1f}s, 采样率: {sample_rate}Hz"
145
  print(msg)
146
 
147
- # Cleanup temp files
148
- for f in temp_files:
149
- try:
150
- os.unlink(f)
151
- except:
152
- pass
153
 
154
  return output_path, msg
155
 
@@ -159,16 +137,16 @@ def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
159
  traceback.print_exc()
160
  return None, error_msg
161
 
162
- # Preset texts
163
  PRESET_TEXTS = {
164
  "问候": "Hello! I am One! I am the first Numberblock, and I love being number one!",
165
- "计数": "One, two, three, four, five! Counting is so much fun!",
166
  "情感": "Sometimes I feel a little lonely being just one, but then I remember that one is the start of everything!",
167
  }
168
 
169
- # Create Gradio interface
170
  with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
171
- gr.Markdown("# 🎭 NumberBlocks One Voice Cloning (VoxCPM V3)")
172
  gr.Markdown("### 使用 VoxCPM 2 模型克隆 One 的声音")
173
 
174
  with gr.Row():
@@ -186,7 +164,7 @@ with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
186
 
187
  with gr.Column():
188
  ref_audio_input = gr.Audio(
189
- label="参考音频 (One 的声音, 建议 5-15 秒清晰语音)",
190
  type="filepath"
191
  )
192
 
@@ -221,14 +199,15 @@ with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
221
  gr.Markdown("---")
222
  gr.Markdown("### 说明")
223
  gr.Markdown("""
224
- - **参考音频**: 上传 One 的声音片段(建议 5-15 秒清晰语音,自动预处理为 24kHz 单声道)
225
  - **CFG Value**: 控制音色相似度,默认 2.0,越高越像参考音色
226
  - **推理步数**: 默认 10,越高质量越好但生成越慢
227
  - **模型**: VoxCPM 2 (openbmb/VoxCPM2)
228
- - **注意**: 当前运行在 CPU 上,生成速度较慢
229
  """)
230
 
231
  if __name__ == "__main__":
 
232
  import threading
233
  def preload():
234
  print("Preloading VoxCPM model...")
 
1
  #!/usr/bin/env python3
2
  """
3
+ NumberBlocks One Voice Cloning Space - VoxCPM V4
4
+ Fix: Force float32 on CPU to avoid bfloat16 dimension errors in MiniCPM4 attention
5
  """
6
 
7
  import os
 
9
  import tempfile
10
  import soundfile as sf
11
  import traceback
 
 
12
  from pathlib import Path
13
 
14
+ # 环境变量检查
 
 
15
  HF_TOKEN = os.environ.get("HF_TOKEN", os.environ.get("HUGGINGFACE_TOKEN"))
16
 
17
  def load_model():
 
25
  print(f"PyTorch version: {torch.__version__}")
26
  print(f"CUDA available: {torch.cuda.is_available()}")
27
 
28
+ # Load model (optimize=False to avoid torch.compile issues)
29
  model = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False, optimize=False)
30
+
31
+ # CRITICAL FIX: Force float32 on CPU
32
+ # VoxCPM2 uses bfloat16 by default, which causes "Dimension out of range" errors
33
+ # in MiniCPM4's scaled_dot_product_attention on CPU
34
+ if device == "cpu":
35
+ print("Converting model to float32 for CPU compatibility...")
36
+ model.tts_model = model.tts_model.to(torch.float32)
37
+ # Also fix KV caches (they are created with config dtype = bfloat16)
38
+ if hasattr(model.tts_model, 'base_lm') and hasattr(model.tts_model.base_lm, 'kv_cache'):
39
+ if model.tts_model.base_lm.kv_cache is not None:
40
+ model.tts_model.base_lm.kv_cache.kv_cache = model.tts_model.base_lm.kv_cache.kv_cache.to(torch.float32)
41
+ print(" base_lm KV cache converted to float32")
42
+ if hasattr(model.tts_model, 'residual_lm') and hasattr(model.tts_model.residual_lm, 'kv_cache'):
43
+ if model.tts_model.residual_lm.kv_cache is not None:
44
+ model.tts_model.residual_lm.kv_cache.kv_cache = model.tts_model.residual_lm.kv_cache.kv_cache.to(torch.float32)
45
+ print(" residual_lm KV cache converted to float32")
46
+ print("Model conversion to float32 complete!")
47
+
48
  print("Model loaded successfully!")
49
  return model, device, None
50
  except Exception as e:
 
52
  traceback.print_exc()
53
  return None, "cpu", str(e)
54
 
55
+ # 全局模型状态
56
  MODEL_STATE = {
57
  "model": None,
58
  "device": "cpu",
 
76
  MODEL_STATE["loading"] = False
77
  return MODEL_STATE["model"], MODEL_STATE["device"], MODEL_STATE["error"]
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
80
  """生成音频"""
81
+ if not text or not reference_audio:
82
+ return None, "❌ 请输入文本和参考音频"
83
+
84
+ if not text.strip():
85
+ return None, "❌ 文本不能为空"
86
 
87
  try:
88
  model, device, error = ensure_model()
 
91
  if model is None:
92
  return None, "❌ 模型正在加载中,请稍候..."
93
 
94
+ # 读取参考音频
95
+ ref_audio, sr = sf.read(reference_audio)
96
+
97
+ # 如果是立体声,转换为单声道
98
+ if len(ref_audio.shape) > 1:
99
+ ref_audio = ref_audio[:, 0]
100
+
101
+ # 保存到临时文件
102
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
103
+ sf.write(tmp.name, ref_audio, sr)
104
+ ref_path = tmp.name
105
 
106
  print(f"Generating with text: {text[:50]}...")
107
+ print(f"Reference audio: {len(ref_audio)/sr:.2f}s at {sr}Hz")
108
 
109
+ # 生成音频
110
  import time
111
  t0 = time.time()
112
  wav = model.generate(
 
117
  )
118
  elapsed = time.time() - t0
119
 
120
+ # 保存输出
121
  sample_rate = model.tts_model.sample_rate
 
 
122
  output_path = "/tmp/voxcpm_output.wav"
123
  sf.write(output_path, wav, sample_rate)
124
 
125
  duration = len(wav) / sample_rate
126
+ msg = f"✅ 生成成功! 时长: {duration:.2f}s, 耗时: {elapsed:.1f}s, 设备: {device}"
127
  print(msg)
128
 
129
+ # 清理临时文件
130
+ os.unlink(ref_path)
 
 
 
 
131
 
132
  return output_path, msg
133
 
 
137
  traceback.print_exc()
138
  return None, error_msg
139
 
140
+ # 预设文本
141
  PRESET_TEXTS = {
142
  "问候": "Hello! I am One! I am the first Numberblock, and I love being number one!",
143
+ "计数": "One, two, three, four, five! Counting is so much fun! I can count all the way to ten!",
144
  "情感": "Sometimes I feel a little lonely being just one, but then I remember that one is the start of everything!",
145
  }
146
 
147
+ # 创建 Gradio 界面
148
  with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
149
+ gr.Markdown("# 🎭 NumberBlocks One Voice Cloning (VoxCPM V4)")
150
  gr.Markdown("### 使用 VoxCPM 2 模型克隆 One 的声音")
151
 
152
  with gr.Row():
 
164
 
165
  with gr.Column():
166
  ref_audio_input = gr.Audio(
167
+ label="参考音频 (One 的声音)",
168
  type="filepath"
169
  )
170
 
 
199
  gr.Markdown("---")
200
  gr.Markdown("### 说明")
201
  gr.Markdown("""
202
+ - **参考音频**: 上传 One 的声音片段(建议 5-15 秒清晰语音)
203
  - **CFG Value**: 控制音色相似度,默认 2.0,越高越像参考音色
204
  - **推理步数**: 默认 10,越高质量越好但生成越慢
205
  - **模型**: VoxCPM 2 (openbmb/VoxCPM2)
206
+ - **V4 修复**: CPU 上使用 float32 避免 bfloat16 维度错误
207
  """)
208
 
209
  if __name__ == "__main__":
210
+ # 启动时预加载模型
211
  import threading
212
  def preload():
213
  print("Preloading VoxCPM model...")