Numberblocks1Voice

Sleeping

App Files Community

ayf3 committed on Apr 9

Commit

966d861

verified ·

1 Parent(s): c77cc0b

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +53 -74

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
-NumberBlocks One Voice Cloning Space - VoxCPM V3 (v2)
-Fixed: audio preprocessing to ensure correct format for VoxCPM2
 """
 import os
@@ -9,13 +9,9 @@ import gradio as gr
 import tempfile
 import soundfile as sf
 import traceback
-import librosa
-import numpy as np
 from pathlib import Path
-# Target sample rate for VoxCPM2
-TARGET_SR = 24000
 HF_TOKEN = os.environ.get("HF_TOKEN", os.environ.get("HUGGINGFACE_TOKEN"))
 def load_model():
@@ -29,7 +25,26 @@ def load_model():
         print(f"PyTorch version: {torch.__version__}")
         print(f"CUDA available: {torch.cuda.is_available()}")
         model = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False, optimize=False)
         print("Model loaded successfully!")
         return model, device, None
     except Exception as e:
@@ -37,7 +52,7 @@ def load_model():
         traceback.print_exc()
         return None, "cpu", str(e)
-# Global model state
 MODEL_STATE = {
     "model": None,
     "device": "cpu",
@@ -61,45 +76,13 @@ def ensure_model():
             MODEL_STATE["loading"] = False
     return MODEL_STATE["model"], MODEL_STATE["device"], MODEL_STATE["error"]
-def preprocess_audio(audio_path):
-    """Preprocess audio to ensure correct format for VoxCPM2.
-    VoxCPM2 expects:
-    - Sample rate: 24kHz (model's _encode_sample_rate)
-    - Mono channel
-    - Float32 WAV format
-    Returns path to preprocessed temp WAV file.
-    """
-    print(f"Preprocessing audio: {audio_path}")
-    # Load with librosa (handles resampling automatically)
-    audio, sr = librosa.load(audio_path, sr=TARGET_SR, mono=True)
-    # Ensure float32
-    audio = audio.astype(np.float32)
-    # Normalize amplitude
-    max_val = np.abs(audio).max()
-    if max_val > 0:
-        audio = audio / max_val * 0.95
-    # Ensure minimum length (at least 1 second)
-    min_samples = TARGET_SR  # 1 second
-    if len(audio) < min_samples:
-        audio = np.pad(audio, (0, min_samples - len(audio)))
-    # Save to temp file
-    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-    sf.write(tmp.name, audio, TARGET_SR)
-    print(f"Preprocessed: {len(audio)/TARGET_SR:.2f}s at {TARGET_SR}Hz, saved to {tmp.name}")
-    return tmp.name
 def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
     """生成音频"""
-    if not text or not text.strip():
-        return None, "❌ 请输入文本"
     try:
         model, device, error = ensure_model()
@@ -108,21 +91,22 @@ def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
         if model is None:
             return None, "❌ 模型正在加载中，请稍候..."
-        # Preprocess reference audio if provided
-        temp_files = []
-        if reference_audio:
-            try:
-                ref_path = preprocess_audio(reference_audio)
-                temp_files.append(ref_path)
-                print(f"Using preprocessed reference audio")
-            except Exception as e:
-                return None, f"❌ 参考音频预处理失败: {e}"
-        else:
-            return None, "❌ 请上传参考音频"
         print(f"Generating with text: {text[:50]}...")
-        # Generate audio
         import time
         t0 = time.time()
         wav = model.generate(
@@ -133,23 +117,17 @@ def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
         )
         elapsed = time.time() - t0
-        # Get sample rate from model
         sample_rate = model.tts_model.sample_rate
-        # Save output
         output_path = "/tmp/voxcpm_output.wav"
         sf.write(output_path, wav, sample_rate)
         duration = len(wav) / sample_rate
-        msg = f"✅ 生成成功! 时长: {duration:.2f}s, 耗时: {elapsed:.1f}s, 采样率: {sample_rate}Hz"
         print(msg)
-        # Cleanup temp files
-        for f in temp_files:
-            try:
-                os.unlink(f)
-            except:
-                pass
         return output_path, msg
@@ -159,16 +137,16 @@ def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
         traceback.print_exc()
         return None, error_msg
-# Preset texts
 PRESET_TEXTS = {
     "问候": "Hello! I am One! I am the first Numberblock, and I love being number one!",
-    "计数": "One, two, three, four, five! Counting is so much fun!",
     "情感": "Sometimes I feel a little lonely being just one, but then I remember that one is the start of everything!",
 }
-# Create Gradio interface
 with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
-    gr.Markdown("# 🎭 NumberBlocks One Voice Cloning (VoxCPM V3)")
     gr.Markdown("### 使用 VoxCPM 2 模型克隆 One 的声音")
     with gr.Row():
@@ -186,7 +164,7 @@ with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
         with gr.Column():
             ref_audio_input = gr.Audio(
-                label="参考音频 (One 的声音, 建议 5-15 秒清晰语音)",
                 type="filepath"
             )
@@ -221,14 +199,15 @@ with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
     gr.Markdown("---")
     gr.Markdown("### 说明")
     gr.Markdown("""
-    - **参考音频**: 上传 One 的声音片段（建议 5-15 秒清晰语音，自动预处理为 24kHz 单声道）
     - **CFG Value**: 控制音色相似度，默认 2.0，越高越像参考音色
     - **推理步数**: 默认 10，越高质量越好但生成越慢
     - **模型**: VoxCPM 2 (openbmb/VoxCPM2)
-    - **注意**: 当前运行在 CPU 上，生成速度较慢
     """)
 if __name__ == "__main__":
     import threading
     def preload():
         print("Preloading VoxCPM model...")

 #!/usr/bin/env python3
 """
+NumberBlocks One Voice Cloning Space - VoxCPM V4
+Fix: Force float32 on CPU to avoid bfloat16 dimension errors in MiniCPM4 attention
 """
 import os
 import tempfile
 import soundfile as sf
 import traceback
 from pathlib import Path
+# 环境变量检查
 HF_TOKEN = os.environ.get("HF_TOKEN", os.environ.get("HUGGINGFACE_TOKEN"))
 def load_model():
         print(f"PyTorch version: {torch.__version__}")
         print(f"CUDA available: {torch.cuda.is_available()}")
+        # Load model (optimize=False to avoid torch.compile issues)
         model = VoxCPM.from_pretrained("openbmb/VoxCPM2", load_denoiser=False, optimize=False)
+        # CRITICAL FIX: Force float32 on CPU
+        # VoxCPM2 uses bfloat16 by default, which causes "Dimension out of range" errors
+        # in MiniCPM4's scaled_dot_product_attention on CPU
+        if device == "cpu":
+            print("Converting model to float32 for CPU compatibility...")
+            model.tts_model = model.tts_model.to(torch.float32)
+            # Also fix KV caches (they are created with config dtype = bfloat16)
+            if hasattr(model.tts_model, 'base_lm') and hasattr(model.tts_model.base_lm, 'kv_cache'):
+                if model.tts_model.base_lm.kv_cache is not None:
+                    model.tts_model.base_lm.kv_cache.kv_cache = model.tts_model.base_lm.kv_cache.kv_cache.to(torch.float32)
+                    print("  base_lm KV cache converted to float32")
+            if hasattr(model.tts_model, 'residual_lm') and hasattr(model.tts_model.residual_lm, 'kv_cache'):
+                if model.tts_model.residual_lm.kv_cache is not None:
+                    model.tts_model.residual_lm.kv_cache.kv_cache = model.tts_model.residual_lm.kv_cache.kv_cache.to(torch.float32)
+                    print("  residual_lm KV cache converted to float32")
+            print("Model conversion to float32 complete!")
         print("Model loaded successfully!")
         return model, device, None
     except Exception as e:
         traceback.print_exc()
         return None, "cpu", str(e)
+# 全局模型状态
 MODEL_STATE = {
     "model": None,
     "device": "cpu",
             MODEL_STATE["loading"] = False
     return MODEL_STATE["model"], MODEL_STATE["device"], MODEL_STATE["error"]
 def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
     """生成音频"""
+    if not text or not reference_audio:
+        return None, "❌ 请输入文本和参考音频"
+    if not text.strip():
+        return None, "❌ 文本不能为空"
     try:
         model, device, error = ensure_model()
         if model is None:
             return None, "❌ 模型正在加载中，请稍候..."
+        # 读取参考音频
+        ref_audio, sr = sf.read(reference_audio)
+        # 如果是立体声，转换为单声道
+        if len(ref_audio.shape) > 1:
+            ref_audio = ref_audio[:, 0]
+        # 保存到临时文件
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            sf.write(tmp.name, ref_audio, sr)
+            ref_path = tmp.name
         print(f"Generating with text: {text[:50]}...")
+        print(f"Reference audio: {len(ref_audio)/sr:.2f}s at {sr}Hz")
+        # 生成音频
         import time
         t0 = time.time()
         wav = model.generate(
         )
         elapsed = time.time() - t0
+        # 保存输出
         sample_rate = model.tts_model.sample_rate
         output_path = "/tmp/voxcpm_output.wav"
         sf.write(output_path, wav, sample_rate)
         duration = len(wav) / sample_rate
+        msg = f"✅ 生成成功! 时长: {duration:.2f}s, 耗时: {elapsed:.1f}s, 设备: {device}"
         print(msg)
+        # 清理临时文件
+        os.unlink(ref_path)
         return output_path, msg
         traceback.print_exc()
         return None, error_msg
+# 预设文本
 PRESET_TEXTS = {
     "问候": "Hello! I am One! I am the first Numberblock, and I love being number one!",
+    "计数": "One, two, three, four, five! Counting is so much fun! I can count all the way to ten!",
     "情感": "Sometimes I feel a little lonely being just one, but then I remember that one is the start of everything!",
 }
+# 创建 Gradio 界面
 with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
+    gr.Markdown("# 🎭 NumberBlocks One Voice Cloning (VoxCPM V4)")
     gr.Markdown("### 使用 VoxCPM 2 模型克隆 One 的声音")
     with gr.Row():
         with gr.Column():
             ref_audio_input = gr.Audio(
+                label="参考音频 (One 的声音)",
                 type="filepath"
             )
     gr.Markdown("---")
     gr.Markdown("### 说明")
     gr.Markdown("""
+    - **参考音频**: 上传 One 的声音片段（建议 5-15 秒清晰语音）
     - **CFG Value**: 控制音色相似度，默认 2.0，越高越像参考音色
     - **推理步数**: 默认 10，越高质量越好但生成越慢
     - **模型**: VoxCPM 2 (openbmb/VoxCPM2)
+    - **V4 修复**: CPU 上使用 float32 避免 bfloat16 维度错误
     """)
 if __name__ == "__main__":
+    # 启动时预加载模型
     import threading
     def preload():
         print("Preloading VoxCPM model...")