Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
import gradio as gr
|
| 5 |
+
import logging
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import sys
|
| 8 |
+
import re
|
| 9 |
+
from typing import List
|
| 10 |
+
|
| 11 |
+
# Configure root logging once at import time; every message from this app
# goes through the module-level logger below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
# Force torch.load to map tensors onto the CPU: this Space has no GPU, and
# checkpoints saved from a CUDA session would otherwise fail to deserialize.
original_torch_load = torch.load

def patched_torch_load(f, map_location=None, **kwargs):
    """torch.load wrapper that defaults ``map_location`` to ``'cpu'``.

    Callers that pass an explicit map_location keep their choice; only the
    default (None) is overridden. All other arguments pass through unchanged.
    """
    if map_location is None:
        map_location = 'cpu'
    logger.info(f"Loading with map_location={map_location}")
    return original_torch_load(f, map_location=map_location, **kwargs)

# Rebinding the attribute on the imported module object is sufficient:
# sys.modules['torch'] is the very same object as `torch`, so the original
# extra `sys.modules['torch'].load = ...` assignment was a redundant no-op
# and has been dropped.
torch.load = patched_torch_load
logger.info("Applied torch.load device mapping patch")
|
| 25 |
+
|
| 26 |
+
# This Space is CPU-only; DEVICE is informational and read by log messages.
DEVICE = "cpu"
logger.info("Running on CPU")

# Lazily-initialized singleton holding the ChatterboxTTS model.
MODEL = None

def get_or_load_model():
    """Return the shared ChatterboxTTS instance, loading it on first use.

    Tries the official ``chatterbox.src`` package layout first and falls
    back to a flat ``chatterbox`` import. On load failure the exception is
    logged and re-raised for the caller to handle.

    Returns:
        The loaded ChatterboxTTS model (cached in the module-global MODEL).
    """
    # DEVICE is only read here, so it no longer needs a `global` declaration.
    global MODEL
    if MODEL is None:
        # Use the logger (not print) so startup messages share one sink.
        logger.info("Model not loaded, initializing...")
        try:
            try:
                from chatterbox.src.chatterbox.tts import ChatterboxTTS
                logger.info("Using official chatterbox.src import path")
            except ImportError:
                from chatterbox import ChatterboxTTS
                logger.info("Using chatterbox direct import path")
            MODEL = ChatterboxTTS.from_pretrained("cpu")
            # Make sure downstream code sees the CPU device explicitly.
            MODEL.device = "cpu"
            logger.info(f"Model loaded successfully on {DEVICE}")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise
    return MODEL
|
| 48 |
+
|
| 49 |
+
def set_seed(seed: int):
    """Seed every RNG the app relies on: stdlib random, NumPy, and torch."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
|
| 53 |
+
|
| 54 |
+
def _append_piece(chunks: List[str], current: str, piece: str, max_chars: int) -> str:
    """Greedily accumulate *piece* onto *current*; flush to *chunks* on overflow.

    Returns the new accumulator. A piece longer than max_chars is kept whole
    (it becomes its own chunk) rather than being dropped.
    """
    candidate = f"{current} {piece}" if current else piece
    if len(candidate) <= max_chars:
        return candidate
    if current:
        chunks.append(current.strip())
    return piece


def split_text_into_chunks(text: str, max_chars: int = 250) -> List[str]:
    """Split *text* into chunks of at most *max_chars* characters.

    Splits on sentence boundaries first, then commas, then single words for
    oversized sentences. A single word longer than max_chars is kept intact.

    Bug fixed vs. the original: when a short comma-part had already been
    accumulated and the *next* part was oversized, the oversized part's word
    chunks were appended before the pending accumulator was flushed, emitting
    chunks out of their original text order. The accumulator is now flushed
    before any oversized part is word-split.
    """
    if len(text) <= max_chars:
        return [text]
    chunks: List[str] = []
    current = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) <= max_chars:
            current = _append_piece(chunks, current, sentence, max_chars)
            continue
        # Oversized sentence: flush what we have, then split at commas.
        if current:
            chunks.append(current.strip())
            current = ""
        for part in re.split(r'(?<=,)\s+', sentence):
            if len(part) <= max_chars:
                current = _append_piece(chunks, current, part, max_chars)
            else:
                # Oversized part: flush first (order fix), then split by words.
                if current:
                    chunks.append(current.strip())
                    current = ""
                word_chunk = ""
                for word in part.split():
                    word_chunk = _append_piece(chunks, word_chunk, word, max_chars)
                if word_chunk:
                    chunks.append(word_chunk.strip())
    if current:
        chunks.append(current.strip())
    return [chunk for chunk in chunks if chunk.strip()]
|
| 96 |
+
|
| 97 |
+
def generate_tts_audio(
    text_input: str,
    audio_prompt_path_input: str,
    exaggeration_input: float,
    temperature_input: float,
    seed_num_input: int,
    cfgw_input: float,
    chunk_size: int = 250
) -> tuple[int, np.ndarray]:
    """Synthesize speech for *text_input*, chunking long text automatically.

    Args:
        text_input: Text to speak; split into <= chunk_size character chunks.
        audio_prompt_path_input: Optional reference-audio path for cloning.
        exaggeration_input / temperature_input / cfgw_input: model knobs
            forwarded to ChatterboxTTS.generate.
        seed_num_input: RNG seed; 0 means "do not seed" (non-deterministic).
        chunk_size: Max characters per synthesis chunk.

    Returns:
        (sample_rate, mono waveform as a 1-D numpy array).

    Raises:
        gr.Error: with a user-readable message on any failure.
    """
    # Validate up front: empty text previously fell through to an IndexError
    # on generated_wavs[0] deep inside the joining logic.
    if not text_input or not text_input.strip():
        raise gr.Error("Please enter some text to synthesize.")
    try:
        current_model = get_or_load_model()
        if current_model is None:
            raise RuntimeError("TTS model is not loaded.")
        if seed_num_input != 0:
            set_seed(int(seed_num_input))
        text_chunks = split_text_into_chunks(text_input, chunk_size)
        logger.info(f"Processing {len(text_chunks)} text chunk(s)")
        generated_wavs = []
        for i, chunk in enumerate(text_chunks):
            logger.info(f"Generating chunk {i+1}/{len(text_chunks)}: '{chunk[:50]}...'")
            wav = current_model.generate(
                chunk,
                audio_prompt_path=audio_prompt_path_input,
                exaggeration=exaggeration_input,
                temperature=temperature_input,
                cfg_weight=cfgw_input,
            )
            generated_wavs.append(wav)
        if len(generated_wavs) > 1:
            # Insert 300 ms of silence between chunks so sentences
            # don't run together.
            silence_samples = int(0.3 * current_model.sr)
            silence = torch.zeros(1, silence_samples, dtype=generated_wavs[0].dtype)
            final_wav = generated_wavs[0]
            for wav_chunk in generated_wavs[1:]:
                final_wav = torch.cat([final_wav, silence, wav_chunk], dim=1)
        else:
            final_wav = generated_wavs[0]
        # detach() in case the model returns a tensor that requires grad.
        return (current_model.sr, final_wav.squeeze(0).detach().cpu().numpy())
    except gr.Error:
        # Already user-readable; re-raise instead of double-wrapping.
        raise
    except Exception as e:
        logger.error(f"Generation failed: {e}")
        raise gr.Error(f"Generation failed: {str(e)}")
|
| 137 |
+
|
| 138 |
+
# Gradio UI: left column holds all inputs, right column holds the result.
# NOTE(review): this block was reconstructed from an indentation-mangled
# source; widget nesting follows the conventional Blocks/Row/Column layout.
with gr.Blocks(title="ποΈ Chatterbox-TTS (CPU)", theme=gr.themes.Soft()) as demo:
    # Header banner.
    gr.HTML("""
    <div style="text-align: center; padding: 20px;">
        <h1>ποΈ Chatterbox-TTS Demo (CPU)</h1>
        <p style="font-size: 18px; color: #666;">
            Generate high-quality speech from text with reference audio styling<br>
            <strong>Running on CPU (Huggingface Space)!</strong>
        </p>
    </div>
    """)
    with gr.Row():
        with gr.Column():
            # Main text input; long text is split into chunks before synthesis.
            text = gr.Textbox(
                value="Hello! This is a test of the Chatterbox-TTS voice cloning system running on CPU.",
                label="Text to synthesize (supports long text with automatic chunking)",
                max_lines=10,
                lines=5
            )
            # Optional reference clip used for voice cloning.
            ref_wav = gr.Audio(
                type="filepath",
                label="Reference Audio File (Optional - 6+ seconds recommended)",
                sources=["upload", "microphone"]
            )
            exaggeration = gr.Slider(
                0.25, 2, step=0.05,
                label="Exaggeration (Neutral = 0.5, extreme values can be unstable)",
                value=0.5
            )
            cfg_weight = gr.Slider(
                0.2, 1, step=0.05,
                label="CFG/Pace",
                value=0.5
            )
            with gr.Accordion("βοΈ Advanced Options", open=False):
                # Characters per chunk when splitting long text.
                chunk_size = gr.Slider(
                    100, 400, step=25,
                    label="Chunk Size (characters per chunk for long text)",
                    value=250
                )
                # 0 means "do not seed" (non-deterministic output).
                seed_num = gr.Number(
                    value=0,
                    label="Random seed (0 for random)",
                    precision=0
                )
                temp = gr.Slider(
                    0.05, 5, step=0.05,
                    label="Temperature",
                    value=0.8
                )
            run_btn = gr.Button("π΅ Generate Speech", variant="primary", size="lg")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")
    # Wire the button to the synthesis function; the inputs list order must
    # match generate_tts_audio's positional signature.
    run_btn.click(
        fn=generate_tts_audio,
        inputs=[text, ref_wav, exaggeration, temp, seed_num, cfg_weight, chunk_size],
        outputs=[audio_output],
        show_progress=True
    )
    # Clickable sample inputs covering short, medium, and long texts.
    gr.Examples(
        examples=[
            ["Hello! This is a test of voice cloning technology running on CPU."],
            ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet. Now we can test longer text with multiple sentences to see how the chunking works."],
            ["Welcome to the future of voice synthesis! With Chatterbox, you can clone any voice in seconds. The technology uses advanced neural networks to capture the unique characteristics of a speaker's voice. This includes their tone, accent, speaking rhythm, and emotional expressiveness. The result is incredibly natural-sounding speech that maintains the original speaker's identity."],
        ],
        inputs=[text],
        label="π Example Texts"
    )
|
| 205 |
+
|
| 206 |
+
def main():
    """Preload the TTS model (best effort), then start the Gradio server.

    Model loading failures are logged but do not prevent the UI from coming
    up: generate_tts_audio retries the load on first use.
    """
    try:
        logger.info("Loading model at startup...")
        get_or_load_model()
        logger.info("Startup model loading complete!")
    except Exception as e:
        logger.error(f"CRITICAL: Failed to load model on startup: {e}")
        print(f"Application may not function properly. Error: {e}")
    # Single launch call. The original duplicated this line in both the try
    # and except branches, so a failure inside launch() itself would have
    # triggered a second launch attempt.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True, debug=True, show_error=True)


if __name__ == "__main__":
    main()
|