Spaces:

Ryanus
/

ChatterboxTTS

Sleeping

App Files Community

Ryanus committed on Jul 6, 2025

Commit

cd40891

verified ·

1 Parent(s): cad7a1b

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -211

app.py CHANGED Viewed

@@ -1,218 +1,49 @@
-import random
-import numpy as np
 import torch
 import gradio as gr
-import logging
 from pathlib import Path
-import sys
-import re
-from typing import List
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# 強制 torch.load 使用 CPU
-original_torch_load = torch.load
-def patched_torch_load(f, map_location=None, **kwargs):
-    if map_location is None:
-        map_location = 'cpu'
-    logger.info(f"🔧 Loading with map_location={map_location}")
-    return original_torch_load(f, map_location=map_location, **kwargs)
-torch.load = patched_torch_load
-if 'torch' in sys.modules:
-    sys.modules['torch'].load = patched_torch_load
-logger.info("✅ Applied torch.load device mapping patch")
-DEVICE = "cpu"
-logger.info("🚀 Running on CPU")
-MODEL = None
-def get_or_load_model():
-    global MODEL, DEVICE
-    if MODEL is None:
-        print("Model not loaded, initializing...")
-        try:
-            try:
-                from chatterbox.src.chatterbox.tts import ChatterboxTTS
-                logger.info("✅ Using official chatterbox.src import path")
-            except ImportError:
-                from chatterbox import ChatterboxTTS
-                logger.info("✅ Using chatterbox direct import path")
-            MODEL = ChatterboxTTS.from_pretrained("cpu")
-            MODEL.device = "cpu"
-            logger.info(f"✅ Model loaded successfully on {DEVICE}")
-        except Exception as e:
-            logger.error(f"❌ Error loading model: {e}")
-            raise
-    return MODEL
-def set_seed(seed: int):
-    torch.manual_seed(seed)
-    random.seed(seed)
-    np.random.seed(seed)
-def split_text_into_chunks(text: str, max_chars: int = 250) -> List[str]:
-    if len(text) <= max_chars:
-        return [text]
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-    chunks = []
-    current_chunk = ""
-    for sentence in sentences:
-        if len(sentence) > max_chars:
-            if current_chunk:
-                chunks.append(current_chunk.strip())
-                current_chunk = ""
-            parts = re.split(r'(?<=,)\s+', sentence)
-            for part in parts:
-                if len(part) > max_chars:
-                    words = part.split()
-                    word_chunk = ""
-                    for word in words:
-                        if len(word_chunk + " " + word) <= max_chars:
-                            word_chunk += " " + word if word_chunk else word
-                        else:
-                            if word_chunk:
-                                chunks.append(word_chunk.strip())
-                            word_chunk = word
-                    if word_chunk:
-                        chunks.append(word_chunk.strip())
-                else:
-                    if len(current_chunk + " " + part) <= max_chars:
-                        current_chunk += " " + part if current_chunk else part
-                    else:
-                        if current_chunk:
-                            chunks.append(current_chunk.strip())
-                        current_chunk = part
-        else:
-            if len(current_chunk + " " + sentence) <= max_chars:
-                current_chunk += " " + sentence if current_chunk else sentence
-            else:
-                if current_chunk:
-                    chunks.append(current_chunk.strip())
-                current_chunk = sentence
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-    return [chunk for chunk in chunks if chunk.strip()]
-def generate_tts_audio(
-    text_input: str,
-    audio_prompt_path_input: str,
-    exaggeration_input: float,
-    temperature_input: float,
-    seed_num_input: int,
-    cfgw_input: float,
-    chunk_size: int = 250
-) -> tuple[int, np.ndarray]:
-    try:
-        current_model = get_or_load_model()
-        if current_model is None:
-            raise RuntimeError("TTS model is not loaded.")
-        if seed_num_input != 0:
-            set_seed(int(seed_num_input))
-        text_chunks = split_text_into_chunks(text_input, chunk_size)
-        logger.info(f"Processing {len(text_chunks)} text chunk(s)")
-        generated_wavs = []
-        for i, chunk in enumerate(text_chunks):
-            logger.info(f"Generating chunk {i+1}/{len(text_chunks)}: '{chunk[:50]}...'")
-            wav = current_model.generate(
-                chunk,
-                audio_prompt_path=audio_prompt_path_input,
-                exaggeration=exaggeration_input,
-                temperature=temperature_input,
-                cfg_weight=cfgw_input,
-            )
-            generated_wavs.append(wav)
-        if len(generated_wavs) > 1:
-            silence_samples = int(0.3 * current_model.sr)
-            silence = torch.zeros(1, silence_samples, dtype=generated_wavs[0].dtype)
-            final_wav = generated_wavs[0]
-            for wav_chunk in generated_wavs[1:]:
-                final_wav = torch.cat([final_wav, silence, wav_chunk], dim=1)
-        else:
-            final_wav = generated_wavs[0]
-        return (current_model.sr, final_wav.squeeze(0).numpy())
-    except Exception as e:
-        logger.error(f"❌ Generation failed: {e}")
-        raise gr.Error(f"Generation failed: {str(e)}")
-with gr.Blocks(title="🎙️ Chatterbox-TTS (CPU)", theme=gr.themes.Soft()) as demo:
-    gr.HTML("""
-    <div style="text-align: center; padding: 20px;">
-    <h1>🎙️ Chatterbox-TTS Demo (CPU)</h1>
-    <p style="font-size: 18px; color: #666;">
-    Generate high-quality speech from text with reference audio styling<br>
-    <strong>Running on CPU (Huggingface Space)!</strong>
-    </p>
-    </div>
-    """)
-    with gr.Row():
-        with gr.Column():
-            text = gr.Textbox(
-                value="Hello! This is a test of the Chatterbox-TTS voice cloning system running on CPU.",
-                label="Text to synthesize (supports long text with automatic chunking)",
-                max_lines=10,
-                lines=5
-            )
-            ref_wav = gr.Audio(
-                type="filepath",
-                label="Reference Audio File (Optional - 6+ seconds recommended)",
-                sources=["upload", "microphone"]
-            )
-            exaggeration = gr.Slider(
-                0.25, 2, step=0.05,
-                label="Exaggeration (Neutral = 0.5, extreme values can be unstable)",
-                value=0.5
-            )
-            cfg_weight = gr.Slider(
-                0.2, 1, step=0.05,
-                label="CFG/Pace",
-                value=0.5
-            )
-            with gr.Accordion("⚙️ Advanced Options", open=False):
-                chunk_size = gr.Slider(
-                    100, 400, step=25,
-                    label="Chunk Size (characters per chunk for long text)",
-                    value=250
-                )
-                seed_num = gr.Number(
-                    value=0,
-                    label="Random seed (0 for random)",
-                    precision=0
-                )
-                temp = gr.Slider(
-                    0.05, 5, step=0.05,
-                    label="Temperature",
-                    value=0.8
-                )
-            run_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
-        with gr.Column():
-            audio_output = gr.Audio(label="Generated Speech")
-    run_btn.click(
-        fn=generate_tts_audio,
-        inputs=[text, ref_wav, exaggeration, temp, seed_num, cfg_weight, chunk_size],
-        outputs=[audio_output],
-        show_progress=True
     )
-    gr.Examples(
-        examples=[
-            ["Hello! This is a test of voice cloning technology running on CPU."],
-            ["The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet. Now we can test longer text with multiple sentences to see how the chunking works."],
-            ["Welcome to the future of voice synthesis! With Chatterbox, you can clone any voice in seconds. The technology uses advanced neural networks to capture the unique characteristics of a speaker's voice. This includes their tone, accent, speaking rhythm, and emotional expressiveness. The result is incredibly natural-sounding speech that maintains the original speaker's identity."],
-        ],
-        inputs=[text],
-        label="📝 Example Texts"
     )
-def main():
-    try:
-        logger.info("Loading model at startup...")
-        get_or_load_model()
-        logger.info("✅ Startup model loading complete!")
-        demo.launch(server_name="0.0.0.0", server_port=7860, share=True, debug=True, show_error=True)
-    except Exception as e:
-        logger.error(f"❌ CRITICAL: Failed to load model on startup: {e}")
-        print(f"Application may not function properly. Error: {e}")
-        demo.launch(server_name="0.0.0.0", server_port=7860, share=True, debug=True, show_error=True)
 if __name__ == "__main__":
-    main()

+import os
+import time
 import torch
 import gradio as gr
 from pathlib import Path
+import torchaudio
+from chatterbox.tts import ChatterboxTTS
+# 初始化儲存資料夾
+OUTPUT_DIR = Path("outputs")
+OUTPUT_DIR.mkdir(exist_ok=True)
+# 載入模型
+model = ChatterboxTTS.from_pretrained(device="cpu")
+def tts_and_save(text, ref_wav, exaggeration, temperature, seed, cfg_weight):
+    if seed != 0:
+        torch.manual_seed(int(seed))
+    wav = model.generate(
+        text,
+        audio_prompt_path=ref_wav,
+        exaggeration=exaggeration,
+        temperature=temperature,
+        cfg_weight=cfg_weight,
     )
+    timestamp = time.strftime("%Y%m%d_%H%M%S")
+    filename = OUTPUT_DIR / f"tts_{timestamp}.wav"
+    torchaudio.save(str(filename), wav.cpu(), model.sr)
+    return (model.sr, wav.squeeze(0).numpy()), str(filename)
+with gr.Blocks() as demo:
+    text = gr.Textbox(label="輸入文字")
+    ref_wav = gr.Audio(label="參考語音（可選）", sources=["upload", "microphone"], type="filepath")
+    exaggeration = gr.Slider(0.25, 2, value=0.5, step=0.05, label="Exaggeration")
+    cfg_weight = gr.Slider(0.2, 1, value=0.5, step=0.05, label="CFG/Pace")
+    temperature = gr.Slider(0.05, 5, value=0.8, step=0.05, label="Temperature")
+    seed = gr.Number(value=0, label="隨機種子 (0=隨機)", precision=0)
+    btn = gr.Button("生成並自動儲存")
+    output_audio = gr.Audio(label="語音預覽")
+    saved_path = gr.Textbox(label="儲存路徑", interactive=False)
+    btn.click(
+        tts_and_save,
+        inputs=[text, ref_wav, exaggeration, temperature, seed, cfg_weight],
+        outputs=[output_audio, saved_path]
     )
 if __name__ == "__main__":
+    demo.launch()