Spaces:
Sleeping
Sleeping
| import os | |
| import torch | |
| import torchaudio as ta | |
| import numpy as np | |
| import re | |
| import gradio as gr | |
| from chatterbox.tts import ChatterboxTTS | |
| import tempfile | |
| import shutil | |
| import warnings | |
# ---------------------------------------------------------------------------
# Device selection: prefer CUDA, then Apple Silicon (MPS), otherwise CPU.
# ---------------------------------------------------------------------------
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

# Keep the diffusers LoRA deprecation warning out of the UI logs.
warnings.filterwarnings(
    "ignore",
    message=r".*LoRACompatibleLinear.*",
    category=FutureWarning,
)
| # Monkey patch torch.load to always use map_location | |
| original_torch_load = torch.load | |
| def patched_torch_load(*args, **kwargs): | |
| if 'map_location' not in kwargs: | |
| kwargs['map_location'] = torch.device(DEVICE) | |
| return original_torch_load(*args, **kwargs) | |
| # Apply the patch | |
| torch.load = patched_torch_load | |
def load_model():
    """Load the Chatterbox TTS model onto DEVICE.

    Returns the model instance, or None if loading fails (the error is
    printed so it shows up in the Space logs).
    """
    try:
        return ChatterboxTTS.from_pretrained(DEVICE)
    except Exception as exc:
        print(f"Error loading model: {exc}")
        return None

# Load once at import time so every request reuses the same model instance.
MODEL = load_model()
def split_text_into_chunks(text, max_length=800):
    """Split *text* into chunks of roughly at most *max_length* characters.

    Splitting prefers sentence boundaries (. ! ?) and falls back to phrase
    boundaries (, ;) for sentences longer than max_length. A single phrase
    longer than max_length is emitted whole (never cut mid-word), so a chunk
    may exceed max_length in that degenerate case.

    Returns a list of non-empty chunk strings.
    """
    # Normalize whitespace so length accounting is predictable.
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    # The lookbehind keeps the terminating punctuation on each sentence.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(sentence) > max_length:
            # Flush accumulated text before handling the oversized sentence.
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = ""
            # Phrase split also KEEPS the delimiter (comma/semicolon), so
            # phrases must be rejoined with a plain space — joining with
            # ", " (the old behavior) duplicated punctuation: "foo,, bar".
            phrases = re.split(r'(?<=[,;])\s+', sentence)
            phrase_chunk = ""
            for phrase in phrases:
                if len(phrase_chunk) + len(phrase) + 1 > max_length:
                    if phrase_chunk:
                        chunks.append(phrase_chunk.strip())
                    phrase_chunk = phrase
                else:
                    phrase_chunk = f"{phrase_chunk} {phrase}" if phrase_chunk else phrase
            # Carry the tail forward so a following short sentence can still
            # be packed together with it.
            if phrase_chunk:
                current_chunk = phrase_chunk
        elif len(current_chunk) + len(sentence) + 1 > max_length:
            # Sentence fits on its own but not with the current chunk.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
    # Flush the final partial chunk.
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
def resolve_file_path(file_obj):
    """Return a filesystem path for a Gradio file input, or None.

    Depending on the Gradio version and component configuration, the value
    may arrive as a plain path string, a dict carrying "name"/"path" keys,
    or an object exposing a .name attribute.
    """
    if file_obj is None:
        return None
    if isinstance(file_obj, str):
        return file_obj
    if isinstance(file_obj, dict):
        return file_obj.get("name") or file_obj.get("path")
    # Fall back to a .name attribute (e.g. tempfile wrappers); None otherwise.
    return getattr(file_obj, "name", None)
def generate_audio(text, text_file, voice_file, exaggeration, temperature, cfg_weight, progress=gr.Progress()):
    """Synthesize speech for *text* (or an uploaded .txt file) in the voice
    of *voice_file*.

    The text is split into ~800-character chunks, each chunk is generated
    separately, and the pieces are concatenated into a single WAV.

    Returns:
        (output_path, status_message) — output_path is None on any error,
        with the error described in status_message.
    """
    if MODEL is None:
        return None, "Error: Model failed to load."
    # An uploaded .txt file takes precedence over the textbox contents.
    text_file_path = resolve_file_path(text_file)
    if text_file_path:
        try:
            with open(text_file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        except Exception as e:
            return None, f"Error reading text file: {str(e)}"
    if not text:
        return None, "Error: No text provided."
    voice_file_path = resolve_file_path(voice_file)
    if voice_file_path is None:
        return None, "Error: No voice reference file provided."
    # Per-request scratch directory for the intermediate chunk WAVs; always
    # removed in the finally block below.
    temp_dir = tempfile.mkdtemp()
    try:
        chunks = split_text_into_chunks(text, max_length=800)
        progress(0, desc=f"Split text into {len(chunks)} chunks")
        audio_files = []
        for i, chunk in enumerate(chunks):
            progress(i / len(chunks), desc=f"Generating chunk {i+1}/{len(chunks)}")
            # Generate audio for this chunk with the user's voice sample.
            wav = MODEL.generate(
                chunk,
                audio_prompt_path=voice_file_path,
                exaggeration=exaggeration,
                temperature=temperature,
                cfg_weight=cfg_weight,
            )
            output_file = os.path.join(temp_dir, f"chunk_{i+1}.wav")
            ta.save(output_file, wav, MODEL.sr)
            audio_files.append(output_file)
        progress(0.9, desc="Concatenating audio...")
        waveforms = []
        sample_rate = MODEL.sr
        for file in audio_files:
            waveform, sr = ta.load(file)
            if sr != sample_rate:
                # Defensive: chunks are saved at MODEL.sr above, but resample
                # if a loader ever reports a different rate.
                waveform = ta.functional.resample(waveform, sr, sample_rate)
            waveforms.append(waveform)
        if not waveforms:
            return None, "Error: No audio generated."
        concatenated = torch.cat(waveforms, dim=1)
        # Write the result to a unique temp file instead of a fixed
        # "output.wav" in the CWD, so concurrent requests cannot clobber
        # each other's output. It must live OUTSIDE temp_dir (deleted in
        # finally) and use delete=False so Gradio can serve it afterwards.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out:
            final_output_path = out.name
        ta.save(final_output_path, concatenated, sample_rate)
        return final_output_path, f"Successfully generated audio from {len(chunks)} chunks."
    except Exception as e:
        return None, f"Error during generation: {str(e)}"
    finally:
        # Cleanup the intermediate chunk files.
        shutil.rmtree(temp_dir, ignore_errors=True)
# ---------------------------------------------------------------------------
# Gradio interface styling. custom_css is injected into the page via
# gr.Blocks(css=custom_css); the selectors here (.hero*, .panel,
# .compact-uploader, .compact-audio, .note, #generate-btn) match the
# elem_classes / elem_id values attached to components in the UI below.
# ---------------------------------------------------------------------------
custom_css = """
@import url("https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600&family=Unbounded:wght@500;700&display=swap");
:root {
  --bg: #f6f1e8;
  --panel: #ffffff;
  --panel-border: #eadfcf;
  --ink: #1c2326;
  --muted: #58646b;
  --accent: #ff6b35;
  --accent-2: #1b998b;
  --accent-3: #ffb100;
  --shadow: 0 20px 40px rgba(31, 35, 38, 0.12);
}
body,
.gradio-container {
  background:
    radial-gradient(900px 600px at 10% 0%, #ffe6c2 0%, rgba(255, 230, 194, 0) 60%),
    radial-gradient(800px 500px at 90% 10%, #cdeff0 0%, rgba(205, 239, 240, 0) 55%),
    linear-gradient(180deg, var(--bg) 0%, #fdf8f1 100%);
  color: var(--ink);
  font-family: "Space Grotesk", sans-serif;
}
.gradio-container {
  max-width: 1100px;
  margin: 0 auto;
  padding: 2.5rem 1.5rem 3rem;
}
h1,
h2,
h3,
.hero-title {
  font-family: "Unbounded", "Space Grotesk", sans-serif;
  letter-spacing: -0.02em;
}
.hero {
  position: relative;
  padding: 2.2rem;
  border-radius: 24px;
  background: linear-gradient(135deg, rgba(255, 107, 53, 0.12), rgba(27, 153, 139, 0.12));
  border: 1px solid rgba(255, 107, 53, 0.18);
  box-shadow: var(--shadow);
  overflow: hidden;
  animation: rise 0.6s ease both;
}
.hero::after {
  content: "";
  position: absolute;
  inset: 0;
  background: radial-gradient(500px 200px at 80% 0%, rgba(255, 177, 0, 0.2), transparent 70%);
  pointer-events: none;
}
.hero-badge {
  display: inline-flex;
  align-items: center;
  gap: 0.4rem;
  padding: 0.3rem 0.8rem;
  border-radius: 999px;
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.08em;
  background: rgba(255, 177, 0, 0.2);
  border: 1px solid rgba(255, 177, 0, 0.4);
  color: var(--ink);
}
.hero-title {
  margin: 0.8rem 0 0.4rem 0;
  font-size: 2.4rem;
}
.hero-sub {
  max-width: 620px;
  font-size: 1rem;
  color: var(--muted);
  margin: 0 0 1.4rem 0;
}
.hero-stats {
  display: flex;
  flex-wrap: wrap;
  gap: 0.8rem;
}
.hero-stat {
  background: rgba(255, 255, 255, 0.6);
  border: 1px solid rgba(234, 223, 207, 0.9);
  border-radius: 14px;
  padding: 0.6rem 0.9rem;
  min-width: 150px;
}
.hero-stat span {
  display: block;
  font-size: 0.65rem;
  text-transform: uppercase;
  letter-spacing: 0.1em;
  color: var(--muted);
}
.hero-stat strong {
  font-size: 0.95rem;
}
.panel {
  background: var(--panel);
  border: 1px solid var(--panel-border);
  border-radius: 20px;
  padding: 1.4rem;
  box-shadow: var(--shadow);
  animation: rise 0.7s ease both;
}
.panel-delayed {
  animation-delay: 0.12s;
}
.section-title {
  margin: 0 0 0.5rem 0;
  font-size: 1rem;
  text-transform: uppercase;
  letter-spacing: 0.1em;
  color: var(--muted);
}
.helper-text {
  font-size: 0.85rem;
  color: var(--muted);
}
#generate-btn {
  width: 100%;
  border-radius: 999px;
  background: linear-gradient(120deg, var(--accent), #ff9f1c);
  border: none;
  color: #1a1a1a;
  font-weight: 600;
  box-shadow: 0 12px 24px rgba(255, 107, 53, 0.3);
}
#generate-btn:hover {
  transform: translateY(-1px);
}
.compact-uploader .upload-box,
.compact-uploader .file-preview,
.compact-audio .upload-box,
.compact-audio .audio-container {
  min-height: 56px;
  padding: 0.4rem 0.55rem;
  border-radius: 10px;
}
.compact-uploader svg,
.compact-audio svg {
  width: 16px;
  height: 16px;
}
.compact-audio .audio-container {
  gap: 0.6rem;
}
.compact-audio .audio-container button {
  width: 28px;
  height: 28px;
}
.gradio-container audio {
  width: 100%;
}
.note {
  background: rgba(27, 153, 139, 0.08);
  border: 1px solid rgba(27, 153, 139, 0.2);
  border-radius: 14px;
  padding: 0.9rem 1rem;
  color: var(--muted);
}
@keyframes rise {
  from {
    opacity: 0;
    transform: translateY(12px);
  }
  to {
    opacity: 1;
    transform: translateY(0);
  }
}
"""
# Soft base theme; custom_css layers the bespoke palette/typography on top.
custom_theme = gr.themes.Soft(
    primary_hue="orange",
    secondary_hue="teal",
    neutral_hue="stone",
)
# ---------------------------------------------------------------------------
# UI definition: hero header, two-column layout (inputs left, output right),
# a quick-start accordion, and the click wiring to generate_audio().
# ---------------------------------------------------------------------------
with gr.Blocks(theme=custom_theme, css=custom_css, title="Chatterbox TTS Studio") as demo:
    # Static hero banner; styled by the .hero* rules in custom_css.
    gr.HTML("""
    <section class="hero">
        <div class="hero-badge">Voice Studio</div>
        <h1 class="hero-title">Chatterbox TTS</h1>
        <p class="hero-sub">
            Turn long-form text into expressive speech using a single voice sample.
            Upload a short WAV and generate a polished narration in minutes.
        </p>
        <div class="hero-stats">
            <div class="hero-stat">
                <span>Input</span>
                <strong>Text or TXT</strong>
            </div>
            <div class="hero-stat">
                <span>Voice</span>
                <strong>10-30s WAV</strong>
            </div>
            <div class="hero-stat">
                <span>Output</span>
                <strong>Single WAV</strong>
            </div>
        </div>
    </section>
    """)
    with gr.Row():
        # Left panel: text source, voice reference, and generation settings.
        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown("### Text Input")
            text_input = gr.Textbox(
                label="Your Text",
                lines=8,
                placeholder="Paste the text you want to convert to speech..."
            )
            # An uploaded .txt file overrides the textbox contents (see
            # generate_audio, which reads the file when one is provided).
            file_input = gr.File(
                label="Or Upload Text File (.txt)",
                file_types=[".txt"],
                type="filepath",
                elem_classes=["compact-uploader"]
            )
            gr.Markdown("### Voice Reference")
            voice_input = gr.Audio(
                label="Voice Sample (WAV)",
                type="filepath",
                sources=["upload"],
                elem_classes=["compact-audio"]
            )
            # These sliders map directly onto MODEL.generate keyword args.
            with gr.Accordion("Advanced Options", open=False):
                exaggeration = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.7,
                    label="Exaggeration (0=subtle, 1=pronounced)"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.8,
                    label="Temperature (lower=consistent, higher=varied)"
                )
                cfg_weight = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    label="CFG Weight (prompt adherence)"
                )
            generate_btn = gr.Button("Generate Audio", variant="primary", size="lg", elem_id="generate-btn")
        # Right panel: status text and the final audio player.
        with gr.Column(scale=1, elem_classes=["panel", "panel-delayed"]):
            gr.Markdown("### Generated Audio")
            status_output = gr.Textbox(
                label="Status",
                placeholder="Ready to generate...",
                lines=2,
                interactive=False
            )
            audio_output = gr.Audio(
                label="Your Generated Speech",
                type="filepath",
                interactive=False,
                elem_classes=["compact-audio"]
            )
            gr.Markdown("""
            <div class="note">
                <strong>Generation flow:</strong>
                Text is split into clean chunks, each chunk is synthesized with your voice sample,
                and the results are stitched into a single WAV file.
            </div>
            """)
    with gr.Accordion("Quick Start", open=False, elem_classes=["panel"]):
        gr.Markdown("""
        **Step-by-step:**
        1. Paste text or upload a TXT file
        2. Upload a clear WAV voice sample (10-30 seconds)
        3. Adjust the sliders if needed
        4. Click Generate Audio and wait for the result
        **Pro tips:**
        - Use clean, noise-free recordings for the best cloning
        - Longer scripts are auto-split into ~800 character chunks
        - Lower temperature sounds more consistent, higher sounds more varied
        """)
    # Inputs map positionally onto generate_audio(text, text_file, voice_file,
    # exaggeration, temperature, cfg_weight); api_name=False keeps this
    # endpoint out of the auto-generated API docs.
    generate_btn.click(
        fn=generate_audio,
        inputs=[text_input, file_input, voice_input, exaggeration, temperature, cfg_weight],
        outputs=[audio_output, status_output],
        api_name=False
    )
| if __name__ == "__main__": | |
| demo.launch(show_api=False) | |