# worship-agent / app.py
# (Hugging Face Space header residue: author NextDrought, commit d89679b "Keep fixing")
import os
import torch
import torchaudio as ta
import numpy as np
import re
import gradio as gr
from chatterbox.tts import ChatterboxTTS
import tempfile
import shutil
import warnings
# Determine the best available device: prefer CUDA, fall back to Apple MPS,
# else CPU. Checked once at import; everything below reads DEVICE.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if DEVICE == "cpu" and torch.backends.mps.is_available():
    DEVICE = "mps"  # Use Apple Silicon GPU if available
print(f"Using device: {DEVICE}")
# Hide diffusers LoRA deprecation noise in the UI logs
warnings.filterwarnings(
    "ignore",
    message=r".*LoRACompatibleLinear.*",
    category=FutureWarning,
)
# Monkey patch torch.load to always use map_location, so checkpoints saved on
# another device (e.g. CUDA) can still be deserialized on the device we have.
original_torch_load = torch.load

def patched_torch_load(*args, **kwargs):
    """Call torch.load, defaulting map_location to DEVICE.

    Respects an explicit map_location passed either as a keyword OR
    positionally (map_location is torch.load's second positional argument);
    the original patch only checked kwargs and would have injected a
    duplicate keyword for positional callers, raising TypeError.
    """
    if "map_location" not in kwargs and len(args) < 2:
        kwargs["map_location"] = torch.device(DEVICE)
    return original_torch_load(*args, **kwargs)

# Apply the patch globally; model loaders below call torch.load internally.
torch.load = patched_torch_load
def load_model():
    """Instantiate the Chatterbox TTS model on DEVICE.

    Returns the model instance, or None if loading fails for any reason
    (the UI checks for None before attempting generation).
    """
    try:
        return ChatterboxTTS.from_pretrained(DEVICE)
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Load once at import time so every request shares the same model instance.
MODEL = load_model()
def split_text_into_chunks(text, max_length=800):
    """Split *text* into chunks of at most *max_length* characters.

    Splitting prefers sentence boundaries (. ! ?), falls back to phrase
    boundaries (, ;) for oversized sentences, and finally hard-splits any
    single phrase longer than *max_length*, so no chunk exceeds the cap.

    Returns a list of stripped, non-empty chunk strings ([] for empty text).
    """
    # Normalize whitespace so length accounting is predictable.
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    # Sentence-level split keeps natural pause points intact.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # If this sentence alone exceeds max_length, split it further
        if len(sentence) > max_length:
            # Flush whatever was accumulated before handling the oversized one.
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = ""
            # The lookbehind split keeps the trailing , or ; on each phrase,
            # so re-join with a plain space (the old ", " join doubled the
            # punctuation, producing "foo,, bar").
            phrases = re.split(r'(?<=[,;])\s+', sentence)
            phrase_chunk = ""
            for phrase in phrases:
                if len(phrase) > max_length:
                    # A single phrase over the cap: hard-split by characters;
                    # the short tail stays as the running phrase_chunk.
                    if phrase_chunk:
                        chunks.append(phrase_chunk.strip())
                        phrase_chunk = ""
                    for start in range(0, len(phrase), max_length):
                        piece = phrase[start:start + max_length]
                        if len(piece) == max_length:
                            chunks.append(piece.strip())
                        else:
                            phrase_chunk = piece
                elif len(phrase_chunk) + len(phrase) + 1 > max_length:
                    if phrase_chunk:
                        chunks.append(phrase_chunk.strip())
                    phrase_chunk = phrase
                else:
                    phrase_chunk = f"{phrase_chunk} {phrase}" if phrase_chunk else phrase
            if phrase_chunk:
                current_chunk = phrase_chunk
        # If adding this sentence would exceed max_length, save current chunk
        # and start a new one.
        elif len(current_chunk) + len(sentence) + 1 > max_length:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
def resolve_file_path(file_obj):
    """Normalize the various objects Gradio may hand us into a path string.

    Accepts None, a plain path string, a dict carrying "name"/"path" keys,
    or any object exposing a .name attribute. Returns None when no usable
    path can be extracted.
    """
    if file_obj is None:
        return None
    if isinstance(file_obj, str):
        return file_obj
    if isinstance(file_obj, dict):
        return file_obj.get("name") or file_obj.get("path")
    # Fall back to a .name attribute (e.g. tempfile wrappers); None otherwise.
    return getattr(file_obj, "name", None)
def generate_audio(text, text_file, voice_file, exaggeration, temperature, cfg_weight, progress=gr.Progress()):
    """Synthesize speech for *text* (or an uploaded .txt file) using the
    uploaded voice sample as the cloning reference.

    The text is chunked, each chunk is synthesized separately, and the
    results are concatenated into a single WAV.

    Returns:
        (output_path, status_message) — output_path is None on error.
    """
    if MODEL is None:
        return None, "Error: Model failed to load."
    # An uploaded text file takes precedence over the textbox contents.
    text_file_path = resolve_file_path(text_file)
    if text_file_path:
        try:
            with open(text_file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        except Exception as e:
            return None, f"Error reading text file: {str(e)}"
    if not text:
        return None, "Error: No text provided."
    voice_file_path = resolve_file_path(voice_file)
    if voice_file_path is None:
        return None, "Error: No voice reference file provided."
    # Scratch space for per-chunk WAVs; always removed in the finally block.
    temp_dir = tempfile.mkdtemp()
    try:
        chunks = split_text_into_chunks(text, max_length=800)
        progress(0, desc=f"Split text into {len(chunks)} chunks")
        audio_files = []
        for i, chunk in enumerate(chunks):
            progress(i / len(chunks), desc=f"Generating chunk {i+1}/{len(chunks)}")
            # Generate audio for this chunk with the shared voice reference.
            wav = MODEL.generate(
                chunk,
                audio_prompt_path=voice_file_path,
                exaggeration=exaggeration,
                temperature=temperature,
                cfg_weight=cfg_weight,
            )
            output_file = os.path.join(temp_dir, f"chunk_{i+1}.wav")
            ta.save(output_file, wav, MODEL.sr)
            audio_files.append(output_file)
        progress(0.9, desc="Concatenating audio...")
        # Reload each chunk; resample defensively if a file's rate differs
        # from the model's sample rate.
        waveforms = []
        sample_rate = MODEL.sr
        for file in audio_files:
            waveform, sr = ta.load(file)
            if sr != sample_rate:
                waveform = ta.functional.resample(waveform, sr, sample_rate)
            waveforms.append(waveform)
        if not waveforms:
            return None, "Error: No audio generated."
        concatenated = torch.cat(waveforms, dim=1)
        # Write the result to a unique temp file rather than a fixed
        # "output.wav" in the CWD, so concurrent sessions cannot overwrite
        # each other's output and stale files do not accumulate.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            final_output_path = tmp.name
        ta.save(final_output_path, concatenated, sample_rate)
        progress(1.0, desc="Done")
        return final_output_path, f"Successfully generated audio from {len(chunks)} chunks."
    except Exception as e:
        return None, f"Error during generation: {str(e)}"
    finally:
        # Cleanup temp dir (the final output lives outside it).
        shutil.rmtree(temp_dir, ignore_errors=True)
# Gradio Interface with Custom Theme
# Hand-written stylesheet injected into the Blocks app below: warm "paper"
# palette via CSS variables, a hero banner, card-style panels, compact
# upload widgets, and a subtle rise-in animation.
custom_css = """
@import url("https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600&family=Unbounded:wght@500;700&display=swap");
:root {
--bg: #f6f1e8;
--panel: #ffffff;
--panel-border: #eadfcf;
--ink: #1c2326;
--muted: #58646b;
--accent: #ff6b35;
--accent-2: #1b998b;
--accent-3: #ffb100;
--shadow: 0 20px 40px rgba(31, 35, 38, 0.12);
}
body,
.gradio-container {
background:
radial-gradient(900px 600px at 10% 0%, #ffe6c2 0%, rgba(255, 230, 194, 0) 60%),
radial-gradient(800px 500px at 90% 10%, #cdeff0 0%, rgba(205, 239, 240, 0) 55%),
linear-gradient(180deg, var(--bg) 0%, #fdf8f1 100%);
color: var(--ink);
font-family: "Space Grotesk", sans-serif;
}
.gradio-container {
max-width: 1100px;
margin: 0 auto;
padding: 2.5rem 1.5rem 3rem;
}
h1,
h2,
h3,
.hero-title {
font-family: "Unbounded", "Space Grotesk", sans-serif;
letter-spacing: -0.02em;
}
.hero {
position: relative;
padding: 2.2rem;
border-radius: 24px;
background: linear-gradient(135deg, rgba(255, 107, 53, 0.12), rgba(27, 153, 139, 0.12));
border: 1px solid rgba(255, 107, 53, 0.18);
box-shadow: var(--shadow);
overflow: hidden;
animation: rise 0.6s ease both;
}
.hero::after {
content: "";
position: absolute;
inset: 0;
background: radial-gradient(500px 200px at 80% 0%, rgba(255, 177, 0, 0.2), transparent 70%);
pointer-events: none;
}
.hero-badge {
display: inline-flex;
align-items: center;
gap: 0.4rem;
padding: 0.3rem 0.8rem;
border-radius: 999px;
font-size: 0.75rem;
text-transform: uppercase;
letter-spacing: 0.08em;
background: rgba(255, 177, 0, 0.2);
border: 1px solid rgba(255, 177, 0, 0.4);
color: var(--ink);
}
.hero-title {
margin: 0.8rem 0 0.4rem 0;
font-size: 2.4rem;
}
.hero-sub {
max-width: 620px;
font-size: 1rem;
color: var(--muted);
margin: 0 0 1.4rem 0;
}
.hero-stats {
display: flex;
flex-wrap: wrap;
gap: 0.8rem;
}
.hero-stat {
background: rgba(255, 255, 255, 0.6);
border: 1px solid rgba(234, 223, 207, 0.9);
border-radius: 14px;
padding: 0.6rem 0.9rem;
min-width: 150px;
}
.hero-stat span {
display: block;
font-size: 0.65rem;
text-transform: uppercase;
letter-spacing: 0.1em;
color: var(--muted);
}
.hero-stat strong {
font-size: 0.95rem;
}
.panel {
background: var(--panel);
border: 1px solid var(--panel-border);
border-radius: 20px;
padding: 1.4rem;
box-shadow: var(--shadow);
animation: rise 0.7s ease both;
}
.panel-delayed {
animation-delay: 0.12s;
}
.section-title {
margin: 0 0 0.5rem 0;
font-size: 1rem;
text-transform: uppercase;
letter-spacing: 0.1em;
color: var(--muted);
}
.helper-text {
font-size: 0.85rem;
color: var(--muted);
}
#generate-btn {
width: 100%;
border-radius: 999px;
background: linear-gradient(120deg, var(--accent), #ff9f1c);
border: none;
color: #1a1a1a;
font-weight: 600;
box-shadow: 0 12px 24px rgba(255, 107, 53, 0.3);
}
#generate-btn:hover {
transform: translateY(-1px);
}
.compact-uploader .upload-box,
.compact-uploader .file-preview,
.compact-audio .upload-box,
.compact-audio .audio-container {
min-height: 56px;
padding: 0.4rem 0.55rem;
border-radius: 10px;
}
.compact-uploader svg,
.compact-audio svg {
width: 16px;
height: 16px;
}
.compact-audio .audio-container {
gap: 0.6rem;
}
.compact-audio .audio-container button {
width: 28px;
height: 28px;
}
.gradio-container audio {
width: 100%;
}
.note {
background: rgba(27, 153, 139, 0.08);
border: 1px solid rgba(27, 153, 139, 0.2);
border-radius: 14px;
padding: 0.9rem 1rem;
color: var(--muted);
}
@keyframes rise {
from {
opacity: 0;
transform: translateY(12px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
"""
# Soft theme tuned to the warm palette defined in custom_css.
custom_theme = gr.themes.Soft(
    primary_hue="orange",
    secondary_hue="teal",
    neutral_hue="stone",
)
# UI layout: hero banner, two side-by-side panels (inputs | output),
# a quick-start accordion, and the click wiring to generate_audio.
with gr.Blocks(theme=custom_theme, css=custom_css, title="Chatterbox TTS Studio") as demo:
    # Hero banner (pure HTML, styled entirely by custom_css).
    gr.HTML("""
<section class="hero">
<div class="hero-badge">Voice Studio</div>
<h1 class="hero-title">Chatterbox TTS</h1>
<p class="hero-sub">
Turn long-form text into expressive speech using a single voice sample.
Upload a short WAV and generate a polished narration in minutes.
</p>
<div class="hero-stats">
<div class="hero-stat">
<span>Input</span>
<strong>Text or TXT</strong>
</div>
<div class="hero-stat">
<span>Voice</span>
<strong>10-30s WAV</strong>
</div>
<div class="hero-stat">
<span>Output</span>
<strong>Single WAV</strong>
</div>
</div>
</section>
""")
    with gr.Row():
        # Left panel: text + voice inputs and generation controls.
        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown("### Text Input")
            text_input = gr.Textbox(
                label="Your Text",
                lines=8,
                placeholder="Paste the text you want to convert to speech..."
            )
            # Optional .txt upload; takes precedence over the textbox.
            file_input = gr.File(
                label="Or Upload Text File (.txt)",
                file_types=[".txt"],
                type="filepath",
                elem_classes=["compact-uploader"]
            )
            gr.Markdown("### Voice Reference")
            voice_input = gr.Audio(
                label="Voice Sample (WAV)",
                type="filepath",
                sources=["upload"],
                elem_classes=["compact-audio"]
            )
            # Sliders map directly onto MODEL.generate keyword arguments.
            with gr.Accordion("Advanced Options", open=False):
                exaggeration = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.7,
                    label="Exaggeration (0=subtle, 1=pronounced)"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.8,
                    label="Temperature (lower=consistent, higher=varied)"
                )
                cfg_weight = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    label="CFG Weight (prompt adherence)"
                )
            generate_btn = gr.Button("Generate Audio", variant="primary", size="lg", elem_id="generate-btn")
        # Right panel: status line and the resulting audio player.
        with gr.Column(scale=1, elem_classes=["panel", "panel-delayed"]):
            gr.Markdown("### Generated Audio")
            status_output = gr.Textbox(
                label="Status",
                placeholder="Ready to generate...",
                lines=2,
                interactive=False
            )
            audio_output = gr.Audio(
                label="Your Generated Speech",
                type="filepath",
                interactive=False,
                elem_classes=["compact-audio"]
            )
            gr.Markdown("""
<div class="note">
<strong>Generation flow:</strong>
Text is split into clean chunks, each chunk is synthesized with your voice sample,
and the results are stitched into a single WAV file.
</div>
""")
    with gr.Accordion("Quick Start", open=False, elem_classes=["panel"]):
        gr.Markdown("""
**Step-by-step:**
1. Paste text or upload a TXT file
2. Upload a clear WAV voice sample (10-30 seconds)
3. Adjust the sliders if needed
4. Click Generate Audio and wait for the result
**Pro tips:**
- Use clean, noise-free recordings for the best cloning
- Longer scripts are auto-split into ~800 character chunks
- Lower temperature sounds more consistent, higher sounds more varied
""")
    # Wire the button to the synthesis function; api_name=False keeps this
    # endpoint out of the auto-generated API.
    generate_btn.click(
        fn=generate_audio,
        inputs=[text_input, file_input, voice_input, exaggeration, temperature, cfg_weight],
        outputs=[audio_output, status_output],
        api_name=False
    )
if __name__ == "__main__":
    # show_api=False hides the API documentation page for the whole app.
    demo.launch(show_api=False)