# worship-agent / app.py
# (Hugging Face Space header residue: author NextDrought, commit d89679b "Keep fixing")
import os
import torch
import torchaudio as ta
import numpy as np
import re
import gradio as gr
from chatterbox.tts import ChatterboxTTS
import tempfile
import shutil
import warnings
# Determine the best available device: prefer CUDA, fall back to Apple MPS,
# else CPU. Checked once at import; everything below reads DEVICE.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if DEVICE == "cpu" and torch.backends.mps.is_available():
    DEVICE = "mps"  # Use Apple Silicon GPU if available
print(f"Using device: {DEVICE}")
# Hide diffusers LoRA deprecation noise in the UI logs
warnings.filterwarnings(
    "ignore",
    message=r".*LoRACompatibleLinear.*",
    category=FutureWarning,
)
# Monkey patch torch.load to always use map_location, so checkpoints saved on
# another device (e.g. CUDA) can still be deserialized on the device we have.
original_torch_load = torch.load

def patched_torch_load(*args, **kwargs):
    """Call torch.load, defaulting map_location to DEVICE.

    Respects an explicit map_location passed either as a keyword OR
    positionally (map_location is torch.load's second positional argument);
    the original patch only checked kwargs and would have injected a
    duplicate keyword for positional callers, raising TypeError.
    """
    if "map_location" not in kwargs and len(args) < 2:
        kwargs["map_location"] = torch.device(DEVICE)
    return original_torch_load(*args, **kwargs)

# Apply the patch globally; model loaders below call torch.load internally.
torch.load = patched_torch_load
def load_model():
    """Instantiate the Chatterbox TTS model on DEVICE.

    Returns the model instance, or None if loading fails for any reason
    (the UI checks for None before attempting generation).
    """
    try:
        return ChatterboxTTS.from_pretrained(DEVICE)
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Load once at import time so every request shares the same model instance.
MODEL = load_model()
def split_text_into_chunks(text, max_length=800):
    """Split *text* into chunks of at most *max_length* characters.

    Splitting prefers sentence boundaries (. ! ?), falls back to phrase
    boundaries (, ;) for oversized sentences, and finally hard-splits any
    single phrase longer than *max_length*, so no chunk exceeds the cap.

    Returns a list of stripped, non-empty chunk strings ([] for empty text).
    """
    # Normalize whitespace so length accounting is predictable.
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    # Sentence-level split keeps natural pause points intact.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # If this sentence alone exceeds max_length, split it further
        if len(sentence) > max_length:
            # Flush whatever was accumulated before handling the oversized one.
            if current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = ""
            # The lookbehind split keeps the trailing , or ; on each phrase,
            # so re-join with a plain space (the old ", " join doubled the
            # punctuation, producing "foo,, bar").
            phrases = re.split(r'(?<=[,;])\s+', sentence)
            phrase_chunk = ""
            for phrase in phrases:
                if len(phrase) > max_length:
                    # A single phrase over the cap: hard-split by characters;
                    # the short tail stays as the running phrase_chunk.
                    if phrase_chunk:
                        chunks.append(phrase_chunk.strip())
                        phrase_chunk = ""
                    for start in range(0, len(phrase), max_length):
                        piece = phrase[start:start + max_length]
                        if len(piece) == max_length:
                            chunks.append(piece.strip())
                        else:
                            phrase_chunk = piece
                elif len(phrase_chunk) + len(phrase) + 1 > max_length:
                    if phrase_chunk:
                        chunks.append(phrase_chunk.strip())
                    phrase_chunk = phrase
                else:
                    phrase_chunk = f"{phrase_chunk} {phrase}" if phrase_chunk else phrase
            if phrase_chunk:
                current_chunk = phrase_chunk
        # If adding this sentence would exceed max_length, save current chunk
        # and start a new one.
        elif len(current_chunk) + len(sentence) + 1 > max_length:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
def resolve_file_path(file_obj):
    """Normalize the various objects Gradio may hand us into a path string.

    Accepts None, a plain path string, a dict carrying "name"/"path" keys,
    or any object exposing a .name attribute. Returns None when no usable
    path can be extracted.
    """
    if file_obj is None:
        return None
    if isinstance(file_obj, str):
        return file_obj
    if isinstance(file_obj, dict):
        return file_obj.get("name") or file_obj.get("path")
    # Fall back to a .name attribute (e.g. tempfile wrappers); None otherwise.
    return getattr(file_obj, "name", None)
def generate_audio(text, text_file, voice_file, exaggeration, temperature, cfg_weight, progress=gr.Progress()):
    """Synthesize speech for *text* (or an uploaded .txt file) using the
    uploaded voice sample as the cloning reference.

    The text is chunked, each chunk is synthesized separately, and the
    results are concatenated into a single WAV.

    Returns:
        (output_path, status_message) — output_path is None on error.
    """
    if MODEL is None:
        return None, "Error: Model failed to load."
    # An uploaded text file takes precedence over the textbox contents.
    text_file_path = resolve_file_path(text_file)
    if text_file_path:
        try:
            with open(text_file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        except Exception as e:
            return None, f"Error reading text file: {str(e)}"
    if not text:
        return None, "Error: No text provided."
    voice_file_path = resolve_file_path(voice_file)
    if voice_file_path is None:
        return None, "Error: No voice reference file provided."
    # Scratch space for per-chunk WAVs; always removed in the finally block.
    temp_dir = tempfile.mkdtemp()
    try:
        chunks = split_text_into_chunks(text, max_length=800)
        progress(0, desc=f"Split text into {len(chunks)} chunks")
        audio_files = []
        for i, chunk in enumerate(chunks):
            progress(i / len(chunks), desc=f"Generating chunk {i+1}/{len(chunks)}")
            # Generate audio for this chunk with the shared voice reference.
            wav = MODEL.generate(
                chunk,
                audio_prompt_path=voice_file_path,
                exaggeration=exaggeration,
                temperature=temperature,
                cfg_weight=cfg_weight,
            )
            output_file = os.path.join(temp_dir, f"chunk_{i+1}.wav")
            ta.save(output_file, wav, MODEL.sr)
            audio_files.append(output_file)
        progress(0.9, desc="Concatenating audio...")
        # Reload each chunk; resample defensively if a file's rate differs
        # from the model's sample rate.
        waveforms = []
        sample_rate = MODEL.sr
        for file in audio_files:
            waveform, sr = ta.load(file)
            if sr != sample_rate:
                waveform = ta.functional.resample(waveform, sr, sample_rate)
            waveforms.append(waveform)
        if not waveforms:
            return None, "Error: No audio generated."
        concatenated = torch.cat(waveforms, dim=1)
        # Write the result to a unique temp file rather than a fixed
        # "output.wav" in the CWD, so concurrent sessions cannot overwrite
        # each other's output and stale files do not accumulate.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            final_output_path = tmp.name
        ta.save(final_output_path, concatenated, sample_rate)
        progress(1.0, desc="Done")
        return final_output_path, f"Successfully generated audio from {len(chunks)} chunks."
    except Exception as e:
        return None, f"Error during generation: {str(e)}"
    finally:
        # Cleanup temp dir (the final output lives outside it).
        shutil.rmtree(temp_dir, ignore_errors=True)
# Gradio Interface with Custom Theme
# Hand-written stylesheet injected into the Blocks app below: warm "paper"
# palette via CSS variables, a hero banner, card-style panels, compact
# upload widgets, and a subtle rise-in animation.
custom_css = """
@import url("https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600&family=Unbounded:wght@500;700&display=swap");
:root {
--bg: #f6f1e8;
--panel: #ffffff;
--panel-border: #eadfcf;
--ink: #1c2326;
--muted: #58646b;
--accent: #ff6b35;
--accent-2: #1b998b;
--accent-3: #ffb100;
--shadow: 0 20px 40px rgba(31, 35, 38, 0.12);
}
body,
.gradio-container {
background:
radial-gradient(900px 600px at 10% 0%, #ffe6c2 0%, rgba(255, 230, 194, 0) 60%),
radial-gradient(800px 500px at 90% 10%, #cdeff0 0%, rgba(205, 239, 240, 0) 55%),
linear-gradient(180deg, var(--bg) 0%, #fdf8f1 100%);
color: var(--ink);
font-family: "Space Grotesk", sans-serif;
}
.gradio-container {
max-width: 1100px;
margin: 0 auto;
padding: 2.5rem 1.5rem 3rem;
}
h1,
h2,
h3,
.hero-title {
font-family: "Unbounded", "Space Grotesk", sans-serif;
letter-spacing: -0.02em;
}
.hero {
position: relative;
padding: 2.2rem;
border-radius: 24px;
background: linear-gradient(135deg, rgba(255, 107, 53, 0.12), rgba(27, 153, 139, 0.12));
border: 1px solid rgba(255, 107, 53, 0.18);
box-shadow: var(--shadow);
overflow: hidden;
animation: rise 0.6s ease both;
}
.hero::after {
content: "";
position: absolute;
inset: 0;
background: radial-gradient(500px 200px at 80% 0%, rgba(255, 177, 0, 0.2), transparent 70%);
pointer-events: none;
}
.hero-badge {
display: inline-flex;
align-items: center;
gap: 0.4rem;
padding: 0.3rem 0.8rem;
border-radius: 999px;
font-size: 0.75rem;
text-transform: uppercase;
letter-spacing: 0.08em;
background: rgba(255, 177, 0, 0.2);
border: 1px solid rgba(255, 177, 0, 0.4);
color: var(--ink);
}
.hero-title {
margin: 0.8rem 0 0.4rem 0;
font-size: 2.4rem;
}
.hero-sub {
max-width: 620px;
font-size: 1rem;
color: var(--muted);
margin: 0 0 1.4rem 0;
}
.hero-stats {
display: flex;
flex-wrap: wrap;
gap: 0.8rem;
}
.hero-stat {
background: rgba(255, 255, 255, 0.6);
border: 1px solid rgba(234, 223, 207, 0.9);
border-radius: 14px;
padding: 0.6rem 0.9rem;
min-width: 150px;
}
.hero-stat span {
display: block;
font-size: 0.65rem;
text-transform: uppercase;
letter-spacing: 0.1em;
color: var(--muted);
}
.hero-stat strong {
font-size: 0.95rem;
}
.panel {
background: var(--panel);
border: 1px solid var(--panel-border);
border-radius: 20px;
padding: 1.4rem;
box-shadow: var(--shadow);
animation: rise 0.7s ease both;
}
.panel-delayed {
animation-delay: 0.12s;
}
.section-title {
margin: 0 0 0.5rem 0;
font-size: 1rem;
text-transform: uppercase;
letter-spacing: 0.1em;
color: var(--muted);
}
.helper-text {
font-size: 0.85rem;
color: var(--muted);
}
#generate-btn {
width: 100%;
border-radius: 999px;
background: linear-gradient(120deg, var(--accent), #ff9f1c);
border: none;
color: #1a1a1a;
font-weight: 600;
box-shadow: 0 12px 24px rgba(255, 107, 53, 0.3);
}
#generate-btn:hover {
transform: translateY(-1px);
}
.compact-uploader .upload-box,
.compact-uploader .file-preview,
.compact-audio .upload-box,
.compact-audio .audio-container {
min-height: 56px;
padding: 0.4rem 0.55rem;
border-radius: 10px;
}
.compact-uploader svg,
.compact-audio svg {
width: 16px;
height: 16px;
}
.compact-audio .audio-container {
gap: 0.6rem;
}
.compact-audio .audio-container button {
width: 28px;
height: 28px;
}
.gradio-container audio {
width: 100%;
}
.note {
background: rgba(27, 153, 139, 0.08);
border: 1px solid rgba(27, 153, 139, 0.2);
border-radius: 14px;
padding: 0.9rem 1rem;
color: var(--muted);
}
@keyframes rise {
from {
opacity: 0;
transform: translateY(12px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
"""
# Soft theme tuned to the warm palette defined in custom_css.
custom_theme = gr.themes.Soft(
    primary_hue="orange",
    secondary_hue="teal",
    neutral_hue="stone",
)
# UI layout: hero banner, two side-by-side panels (inputs | output),
# a quick-start accordion, and the click wiring to generate_audio.
with gr.Blocks(theme=custom_theme, css=custom_css, title="Chatterbox TTS Studio") as demo:
    # Hero banner (pure HTML, styled entirely by custom_css).
    gr.HTML("""
<section class="hero">
<div class="hero-badge">Voice Studio</div>
<h1 class="hero-title">Chatterbox TTS</h1>
<p class="hero-sub">
Turn long-form text into expressive speech using a single voice sample.
Upload a short WAV and generate a polished narration in minutes.
</p>
<div class="hero-stats">
<div class="hero-stat">
<span>Input</span>
<strong>Text or TXT</strong>
</div>
<div class="hero-stat">
<span>Voice</span>
<strong>10-30s WAV</strong>
</div>
<div class="hero-stat">
<span>Output</span>
<strong>Single WAV</strong>
</div>
</div>
</section>
""")
    with gr.Row():
        # Left panel: text + voice inputs and generation controls.
        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown("### Text Input")
            text_input = gr.Textbox(
                label="Your Text",
                lines=8,
                placeholder="Paste the text you want to convert to speech..."
            )
            # Optional .txt upload; takes precedence over the textbox.
            file_input = gr.File(
                label="Or Upload Text File (.txt)",
                file_types=[".txt"],
                type="filepath",
                elem_classes=["compact-uploader"]
            )
            gr.Markdown("### Voice Reference")
            voice_input = gr.Audio(
                label="Voice Sample (WAV)",
                type="filepath",
                sources=["upload"],
                elem_classes=["compact-audio"]
            )
            # Sliders map directly onto MODEL.generate keyword arguments.
            with gr.Accordion("Advanced Options", open=False):
                exaggeration = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.7,
                    label="Exaggeration (0=subtle, 1=pronounced)"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.8,
                    label="Temperature (lower=consistent, higher=varied)"
                )
                cfg_weight = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    label="CFG Weight (prompt adherence)"
                )
            generate_btn = gr.Button("Generate Audio", variant="primary", size="lg", elem_id="generate-btn")
        # Right panel: status line and the resulting audio player.
        with gr.Column(scale=1, elem_classes=["panel", "panel-delayed"]):
            gr.Markdown("### Generated Audio")
            status_output = gr.Textbox(
                label="Status",
                placeholder="Ready to generate...",
                lines=2,
                interactive=False
            )
            audio_output = gr.Audio(
                label="Your Generated Speech",
                type="filepath",
                interactive=False,
                elem_classes=["compact-audio"]
            )
            gr.Markdown("""
<div class="note">
<strong>Generation flow:</strong>
Text is split into clean chunks, each chunk is synthesized with your voice sample,
and the results are stitched into a single WAV file.
</div>
""")
    with gr.Accordion("Quick Start", open=False, elem_classes=["panel"]):
        gr.Markdown("""
**Step-by-step:**
1. Paste text or upload a TXT file
2. Upload a clear WAV voice sample (10-30 seconds)
3. Adjust the sliders if needed
4. Click Generate Audio and wait for the result
**Pro tips:**
- Use clean, noise-free recordings for the best cloning
- Longer scripts are auto-split into ~800 character chunks
- Lower temperature sounds more consistent, higher sounds more varied
""")
    # Wire the button to the synthesis function; api_name=False keeps this
    # endpoint out of the auto-generated API.
    generate_btn.click(
        fn=generate_audio,
        inputs=[text_input, file_input, voice_input, exaggeration, temperature, cfg_weight],
        outputs=[audio_output, status_output],
        api_name=False
    )
if __name__ == "__main__":
    # show_api=False hides the API documentation page for the whole app.
    demo.launch(show_api=False)