ttsmakerbe / app.py
andevs's picture
Update app.py
b02eaf8 verified
Raw
History Blame Contribute Delete
19.4 kB
# ============================================================
# Auto-accept Coqui license - MUST BE FIRST
# ============================================================
import builtins
_original_input = builtins.input

# Substrings that mark a prompt as a licence/confirmation question.
# 'y/n' and '>' are deliberately broad so that any interactive prompt shown
# while the model is being fetched gets answered automatically.
_AUTO_ACCEPT_KEYWORDS = ('license', 'agree', 'confirm', 'cpml', 'coqui', 'y/n', '>')


def _patched_input(prompt=""):
    """Drop-in replacement for ``input`` that auto-answers licence prompts.

    Args:
        prompt: Anything the builtin ``input`` accepts. It is optional and is
            converted with ``str()`` before matching, mirroring how the
            builtin displays it, so ``input()`` and non-string prompts no
            longer crash the wrapper.

    Returns:
        ``"y"`` when the prompt contains one of the auto-accept keywords
        (case-insensitive); otherwise whatever the original ``input`` returns.
    """
    prompt_text = str(prompt)
    lowered = prompt_text.lower()
    if any(keyword in lowered for keyword in _AUTO_ACCEPT_KEYWORDS):
        print(f"[AUTO-ACCEPT] {prompt_text.strip()}")
        return "y"
    return _original_input(prompt)


# Installed before the TTS library is imported so its prompt hits the wrapper.
builtins.input = _patched_input
import gradio as gr
import torch
import tempfile
import os
import time
import subprocess
import numpy as np
import re
import hashlib
from pathlib import Path
# ============================================================
# CONFIGURATION
# ============================================================
# Upper bound on the number of characters accepted by one synthesis call.
MAX_TEXT_LENGTH = 2000

# Language codes offered in the UI language dropdowns.
SUPPORTED_LANGUAGES = [
    "en", "es", "fr", "de", "it", "pt", "pl",
    "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja",
]

# Maps the content hash of a reference file to its already-converted WAV path,
# so the same upload is not pushed through ffmpeg twice.
_audio_cache = {}

# Lazily loaded model object and the flags describing its loading state.
_tts = None
_model_loading = False
_model_error = None
_model_loaded = False
# ============================================================
# AUDIO PROCESSING FUNCTIONS (Optimized)
# ============================================================
def get_file_hash(file_path, chunk_size=1 << 20):
    """Return a short content fingerprint of a file, used as a cache key.

    The file is hashed in fixed-size chunks so a large upload is never held
    in memory all at once.

    Args:
        file_path: Path of the file to fingerprint.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The first 16 hex characters of the file's MD5 digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the empty read that marks EOF.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()[:16]
def check_audio_duration(file_path):
    """Return the duration of an audio file in seconds, as reported by ffmpeg.

    ffmpeg is run with only an input file; it then prints the stream
    description (including a ``Duration: HH:MM:SS.xx`` line) to stderr, which
    is parsed here.

    Args:
        file_path: Path of the audio file to inspect.

    Returns:
        The duration in seconds, or 0 when ffmpeg cannot be run, times out,
        or prints no parsable duration. This function never raises.
    """
    try:
        # NOTE: capture_output=True already pipes stderr. Passing
        # stderr=PIPE as well makes subprocess.run raise ValueError, which
        # previously made this function return 0 for every file.
        result = subprocess.run(
            ['ffmpeg', '-i', file_path],
            capture_output=True,
            text=True,
            errors='replace',  # file metadata is not guaranteed to be valid UTF-8
            timeout=30,
        )
    except (OSError, subprocess.SubprocessError, TypeError, ValueError):
        # Missing ffmpeg binary, timeout, or an unusable path such as None.
        return 0

    match = re.search(r'Duration: (\d+):(\d{2}):(\d{2}(?:\.\d+)?)', result.stderr)
    if match is None:
        return 0
    hours, minutes, seconds = match.groups()
    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)
def optimize_audio_for_xtts(input_path):
    """Convert a reference clip to a mono 22050 Hz WAV for XTTS.

    A filtered conversion (loudness normalisation, leading-silence removal,
    80 Hz high-pass, 8 kHz low-pass, 16-bit PCM) is tried first. If it fails
    or produces a suspiciously small file, a plain resample/downmix is used
    instead. Results are cached by content hash in ``_audio_cache``.

    Args:
        input_path: Path of the audio file to convert.

    Returns:
        Path of the converted WAV file (a new temp file, or a cached one that
        still exists on disk).

    Raises:
        subprocess.CalledProcessError: If the fallback conversion fails.
        subprocess.TimeoutExpired: If the fallback conversion does not finish.
        OSError: If the input cannot be read or ffmpeg cannot be started.
    """
    file_hash = get_file_hash(input_path)
    cached_path = _audio_cache.get(file_hash)
    if cached_path and os.path.exists(cached_path):
        return cached_path

    # Only the path is needed (ffmpeg writes the file), so close the handle
    # right away instead of leaving an open file object behind.
    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    filtered_cmd = [
        'ffmpeg', '-i', input_path,
        '-ac', '1',        # mono
        '-ar', '22050',    # 22050 Hz
        '-af', 'loudnorm=I=-16:LRA=11:TP=-1.5, silenceremove=1:0:-50dB, highpass=f=80, lowpass=f=8000',
        '-acodec', 'pcm_s16le',
        '-y', output_path
    ]
    try:
        subprocess.run(filtered_cmd, capture_output=True, check=True, timeout=30)
        # A tiny output means the filters removed (almost) everything.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 10000:
            _audio_cache[file_hash] = output_path
            return output_path
    except (subprocess.SubprocessError, OSError) as exc:
        print(f"[AUDIO] Filtered conversion failed, using plain conversion: {exc}")

    # Fallback: plain downmix + resample, no filters.
    plain_cmd = ['ffmpeg', '-i', input_path, '-ac', '1', '-ar', '22050', '-y', output_path]
    try:
        subprocess.run(plain_cmd, capture_output=True, check=True, timeout=60)
    except (subprocess.SubprocessError, OSError):
        # Do not leave an empty or partial temp file behind.
        try:
            os.unlink(output_path)
        except OSError:
            pass
        raise
    _audio_cache[file_hash] = output_path
    return output_path
def extract_best_segment(input_path, target_duration=10):
    """Cut a short reference segment out of a long recording and optimise it.

    Clips no longer than ``target_duration + 2`` seconds are optimised as-is.
    For longer clips, leading silence is stripped and the first
    ``target_duration`` seconds are kept. If that fails, or yields less than
    5 seconds, the first ``target_duration`` seconds of the raw input are
    taken instead.

    Args:
        input_path: Path of the source audio file.
        target_duration: Desired segment length in seconds.

    Returns:
        Path of the optimised WAV produced by ``optimize_audio_for_xtts``.

    Raises:
        subprocess.CalledProcessError: If the fallback cut or the final
            optimisation fails.
        subprocess.TimeoutExpired: If the fallback cut does not finish.
        OSError: If ffmpeg cannot be started or files cannot be accessed.
    """
    duration = check_audio_duration(input_path)
    if duration <= target_duration + 2:
        return optimize_audio_for_xtts(input_path)

    # Intermediate file holding the raw cut. optimize_audio_for_xtts writes
    # its result to a different path, so this one is always removed below.
    fd, segment_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        trim_cmd = [
            'ffmpeg', '-i', input_path,
            '-af', f'silenceremove=1:0:-50dB, atrim=0:{target_duration}',
            '-ac', '1', '-ar', '22050',
            '-y', segment_path
        ]
        try:
            subprocess.run(trim_cmd, capture_output=True, check=True, timeout=30)
            if check_audio_duration(segment_path) >= 5:
                return optimize_audio_for_xtts(segment_path)
        except (subprocess.SubprocessError, OSError) as exc:
            print(f"[AUDIO] Silence-trimmed cut failed, using plain cut: {exc}")

        # Fallback: take the first target_duration seconds without filtering.
        plain_cmd = ['ffmpeg', '-i', input_path, '-t', str(target_duration), '-ac', '1', '-ar', '22050', '-y', segment_path]
        subprocess.run(plain_cmd, capture_output=True, check=True, timeout=60)
        return optimize_audio_for_xtts(segment_path)
    finally:
        try:
            os.unlink(segment_path)
        except OSError:
            pass
# ============================================================
# MODEL LOADING (Optimized for speed)
# ============================================================
def is_model_downloaded():
    """Report whether the XTTS v2 model directory exists in the user's TTS data folder."""
    model_dir = "~/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2"
    resolved = os.path.expanduser(model_dir)
    return os.path.exists(resolved)
def get_tts():
    """Return the shared XTTS model, loading it on first use.

    Returns:
        The loaded TTS model object (cached in the module-level ``_tts``).

    Raises:
        Exception: If a previous load failed (the stored message is
            re-raised), if another load is currently in progress, or if this
            load fails. A fresh failure is chained to its original cause.
    """
    global _tts, _model_loading, _model_error, _model_loaded
    if _tts is not None:
        return _tts
    if _model_error is not None:
        raise Exception(_model_error)
    if _model_loading:
        raise Exception("Model is loading, please wait...")

    _model_loading = True
    try:
        # Imported lazily so the app can start before the heavy library loads.
        from TTS.api import TTS
        print("Loading XTTS model...")
        use_gpu = torch.cuda.is_available()
        model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)
        if use_gpu:
            model.to("cuda")
            print("Model loaded on GPU - FAST")
        else:
            print("Model loaded on CPU - slower")
    except Exception as e:
        _model_error = str(e)
        raise Exception(_model_error) from e
    finally:
        # Reset on every exit path so a failed or interrupted load cannot
        # leave the app permanently reporting "loading".
        _model_loading = False

    # Publish only a fully initialised model: if the device move above fails,
    # later callers must not receive a half-configured object.
    _tts = model
    _model_loaded = True
    return _tts
# ============================================================
# SYNTHESIS FUNCTIONS (Optimized)
# ============================================================
def synthesize(text, reference_audio, language="en", speed=1.0, progress=gr.Progress()):
    """Speak ``text`` in the voice of ``reference_audio`` and return a WAV path.

    Args:
        text: Text to synthesize; must be non-blank and at most
            ``MAX_TEXT_LENGTH`` characters.
        reference_audio: File path of the reference voice clip.
        language: Language code passed to the model.
        speed: Speed value passed to the model.
        progress: Gradio progress tracker.

    Returns:
        Path of the generated WAV file.

    Raises:
        gr.Error: On invalid input or when the model fails to synthesize.
        Exception: Errors from audio preparation or model loading propagate
            unchanged.
    """
    progress(0, desc="Validating...")
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize")
    if len(text) > MAX_TEXT_LENGTH:
        raise gr.Error(f"Text exceeds {MAX_TEXT_LENGTH} characters")
    if reference_audio is None:
        raise gr.Error("Please upload or record a reference audio")

    progress(0.05, desc="Analyzing audio...")
    duration = check_audio_duration(reference_audio)
    # A duration of 0 means "could not be measured", so it is not rejected.
    if 0 < duration < 3:
        raise gr.Error(f"Audio too short ({duration:.1f}s). Need 6-10 seconds")
    if duration > 30:
        gr.Warning(f"Long audio ({duration:.1f}s). Extracting best 10-second segment for better quality")

    progress(0.1, desc="Optimizing audio...")
    processed_audio = None
    output_path = None
    finished = False
    try:
        # Long recordings are cut down to a short segment first.
        if duration > 15:
            processed_audio = extract_best_segment(reference_audio, target_duration=10)
        else:
            processed_audio = optimize_audio_for_xtts(reference_audio)

        progress(0.2, desc="Loading model...")
        tts = get_tts()

        progress(0.4, desc="Synthesizing...")
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            tts.tts_to_file(
                text=text.strip(),
                speaker_wav=processed_audio,
                language=language,
                file_path=output_path,
                speed=speed,
            )
        except Exception as e:
            error = str(e)
            # Also matches "list index out of range".
            if "index out of range" in error:
                raise gr.Error(
                    "Voice extraction failed.\n\n"
                    "SOLUTION: Use microphone recording:\n"
                    "1. Click the microphone icon\n"
                    "2. Record 6-10 seconds clearly\n"
                    "3. Say: 'Hello, this is my test voice'\n"
                    "4. This works every time!\n\n"
                    "For uploaded files: Keep them short (6-10 seconds) and use WAV format."
                ) from e
            raise gr.Error(f"Error: {error[:200]}") from e

        finished = True
        progress(1.0, desc="Complete!")
        return output_path
    finally:
        # Runs on success and on failure, so temp files do not pile up when
        # synthesis fails. The output file is removed only when unfinished.
        stale_paths = []
        if processed_audio and processed_audio != reference_audio:
            stale_paths.append(processed_audio)
        if output_path and not finished:
            stale_paths.append(output_path)
        for stale in stale_paths:
            try:
                os.unlink(stale)
            except OSError:
                pass
# ============================================================
# BATCH SYNTHESIS
# ============================================================
def _split_into_chunks(text, chunk_size):
    """Greedily pack whole sentences into chunks of at most ``chunk_size`` characters.

    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace.
    A single sentence longer than ``chunk_size`` is kept whole as one chunk.
    """
    chunks = []
    current = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if not sentence:
            continue
        if not current:
            current = sentence
        elif len(current) + len(sentence) + 1 <= chunk_size:
            current = f"{current} {sentence}"
        else:
            chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks


def batch_synthesize(text, audio, language, speed, chunk_size=500, progress=gr.Progress()):
    """Synthesize long text chunk by chunk and join the audio into one WAV.

    Text of at most ``MAX_TEXT_LENGTH`` characters is delegated to
    ``synthesize``. Longer text is split at sentence boundaries, each chunk
    is synthesized with the same prepared reference clip, and the pieces are
    concatenated with pydub.

    Args:
        text: Text to synthesize; must be non-blank.
        audio: File path of the reference voice clip.
        language: Language code passed to the model.
        speed: Speed value passed to the model.
        chunk_size: Target maximum number of characters per chunk.
        progress: Gradio progress tracker.

    Returns:
        Path of the combined WAV file.

    Raises:
        gr.Error: On blank text, missing reference audio, a failed chunk, or
            a failure while combining the chunk audio.
        Exception: Errors from audio preparation or model loading propagate
            unchanged.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text")
    if len(text) <= MAX_TEXT_LENGTH:
        return synthesize(text, audio, language, speed, progress)
    # synthesize() validates this itself; the chunked path needs its own check,
    # otherwise a missing clip only fails later inside audio preparation.
    if audio is None:
        raise gr.Error("Please upload or record a reference audio")

    progress(0.05, desc="Splitting text...")
    chunks = _split_into_chunks(text, chunk_size)
    progress(0.1, desc=f"Processing {len(chunks)} chunks...")

    processed_audio = None
    audio_files = []
    try:
        # Prepare the reference clip once and reuse it for every chunk.
        duration = check_audio_duration(audio)
        if duration > 15:
            processed_audio = extract_best_segment(audio, target_duration=10)
        else:
            processed_audio = optimize_audio_for_xtts(audio)

        tts = get_tts()
        for i, chunk in enumerate(chunks):
            prog = 0.1 + (0.7 * (i + 1) / len(chunks))
            progress(prog, desc=f"Chunk {i+1}/{len(chunks)}...")
            fd, chunk_path = tempfile.mkstemp(suffix=f"_{i}.wav")
            os.close(fd)
            # Registered before synthesis so it is cleaned up even on failure.
            audio_files.append(chunk_path)
            try:
                tts.tts_to_file(
                    text=chunk.strip(),
                    speaker_wav=processed_audio,
                    language=language,
                    file_path=chunk_path,
                    speed=speed,
                )
            except Exception as e:
                raise gr.Error(f"Chunk {i+1}/{len(chunks)} failed: {str(e)[:200]}") from e

        progress(0.85, desc="Combining audio...")
        output = None
        try:
            from pydub import AudioSegment
            combined = AudioSegment.empty()
            for chunk_file in audio_files:
                combined += AudioSegment.from_wav(chunk_file)
            fd, output = tempfile.mkstemp(suffix="_combined.wav")
            os.close(fd)
            combined.export(output, format="wav")
        except Exception as e:
            if output:
                try:
                    os.unlink(output)
                except OSError:
                    pass
            raise gr.Error(f"Failed to combine: {str(e)[:100]}") from e

        progress(1.0, desc="Complete!")
        return output
    finally:
        # Remove per-chunk files and the prepared reference clip on every
        # exit path, not only after a successful run.
        stale_paths = list(audio_files)
        if processed_audio and processed_audio != audio:
            stale_paths.append(processed_audio)
        for stale in stale_paths:
            try:
                os.unlink(stale)
            except OSError:
                pass
# ============================================================
# UTILITIES
# ============================================================
def clear_cache():
    """Drop the loaded model and reset its loading state.

    Besides releasing the model reference, this clears the stored load error
    and the loaded flag. Without that, a failed load would keep being
    re-raised by ``get_tts`` until the process restarts.

    Returns:
        The status message ``"Cache cleared"``.
    """
    global _tts, _model_error, _model_loaded
    _tts = None
    _model_error = None
    _model_loaded = False
    if torch.cuda.is_available():
        # Ask PyTorch to release cached GPU memory now that the model is gone.
        torch.cuda.empty_cache()
    return "Cache cleared"
def get_status():
    """Return a short human-readable description of the model state."""
    if _tts is None:
        # No model yet: report loading, a stored failure, or the idle state.
        if _model_loading:
            return "Loading model..."
        if _model_error:
            return "Error"
        return "Ready - model loads on first use"
    if torch.cuda.is_available():
        return "Model ready (GPU)"
    return "Model ready (CPU)"
def estimate_duration(text, speed=1.0):
    """Estimate spoken length of ``text`` as ``"Ns"`` or ``"Mm Ns"``.

    The estimate assumes 160 characters per second at speed 1.0, scaled
    linearly by ``speed``. Empty or missing text yields ``"0s"``.
    """
    if not text:
        return "0s"
    total_seconds = len(text) / (160 * speed)
    whole_minutes, remainder = divmod(total_seconds, 60)
    minutes = int(whole_minutes)
    seconds = int(remainder)
    if minutes > 0:
        return f"{minutes}m {seconds}s"
    return f"{seconds}s"
# ============================================================
# UI - NO TRIPLE QUOTES
# ============================================================
# Builds the whole Gradio interface: a standard synthesis tab, a batch tab for
# long text, and a static help tab. Event handlers are wired inside the
# Blocks context so Gradio registers them on this app.
with gr.Blocks(title="VoxForge TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ VoxForge TTS - Professional Voice Cloning")
    gr.Markdown("Upload any voice or use microphone to record 6-10 seconds. First use downloads model (2-5 min).")
    with gr.Tabs():
        # TAB 1: Standard Synthesis
        with gr.Tab("Standard Synthesis"):
            with gr.Row():
                with gr.Column(scale=1):
                    text_input = gr.Textbox(label="Text to Synthesize", lines=6, max_length=MAX_TEXT_LENGTH, placeholder="Enter text up to 2000 characters... Example: Hello, this is my cloned voice.")
                    with gr.Row():
                        char_count = gr.Label(f"0/{MAX_TEXT_LENGTH}")
                        duration_est = gr.Label("Est. 0s")
                    ref_audio = gr.Audio(label="Reference Voice (6-10 seconds - Click microphone to record!)", type="filepath", sources=["upload", "microphone"])
                    with gr.Row():
                        language = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="en", label="Language")
                        speed = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Speed")
                    with gr.Row():
                        gen_btn = gr.Button("Generate Speech", variant="primary", size="lg")
                        clear_btn = gr.Button("Clear", variant="secondary", size="lg")
                    with gr.Accordion("Advanced Options", open=False):
                        status_text = gr.Label(get_status())
                        clear_cache_btn = gr.Button("Clear Model Cache", size="sm")
                with gr.Column(scale=1):
                    audio_output = gr.Audio(label="Generated Speech", type="filepath")
                    gr.Markdown("### Tips for Best Results")
                    gr.Markdown("1. Use microphone (click the mic icon) - records perfectly every time!")
                    gr.Markdown("2. Speak clearly for 6-10 seconds")
                    gr.Markdown("3. No background noise - one speaker only")
                    gr.Markdown("4. Match language to your voice")
                    gr.Markdown("")
                    gr.Markdown("### Performance Notes")
                    gr.Markdown("- First synthesis: 2-5 min (downloads 4GB model)")
                    gr.Markdown("- After that: 10-30 seconds")
                    gr.Markdown("- Enable GPU in Space settings for faster results")
                    gr.Markdown("")
                    gr.Markdown("### Supported Languages")
                    gr.Markdown("English, Spanish, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Japanese")

            def update_char(t):
                # Live "used/limit" character counter shown under the text box.
                return f"{len(t) if t else 0}/{MAX_TEXT_LENGTH}"

            def update_dur(t, s):
                # Live estimate of the spoken duration for the current text/speed.
                return f"Est. {estimate_duration(t, s)}" if t else "Est. 0s"

            text_input.change(update_char, [text_input], [char_count])
            text_input.change(update_dur, [text_input, speed], [duration_est])
            speed.change(update_dur, [text_input, speed], [duration_est])
            gen_btn.click(synthesize, [text_input, ref_audio, language, speed], [audio_output])
            clear_btn.click(lambda: ("", None, "en", 1.0), None, [text_input, ref_audio, language, speed])
            # Show the "Cache cleared" message first, then the refreshed status.
            clear_cache_btn.click(clear_cache, None, [status_text]).then(get_status, None, [status_text])

        # TAB 2: Batch Synthesis
        with gr.Tab("Batch (Long Text)"):
            gr.Markdown("### For texts over 2000 characters")
            gr.Markdown("Automatically splits into chunks and combines audio.")
            with gr.Row():
                with gr.Column(scale=1):
                    long_text = gr.Textbox(label="Long Text", lines=12, max_length=10000, placeholder="Paste long text here (up to 10000 characters)...")
                    chunk_slider = gr.Slider(300, 800, value=500, step=50, label="Chunk Size (characters)")
                    batch_audio = gr.Audio(label="Reference Voice", type="filepath", sources=["upload", "microphone"])
                    with gr.Row():
                        batch_lang = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="en", label="Language")
                        batch_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Speed")
                    batch_btn = gr.Button("Generate Long Speech", variant="primary", size="lg")
                with gr.Column(scale=1):
                    batch_output = gr.Audio(label="Generated Speech (Combined)", type="filepath")
                    gr.Markdown("### Batch Mode Info")
                    gr.Markdown("- Splits at sentence boundaries (., !, ?)")
                    gr.Markdown("- Synthesizes each chunk separately")
                    gr.Markdown("- Combines all chunks into one file")
                    gr.Markdown("- Best for audiobooks, presentations, long narrations")
            batch_btn.click(batch_synthesize, [long_text, batch_audio, batch_lang, batch_speed, chunk_slider], [batch_output])

        # TAB 3: Help
        with gr.Tab("Help & Troubleshooting"):
            gr.Markdown("# Help Guide")
            gr.Markdown("")
            gr.Markdown("## Quick Start")
            gr.Markdown("")
            gr.Markdown("1. Record your voice (click microphone icon, say 6-10 seconds)")
            gr.Markdown("2. Type text you want to synthesize")
            gr.Markdown("3. Click Generate - works in 10-30 seconds")
            gr.Markdown("")
            gr.Markdown("## Troubleshooting")
            gr.Markdown("")
            gr.Markdown("### Voice extraction fails")
            gr.Markdown("- Use microphone recording - this works 100% of the time")
            gr.Markdown("- Uploaded files must be short (6-10 seconds), WAV format, clean speech")
            gr.Markdown("")
            gr.Markdown("### First use is slow")
            gr.Markdown("- Normal! Downloads 4GB model (2-5 minutes)")
            gr.Markdown("- Subsequent uses are fast (10-30 seconds)")
            gr.Markdown("- Enable GPU in Space settings for faster performance")
            gr.Markdown("")
            gr.Markdown("### Improve quality")
            gr.Markdown("- Use 8-10 second recording")
            gr.Markdown("- No background noise")
            gr.Markdown("- One speaker only")
            gr.Markdown("- Speak clearly at normal pace")
            gr.Markdown("")
            gr.Markdown("## Converting your existing audio")
            gr.Markdown("")
            # The fenced example must live in ONE Markdown component: a fence
            # opened in one component cannot be closed by another one.
            gr.Markdown(
                "```bash\n"
                "# Extract best 10 seconds from long audio\n"
                "ffmpeg -i long_audio.mp3 -t 10 -ac 1 -ar 22050 short.wav\n"
                "# Remove silence\n"
                "ffmpeg -i input.wav -af silenceremove=1:0:-50dB output.wav\n"
                "```"
            )
            gr.Markdown("")
            gr.Markdown("## Need more help?")
            gr.Markdown("Check container logs in Space Settings.")
if __name__ == "__main__":
    # The PORT environment variable overrides the default port 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port, show_error=True)