Spaces:

andevs
/

ttsmakerbe

Sleeping

App Files Files Community

ttsmakerbe / app.py

andevs

Update app.py

b02eaf8 verified about 2 months ago

Raw

History Blame Contribute Delete

19.4 kB

	# ============================================================
	# Auto-accept Coqui license - MUST BE FIRST
	# ============================================================
	import builtins

	# Keep a handle on the real input() so unrelated prompts still reach it.
	_original_input = builtins.input


	def _patched_input(prompt=""):
	    """Answer license/confirmation prompts with "y"; defer the rest to input().

	    Args:
	        prompt: Prompt the caller wants shown. Defaults to "" so that a bare
	            ``input()`` call keeps working, as it does with the built-in.

	    Returns:
	        "y" when the lower-cased prompt contains one of the trigger
	        substrings, otherwise whatever the original ``input`` returns.
	    """
	    # input() accepts any object as its prompt, so normalise before matching.
	    prompt_text = str(prompt)
	    lowered = prompt_text.lower()
	    triggers = ('license', 'agree', 'confirm', 'cpml', 'coqui', 'y/n', '>')
	    if any(trigger in lowered for trigger in triggers):
	        print(f"[AUTO-ACCEPT] {prompt_text.strip()}")
	        return "y"
	    return _original_input(prompt)


	builtins.input = _patched_input

	import gradio as gr
	import torch
	import tempfile
	import os
	import time
	import subprocess
	import numpy as np
	import re
	import hashlib
	from pathlib import Path

	# ============================================================
	# CONFIGURATION
	# ============================================================
	# Upper bound on the characters accepted by a single synthesis request.
	MAX_TEXT_LENGTH = 2000
	# Language codes offered in the UI language dropdowns.
	SUPPORTED_LANGUAGES = [
	    "en", "es", "fr", "de", "it", "pt", "pl",
	    "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja",
	]

	# Maps a reference-file hash to the path of its already converted audio.
	_audio_cache = {}
	# Lazily created TTS instance and the message of a failed load (None until set).
	_tts = _model_error = None
	# Flags recording whether a model load is in progress / has completed.
	_model_loading = _model_loaded = False

	# ============================================================
	# AUDIO PROCESSING FUNCTIONS (Optimized)
	# ============================================================

	def get_file_hash(file_path):
	    """Return a short content fingerprint of a file, used as a cache key.

	    The whole file is read and hashed with MD5; only the first 16 hex
	    characters of the digest are returned.
	    """
	    with open(file_path, 'rb') as handle:
	        contents = handle.read()
	    digest = hashlib.md5(contents).hexdigest()
	    return digest[:16]

	def check_audio_duration(file_path):
	    """Return the duration of an audio file in seconds, as reported by ffmpeg.

	    ``ffmpeg -i <file>`` is run without an output target and the
	    ``Duration: HH:MM:SS.xx`` line it prints on stderr is parsed.

	    Args:
	        file_path: Path of the audio file to inspect.

	    Returns:
	        The duration in seconds, or 0 when ffmpeg cannot be run or prints
	        no duration line.
	    """
	    try:
	        # capture_output=True already pipes stderr; passing stderr= as well
	        # makes subprocess.run raise ValueError, which previously turned
	        # every call into a silent 0.
	        result = subprocess.run(
	            ['ffmpeg', '-i', file_path],
	            capture_output=True,
	            text=True,
	            errors='replace',  # odd bytes in file metadata must not abort parsing
	        )
	    except (OSError, ValueError, TypeError, subprocess.SubprocessError):
	        # ffmpeg is missing or the path is unusable (e.g. None): unknown duration.
	        return 0

	    match = re.search(r'Duration: (\d{2}):(\d{2}):(\d{2}\.\d{2})', result.stderr)
	    if not match:
	        return 0
	    hours, minutes, seconds = match.groups()
	    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)

	def optimize_audio_for_xtts(input_path):
	    """Convert a reference clip to mono 22050 Hz WAV for XTTS.

	    Results are remembered in ``_audio_cache`` under the file's content
	    hash, so an identical upload is only converted again once its
	    converted file no longer exists.

	    Args:
	        input_path: Path of the audio file to convert.

	    Returns:
	        Path of the converted WAV file. It is created with ``tempfile``
	        and is not deleted by this function.

	    Raises:
	        subprocess.CalledProcessError: If the plain fallback conversion
	            exits with a non-zero status.
	    """
	    file_hash = get_file_hash(input_path)
	    cached_path = _audio_cache.get(file_hash)
	    if cached_path and os.path.exists(cached_path):
	        return cached_path

	    # Only the name is needed: close the handle right away so no descriptor
	    # stays open while ffmpeg overwrites the file.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	        output_path = tmp.name

	    # Optimized ffmpeg command for XTTS
	    filtered_cmd = [
	        'ffmpeg', '-i', input_path,
	        '-ac', '1',  # mono
	        '-ar', '22050',  # 22050 Hz
	        '-af', 'loudnorm=I=-16:LRA=11:TP=-1.5, silenceremove=1:0:-50dB, highpass=f=80, lowpass=f=8000',
	        '-acodec', 'pcm_s16le',
	        '-y', output_path
	    ]

	    try:
	        subprocess.run(filtered_cmd, capture_output=True, check=True, timeout=30)
	    except (OSError, subprocess.SubprocessError):
	        # Best effort: the filter chain failed or timed out, so fall
	        # through to the plain conversion below.
	        pass
	    else:
	        # A tiny output is treated as a failed conversion.
	        if os.path.exists(output_path) and os.path.getsize(output_path) > 10000:
	            _audio_cache[file_hash] = output_path
	            return output_path

	    # Fallback: simpler conversion without any filters
	    plain_cmd = ['ffmpeg', '-i', input_path, '-ac', '1', '-ar', '22050', '-y', output_path]
	    subprocess.run(plain_cmd, capture_output=True, check=True)
	    _audio_cache[file_hash] = output_path
	    return output_path

	def extract_best_segment(input_path, target_duration=10):
	    """Cut long reference audio down to a short clip, then optimise it.

	    Clips no longer than ``target_duration + 2`` seconds go straight to
	    ``optimize_audio_for_xtts``. Longer ones are trimmed to the first
	    ``target_duration`` seconds after leading silence is removed; when that
	    fails or yields less than 5 seconds, the first ``target_duration``
	    seconds of the raw input are used instead.

	    Args:
	        input_path: Path of the source audio file.
	        target_duration: Length in seconds of the clip to keep.

	    Returns:
	        Path of the optimised WAV file.

	    Raises:
	        subprocess.CalledProcessError: If the fallback cut exits non-zero.
	    """
	    duration = check_audio_duration(input_path)
	    if duration <= target_duration + 2:
	        return optimize_audio_for_xtts(input_path)

	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	        segment_path = tmp.name

	    result_path = None
	    try:
	        # Extract the first clear speech segment
	        trim_cmd = [
	            'ffmpeg', '-i', input_path,
	            '-af', f'silenceremove=1:0:-50dB, atrim=0:{target_duration}',
	            '-ac', '1', '-ar', '22050',
	            '-y', segment_path
	        ]
	        try:
	            subprocess.run(trim_cmd, capture_output=True, check=True, timeout=30)
	            trimmed_ok = check_audio_duration(segment_path) >= 5
	        except (OSError, subprocess.SubprocessError):
	            trimmed_ok = False

	        if not trimmed_ok:
	            # Fallback: take the first target_duration seconds unfiltered
	            head_cmd = [
	                'ffmpeg', '-i', input_path, '-t', str(target_duration),
	                '-ac', '1', '-ar', '22050', '-y', segment_path
	            ]
	            subprocess.run(head_cmd, capture_output=True, check=True)

	        result_path = optimize_audio_for_xtts(segment_path)
	        return result_path
	    finally:
	        # The intermediate clip only feeds the optimiser, which writes its
	        # own output file; remove it so it does not pile up in the temp dir.
	        if result_path != segment_path:
	            try:
	                os.unlink(segment_path)
	            except OSError:
	                pass

	# ============================================================
	# MODEL LOADING (Optimized for speed)
	# ============================================================

	def is_model_downloaded():
	    """Report whether the XTTS v2 model path exists under ~/.local/share/tts."""
	    cache_location = "~/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2"
	    expanded = os.path.expanduser(cache_location)
	    return os.path.exists(expanded)

	def get_tts():
	    """Return the shared XTTS model, loading it on the first call.

	    Raises:
	        Exception: With the remembered message if an earlier load failed,
	            with "Model is loading, please wait..." while a load is in
	            progress, or with the text of the error the load itself hit.
	    """
	    global _tts, _model_loading, _model_error, _model_loaded

	    # Fast path: the model is already in memory.
	    if _tts is not None:
	        return _tts
	    # A previous failure is remembered and reported again.
	    if _model_error is not None:
	        raise Exception(_model_error)
	    if _model_loading:
	        raise Exception("Model is loading, please wait...")

	    try:
	        _model_loading = True
	        # Deferred import: TTS is only imported once a load is attempted.
	        from TTS.api import TTS

	        print("Loading XTTS model...")
	        _tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=torch.cuda.is_available())

	        if not torch.cuda.is_available():
	            print("Model loaded on CPU - slower")
	        else:
	            _tts.to("cuda")
	            print("Model loaded on GPU - FAST")

	        _model_loading, _model_loaded = False, True
	        return _tts
	    except Exception as load_failure:
	        failure_text = str(load_failure)
	        _model_error = failure_text
	        _model_loading = False
	        raise Exception(failure_text)

	# ============================================================
	# SYNTHESIS FUNCTIONS (Optimized)
	# ============================================================

	def synthesize(text, reference_audio, language="en", speed=1.0, progress=gr.Progress()):
	    """Speak ``text`` in the voice of ``reference_audio`` and return a WAV path.

	    Args:
	        text: Text to speak; must be non-blank and at most
	            ``MAX_TEXT_LENGTH`` characters.
	        reference_audio: File path of the voice sample.
	        language: Language code passed to the model.
	        speed: Speed value passed to the model.
	        progress: Gradio progress tracker updated along the way.

	    Returns:
	        Path of the generated WAV file.

	    Raises:
	        gr.Error: When the input is invalid or synthesis fails.
	    """
	    progress(0, desc="Validating...")

	    if not text or not text.strip():
	        raise gr.Error("Please enter text to synthesize")
	    if len(text) > MAX_TEXT_LENGTH:
	        raise gr.Error(f"Text exceeds {MAX_TEXT_LENGTH} characters")
	    if reference_audio is None:
	        raise gr.Error("Please upload or record a reference audio")

	    progress(0.05, desc="Analyzing audio...")
	    duration = check_audio_duration(reference_audio)

	    # check_audio_duration reports 0 when it cannot measure the clip, so
	    # only a positive reading counts as "too short".
	    if 0 < duration < 3:
	        raise gr.Error(f"Audio too short ({duration:.1f}s). Need 6-10 seconds")

	    if duration > 30:
	        gr.Warning(f"Long audio ({duration:.1f}s). Extracting best 10-second segment for better quality")

	    progress(0.1, desc="Optimizing audio...")

	    # Long clips are cut down first; short ones are only converted.
	    if duration > 15:
	        processed_audio = extract_best_segment(reference_audio, target_duration=10)
	    else:
	        processed_audio = optimize_audio_for_xtts(reference_audio)

	    output_path = None
	    finished = False
	    try:
	        progress(0.2, desc="Loading model...")
	        tts = get_tts()

	        progress(0.4, desc="Synthesizing...")
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	            output_path = tmp.name

	        try:
	            tts.tts_to_file(
	                text=text.strip(),
	                speaker_wav=processed_audio,
	                language=language,
	                file_path=output_path,
	                speed=speed,
	            )
	        except Exception as e:
	            error = str(e)
	            # "list index out of range" contains this substring as well.
	            if "index out of range" in error:
	                raise gr.Error(
	                    "Voice extraction failed.\n\n"
	                    "SOLUTION: Use microphone recording:\n"
	                    "1. Click the microphone icon\n"
	                    "2. Record 6-10 seconds clearly\n"
	                    "3. Say: 'Hello, this is my test voice'\n"
	                    "4. This works every time!\n\n"
	                    "For uploaded files: Keep them short (6-10 seconds) and use WAV format."
	                ) from e
	            raise gr.Error(f"Error: {error[:200]}") from e

	        finished = True
	        progress(1.0, desc="Complete!")
	        return output_path
	    finally:
	        # Clean up on success and on failure alike: the converted reference
	        # is no longer needed, and a half-written output is useless.
	        leftovers = []
	        if processed_audio != reference_audio:
	            leftovers.append(processed_audio)
	        if not finished and output_path is not None:
	            leftovers.append(output_path)
	        for path in leftovers:
	            try:
	                os.unlink(path)
	            except OSError:
	                pass

	# ============================================================
	# BATCH SYNTHESIS
	# ============================================================

	def _split_text_into_chunks(text, chunk_size):
	    """Group whole sentences into chunks of at most ``chunk_size`` characters.

	    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace.
	    A single sentence longer than ``chunk_size`` is kept intact as its own
	    chunk.
	    """
	    chunks = []
	    current = ""
	    for sentence in re.split(r'(?<=[.!?])\s+', text):
	        if len(current) + len(sentence) + 1 <= chunk_size:
	            current = f"{current} {sentence}" if current else sentence
	        else:
	            if current:
	                chunks.append(current)
	            current = sentence
	    if current:
	        chunks.append(current)
	    return chunks


	def batch_synthesize(text, audio, language, speed, chunk_size=500, progress=gr.Progress()):
	    """Synthesize long text chunk by chunk and join the audio into one WAV.

	    Text of at most ``MAX_TEXT_LENGTH`` characters is handed to
	    ``synthesize`` unchanged.

	    Args:
	        text: Text to speak.
	        audio: File path of the reference voice sample.
	        language: Language code passed to the model.
	        speed: Speed value passed to the model.
	        chunk_size: Target maximum number of characters per chunk.
	        progress: Gradio progress tracker updated along the way.

	    Returns:
	        Path of the combined WAV file.

	    Raises:
	        gr.Error: When the input is missing or the chunks cannot be combined.
	    """
	    if not text or not text.strip():
	        raise gr.Error("Please enter text")

	    if len(text) <= MAX_TEXT_LENGTH:
	        return synthesize(text, audio, language, speed, progress)

	    # synthesize() validates the reference itself; the chunked path has to
	    # as well, otherwise a missing upload surfaces as an opaque TypeError.
	    if audio is None:
	        raise gr.Error("Please upload or record a reference audio")

	    progress(0.05, desc="Splitting text...")
	    chunks = _split_text_into_chunks(text, chunk_size)
	    progress(0.1, desc=f"Processing {len(chunks)} chunks...")

	    # Process the reference audio once and reuse it for every chunk.
	    duration = check_audio_duration(audio)
	    if duration > 15:
	        processed_audio = extract_best_segment(audio, target_duration=10)
	    else:
	        processed_audio = optimize_audio_for_xtts(audio)

	    chunk_files = []
	    try:
	        tts = get_tts()

	        for i, chunk in enumerate(chunks):
	            prog = 0.1 + (0.7 * (i + 1) / len(chunks))
	            progress(prog, desc=f"Chunk {i+1}/{len(chunks)}...")

	            with tempfile.NamedTemporaryFile(suffix=f"_{i}.wav", delete=False) as tmp:
	                chunk_path = tmp.name
	            # Register before synthesising so a failure still cleans it up.
	            chunk_files.append(chunk_path)
	            tts.tts_to_file(
	                text=chunk.strip(),
	                speaker_wav=processed_audio,
	                language=language,
	                file_path=chunk_path,
	                speed=speed,
	            )

	        progress(0.85, desc="Combining audio...")

	        try:
	            from pydub import AudioSegment
	            combined = AudioSegment.empty()
	            for path in chunk_files:
	                combined += AudioSegment.from_wav(path)

	            with tempfile.NamedTemporaryFile(suffix="_combined.wav", delete=False) as tmp:
	                output = tmp.name
	            combined.export(output, format="wav")
	        except Exception as e:
	            raise gr.Error(f"Failed to combine: {str(e)[:100]}") from e

	        progress(1.0, desc="Complete!")
	        return output
	    finally:
	        # Per-chunk files and the converted reference are intermediates:
	        # remove them whether or not the run succeeded.
	        leftovers = list(chunk_files)
	        if processed_audio != audio:
	            leftovers.append(processed_audio)
	        for path in leftovers:
	            try:
	                os.unlink(path)
	            except OSError:
	                pass

	# ============================================================
	# UTILITIES
	# ============================================================

	def clear_cache():
	    """Drop the loaded model so the next request loads it again.

	    Besides releasing the model (and emptying the CUDA cache when a GPU is
	    available), the remembered load error is reset: ``get_tts`` re-raises a
	    stored error on every call, so without the reset a failed load could
	    never be retried.

	    Returns:
	        The status message "Cache cleared".
	    """
	    global _tts, _model_error, _model_loaded
	    _tts = None
	    _model_error = None
	    _model_loaded = False
	    if torch.cuda.is_available():
	        torch.cuda.empty_cache()
	    return "Cache cleared"

	def get_status():
	    """Summarise the model state as a short label for the UI."""
	    if _tts is None:
	        # Nothing loaded yet: report loading, a stored error, or idle.
	        if _model_loading:
	            return "Loading model..."
	        if _model_error:
	            return "Error"
	        return "Ready - model loads on first use"
	    if torch.cuda.is_available():
	        return "Model ready (GPU)"
	    return "Model ready (CPU)"

	def estimate_duration(text, speed=1.0):
	    """Estimate speaking time for ``text`` as a label such as "1m 40s".

	    The estimate assumes 160 characters per second, scaled by ``speed``.
	    Empty or missing text yields "0s".
	    """
	    if not text:
	        return "0s"
	    rate = 160 * speed
	    total_seconds = len(text) / rate
	    whole_minutes, remainder = divmod(total_seconds, 60)
	    minutes, seconds = int(whole_minutes), int(remainder)
	    if minutes > 0:
	        return f"{minutes}m {seconds}s"
	    return f"{seconds}s"

	# ============================================================
	# UI - NO TRIPLE QUOTES
	# ============================================================

	with gr.Blocks(title="VoxForge TTS", theme=gr.themes.Soft()) as demo:
	    # NOTE(review): this section arrived with its indentation flattened; the
	    # nesting below is reconstructed from the component order. Confirm the
	    # intended layout, in particular which column the tip texts belong to.

	    gr.Markdown("# 🎙️ VoxForge TTS - Professional Voice Cloning")
	    gr.Markdown("Upload any voice or use microphone to record 6-10 seconds. First use downloads model (2-5 min).")

	    with gr.Tabs():

	        # TAB 1: Standard Synthesis
	        with gr.Tab("Standard Synthesis"):
	            with gr.Row():
	                with gr.Column(scale=1):
	                    # The limit comes from MAX_TEXT_LENGTH so the textbox,
	                    # the counter and synthesize() cannot drift apart.
	                    text_input = gr.Textbox(
	                        label="Text to Synthesize",
	                        lines=6,
	                        max_length=MAX_TEXT_LENGTH,
	                        placeholder=f"Enter text up to {MAX_TEXT_LENGTH} characters... Example: Hello, this is my cloned voice.",
	                    )

	                    with gr.Row():
	                        char_count = gr.Label(f"0/{MAX_TEXT_LENGTH}")
	                        duration_est = gr.Label("Est. 0s")

	                    ref_audio = gr.Audio(
	                        label="Reference Voice (6-10 seconds - Click microphone to record!)",
	                        type="filepath",
	                        sources=["upload", "microphone"],
	                    )

	                    with gr.Row():
	                        language = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="en", label="Language")
	                        speed = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Speed")

	                    with gr.Row():
	                        gen_btn = gr.Button("Generate Speech", variant="primary", size="lg")
	                        clear_btn = gr.Button("Clear", variant="secondary", size="lg")

	                    with gr.Accordion("Advanced Options", open=False):
	                        status_text = gr.Label(get_status())
	                        clear_cache_btn = gr.Button("Clear Model Cache", size="sm")

	                with gr.Column(scale=1):
	                    audio_output = gr.Audio(label="Generated Speech", type="filepath")

	                    gr.Markdown("### Tips for Best Results")
	                    gr.Markdown("1. Use microphone (click the mic icon) - records perfectly every time!")
	                    gr.Markdown("2. Speak clearly for 6-10 seconds")
	                    gr.Markdown("3. No background noise - one speaker only")
	                    gr.Markdown("4. Match language to your voice")
	                    gr.Markdown("")
	                    gr.Markdown("### Performance Notes")
	                    gr.Markdown("- First synthesis: 2-5 min (downloads 4GB model)")
	                    gr.Markdown("- After that: 10-30 seconds")
	                    gr.Markdown("- Enable GPU in Space settings for faster results")
	                    gr.Markdown("")
	                    gr.Markdown("### Supported Languages")
	                    gr.Markdown("English, Spanish, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Japanese")

	            def update_char(t):
	                """Format the live character counter shown under the textbox."""
	                return f"{len(t) if t else 0}/{MAX_TEXT_LENGTH}"

	            def update_dur(t, s):
	                """Format the estimated speaking time for the current text and speed."""
	                return f"Est. {estimate_duration(t, s)}" if t else "Est. 0s"

	            text_input.change(update_char, [text_input], [char_count])
	            text_input.change(update_dur, [text_input, speed], [duration_est])
	            speed.change(update_dur, [text_input, speed], [duration_est])

	            gen_btn.click(synthesize, [text_input, ref_audio, language, speed], [audio_output])
	            clear_btn.click(lambda: ("", None, "en", 1.0), None, [text_input, ref_audio, language, speed])
	            # Show the "Cache cleared" message first, then the refreshed status.
	            clear_cache_btn.click(clear_cache, None, [status_text]).then(get_status, None, [status_text])

	        # TAB 2: Batch Synthesis
	        with gr.Tab("Batch (Long Text)"):
	            gr.Markdown(f"### For texts over {MAX_TEXT_LENGTH} characters")
	            gr.Markdown("Automatically splits into chunks and combines audio.")

	            with gr.Row():
	                with gr.Column(scale=1):
	                    long_text = gr.Textbox(
	                        label="Long Text",
	                        lines=12,
	                        max_length=10000,
	                        placeholder="Paste long text here (up to 10000 characters)...",
	                    )
	                    chunk_slider = gr.Slider(300, 800, value=500, step=50, label="Chunk Size (characters)")
	                    batch_audio = gr.Audio(label="Reference Voice", type="filepath", sources=["upload", "microphone"])
	                    with gr.Row():
	                        batch_lang = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="en", label="Language")
	                        batch_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Speed")
	                    batch_btn = gr.Button("Generate Long Speech", variant="primary", size="lg")
	                with gr.Column(scale=1):
	                    batch_output = gr.Audio(label="Generated Speech (Combined)", type="filepath")

	                    gr.Markdown("### Batch Mode Info")
	                    gr.Markdown("- Splits at sentence boundaries (., !, ?)")
	                    gr.Markdown("- Synthesizes each chunk separately")
	                    gr.Markdown("- Combines all chunks into one file")
	                    gr.Markdown("- Best for audiobooks, presentations, long narrations")

	            batch_btn.click(batch_synthesize, [long_text, batch_audio, batch_lang, batch_speed, chunk_slider], [batch_output])

	        # TAB 3: Help
	        with gr.Tab("Help & Troubleshooting"):
	            gr.Markdown("# Help Guide")
	            gr.Markdown("")
	            gr.Markdown("## Quick Start")
	            gr.Markdown("")
	            gr.Markdown("1. Record your voice (click microphone icon, say 6-10 seconds)")
	            gr.Markdown("2. Type text you want to synthesize")
	            gr.Markdown("3. Click Generate - works in 10-30 seconds")
	            gr.Markdown("")
	            gr.Markdown("## Troubleshooting")
	            gr.Markdown("")
	            gr.Markdown("### Voice extraction fails")
	            gr.Markdown("- Use microphone recording - this works 100% of the time")
	            gr.Markdown("- Uploaded files must be short (6-10 seconds), WAV format, clean speech")
	            gr.Markdown("")
	            gr.Markdown("### First use is slow")
	            gr.Markdown("- Normal! Downloads 4GB model (2-5 minutes)")
	            gr.Markdown("- Subsequent uses are fast (10-30 seconds)")
	            gr.Markdown("- Enable GPU in Space settings for faster performance")
	            gr.Markdown("")
	            gr.Markdown("### Improve quality")
	            gr.Markdown("- Use 8-10 second recording")
	            gr.Markdown("- No background noise")
	            gr.Markdown("- One speaker only")
	            gr.Markdown("- Speak clearly at normal pace")
	            gr.Markdown("")
	            gr.Markdown("## Converting your existing audio")
	            gr.Markdown("")
	            # The fenced block must live in ONE Markdown component: split
	            # across several, the fence never closes and the "# ..." lines
	            # are rendered as top-level headings instead of shell comments.
	            gr.Markdown(
	                "```bash\n"
	                "# Extract best 10 seconds from long audio\n"
	                "ffmpeg -i long_audio.mp3 -t 10 -ac 1 -ar 22050 short.wav\n"
	                "# Remove silence\n"
	                "ffmpeg -i input.wav -af silenceremove=1:0:-50dB output.wav\n"
	                "```"
	            )
	            gr.Markdown("")
	            gr.Markdown("## Need more help?")
	            gr.Markdown("Check container logs in Space Settings.")

	if __name__ == "__main__":
	    # Use the PORT environment variable when set, otherwise port 7860.
	    listen_port = int(os.environ.get("PORT", 7860))
	    demo.launch(server_name="0.0.0.0", server_port=listen_port, show_error=True)