# Hugging Face Spaces app: Complete Voice Cloning Studio
# (Chatterbox TTS + Whisper transcription, Gradio UI)
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import tempfile | |
| import os | |
| import logging | |
# --- Logging --------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Device selection -----------------------------------------------------
# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
    logger.info("๐ Running on CUDA GPU")
else:
    DEVICE = "cpu"
    logger.info("๐ Running on CPU")
print(f"๐ Running on device: {DEVICE}")

# --- Lazily-populated model handles ---------------------------------------
# Filled in by load_chatterbox_models(); None until the first successful load.
ENGLISH_MODEL = None
MULTILINGUAL_MODEL = None
def load_chatterbox_models():
    """Load both Chatterbox TTS models into the module-level globals.

    Populates ENGLISH_MODEL and MULTILINGUAL_MODEL on success.

    Returns:
        bool: True when both models loaded, False on any failure.  The
        failure traceback is logged (it was previously discarded, which
        made load failures hard to diagnose).
    """
    global ENGLISH_MODEL, MULTILINGUAL_MODEL
    try:
        # Imported lazily so the app can still start (and show a status
        # banner) when the chatterbox package is not installed.
        from chatterbox import ChatterboxTTS
        from chatterbox.tts import ChatterboxMultilingualTTS
        print("๐ Loading Chatterbox models...")
        ENGLISH_MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
        MULTILINGUAL_MODEL = ChatterboxMultilingualTTS.from_pretrained(device=DEVICE)
        print("โ Models loaded successfully!")
        return True
    except Exception as e:
        # Keep the full traceback in the log; the bare message alone is
        # not enough to debug import/download/device errors.
        logging.getLogger(__name__).exception("Chatterbox model loading failed")
        print(f"โ Failed to load Chatterbox models: {e}")
        return False
# Cached Whisper model so transcription does not reload "base" from disk on
# every request -- loading the checkpoint dominates per-call latency.
_WHISPER_MODEL = None


def _extract_text_from_audio(input_audio):
    """Transcribe *input_audio* with Whisper; fall back to stub text on failure.

    The transcription is stripped because Whisper tends to emit a leading
    space.  Best-effort by design: a missing/broken whisper install must not
    break the demo.
    """
    global _WHISPER_MODEL
    try:
        import whisper
        print("๐ค Transcribing input audio...")
        if _WHISPER_MODEL is None:  # load once, reuse across calls
            _WHISPER_MODEL = whisper.load_model("base")
        result = _WHISPER_MODEL.transcribe(input_audio)
        extracted_text = result["text"].strip()
        print(f"๐ Extracted text: {extracted_text}")
        return extracted_text
    except Exception as e:
        print(f"โ ๏ธ Whisper failed: {e}")
        return "Voice cloning demonstration using uploaded audio content."


def voice_to_voice_cloning(reference_audio, input_audio, language="en", exaggeration=0.5, cfg=0.5):
    """
    ๐ค VOICE-TO-VOICE CLONING FUNCTION
    Takes input audio content and transforms it using reference voice

    Args:
        reference_audio: file path of the voice to clone (gr.Audio filepath).
        input_audio: file path of the audio whose spoken content is reused.
        language: output language code; "en" routes to the English-only
            model, anything else to the multilingual model.
        exaggeration: emotion exaggeration value forwarded to Chatterbox.
        cfg: CFG/guidance value forwarded to Chatterbox.

    Returns:
        (output_path, status_message) on success, (None, error_message)
        otherwise -- matching the two Gradio outputs it is wired to.
    """
    try:
        # Guard clauses: both audio inputs are mandatory.
        if not reference_audio:
            return None, "โ Please upload reference audio (voice to clone)!"
        if not input_audio:
            return None, "โ Please upload input audio (content to transform)!"
        print("๐ Starting Voice-to-Voice cloning...")
        # Step 1: extract the spoken content from the input audio.
        extracted_text = _extract_text_from_audio(input_audio)
        # Step 2: make sure the TTS models are available (lazy first load).
        if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
            if not load_chatterbox_models():
                return None, "โ Chatterbox models failed to load!"
        # Step 3: synthesize the content in the reference voice.
        print("๐ญ Generating cloned voice...")
        # Reserve a unique output path; delete=False keeps the file around
        # for Gradio to serve after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        if language == "en":
            model = ENGLISH_MODEL
            wav = model.generate(
                extracted_text,
                audio_prompt_path=reference_audio,
                exaggeration=exaggeration,
                cfg=cfg
            )
        else:
            model = MULTILINGUAL_MODEL
            wav = model.generate(
                extracted_text,
                audio_prompt_path=reference_audio,
                language_id=language,
                exaggeration=exaggeration,
                cfg=cfg
            )
        # Step 4: persist the waveform and sanity-check the result.
        torchaudio.save(output_path, wav.cpu(), model.sr)
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            return output_path, f"โ Voice-to-Voice Cloning Complete!\n๐ค Transformed audio content: '{extracted_text[:100]}...'\n๐๏ธ Settings: Emotion={exaggeration}, CFG={cfg}\n๐ Language: {language}"
        else:
            return None, "โ Generated audio file is empty!"
    except Exception as e:
        return None, f"โ Voice-to-Voice cloning error: {str(e)}"
def text_to_voice_cloning(reference_audio, input_text, language="en", exaggeration=0.5, cfg=0.5):
    """Generate speech from *input_text* in the voice of *reference_audio*.

    Returns (output_path, status_message) on success and
    (None, error_message) on any failure, matching the Gradio outputs.
    """
    try:
        # Guard clauses: a reference voice and non-blank text are required.
        if not reference_audio:
            return None, "โ Please upload reference audio!"
        if not input_text or not input_text.strip():
            return None, "โ Please enter text to convert!"

        print("๐ Starting Text-to-Voice cloning...")
        print(f"๐ Text to convert: {input_text}")

        # Bring the TTS models up lazily on first use.
        if ENGLISH_MODEL is None or MULTILINGUAL_MODEL is None:
            if not load_chatterbox_models():
                return None, "โ Chatterbox models failed to load!"

        print("๐ญ Generating speech...")
        # Reserve a unique output path that outlives this function.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            output_path = handle.name

        # Build the generation arguments once; only the multilingual model
        # needs an explicit language id.
        generate_kwargs = {
            "audio_prompt_path": reference_audio,
            "exaggeration": exaggeration,
            "cfg": cfg,
        }
        if language == "en":
            model = ENGLISH_MODEL
        else:
            model = MULTILINGUAL_MODEL
            generate_kwargs["language_id"] = language
        wav = model.generate(input_text, **generate_kwargs)

        # Persist the waveform and verify something was actually written.
        torchaudio.save(output_path, wav.cpu(), model.sr)
        if not (os.path.exists(output_path) and os.path.getsize(output_path) > 0):
            return None, "โ Generated audio file is empty!"
        return output_path, f"โ Text-to-Voice Complete!\n๐ Generated speech: '{input_text[:100]}...'\n๐๏ธ Settings: Emotion={exaggeration}, CFG={cfg}\n๐ Language: {language}"
    except Exception as e:
        return None, f"โ Text-to-Voice error: {str(e)}"
# Eagerly try to bring the models up at import time so the UI can show an
# accurate status banner; any failure defers loading to first use instead.
try:
    models_loaded = load_chatterbox_models()
except Exception as e:
    models_loaded = False
    startup_message = f"โ ๏ธ Model loading will be attempted on first use: {str(e)}"
else:
    if models_loaded:
        startup_message = "โ Chatterbox Models Ready!"
    else:
        startup_message = "โ ๏ธ Models will load on first use"
# Create Gradio interface with tabs
# NOTE(review): this whole block runs at import time; `startup_message` and
# both cloning functions must already be defined above it.
with gr.Blocks(
    title="๐ญ Complete Voice Cloning Studio",
    theme=gr.themes.Soft(primary_hue="purple", secondary_hue="pink")
) as demo:
    # Header
    gr.HTML("""
        <div style="text-align: center; padding: 20px;">
            <h1 style="color: #8B5CF6; margin-bottom: 10px;">๐ญ Complete Voice Cloning Studio</h1>
            <p style="color: #666; font-size: 18px;">Voice-to-Voice & Text-to-Speech with Chatterbox AI</p>
            <p style="color: #888; font-size: 14px;">Both functionalities included - Choose your input method below</p>
        </div>
    """)
    # Model Status (startup_message is computed once at import time)
    gr.HTML(f"""
        <div style="text-align: center; padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 20px;">
            <strong>๐ค Chatterbox Status:</strong> {startup_message}
        </div>
    """)
    # Reference Voice (shared across both tabs)
    gr.HTML("<h3 style='color: #8B5CF6; text-align: center;'>๐ค Reference Voice (Voice to Clone)</h3>")
    reference_audio = gr.Audio(
        label="Upload Reference Audio (5+ seconds of clear speech)",
        # filepath -> handlers receive a temp-file path (str), not raw samples
        type="filepath",
        sources=["upload", "microphone"]
    )
    gr.HTML("<p style='color: #666; text-align: center; margin-bottom: 20px;'>๐ This voice will be cloned and applied to your content</p>")
    # Tabs for different input methods
    with gr.Tabs():
        # TAB 1: VOICE-TO-VOICE CLONING
        with gr.TabItem("๐ต Voice-to-Voice Cloning"):
            gr.HTML("""
                <div style="padding: 15px; background: #f0f8ff; border-radius: 10px; margin-bottom: 15px;">
                    <h4 style="color: #4169E1; margin-bottom: 10px;">๐ค Voice-to-Voice Process:</h4>
                    <p style="margin: 0;">1. Upload reference voice (person to clone)<br>
                    2. Upload input audio (content to transform)<br>
                    3. AI extracts speech content from input<br>
                    4. Reference voice applied to extracted content</p>
                </div>
            """)
            with gr.Row():
                # Left column: inputs and generation settings
                with gr.Column():
                    input_audio = gr.Audio(
                        label="Input Audio (Content to Transform)",
                        type="filepath",
                        sources=["upload", "microphone"]
                    )
                    with gr.Row():
                        voice_language = gr.Dropdown(
                            choices=[
                                ("๐บ๐ธ English", "en"),
                                ("๐ช๐ธ Spanish", "es"),
                                ("๐ซ๐ท French", "fr"),
                                ("๐ฉ๐ช German", "de"),
                                ("๐ฎ๐น Italian", "it"),
                                ("๐ง๐ท Portuguese", "pt"),
                                ("๐จ๐ณ Chinese", "zh"),
                                ("๐ฏ๐ต Japanese", "ja"),
                                ("๐ฐ๐ท Korean", "ko"),
                                ("๐ท๐บ Russian", "ru")
                            ],
                            value="en",
                            label="Output Language"
                        )
                        voice_exaggeration = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            value=0.5,
                            label="๐ญ Emotion Exaggeration"
                        )
                        voice_cfg = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            step=0.1,
                            value=0.5,
                            label="๐๏ธ CFG Scale (Accuracy)"
                        )
                    voice_clone_btn = gr.Button(
                        "๐ค Transform Voice (Audio โ Cloned Audio)",
                        variant="primary",
                        size="lg"
                    )
                # Right column: generated audio + status report
                with gr.Column():
                    voice_output_audio = gr.Audio(
                        label="Voice-to-Voice Result",
                        type="filepath"
                    )
                    voice_status = gr.Textbox(
                        label="Voice-to-Voice Status",
                        lines=6,
                        interactive=False
                    )
        # TAB 2: TEXT-TO-VOICE CLONING
        with gr.TabItem("๐ Text-to-Speech Cloning"):
            gr.HTML("""
                <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
                    <h4 style="color: #228B22; margin-bottom: 10px;">๐ Text-to-Speech Process:</h4>
                    <p style="margin: 0;">1. Upload reference voice (person to clone)<br>
                    2. Enter text to convert to speech<br>
                    3. AI generates speech in cloned voice<br>
                    4. Download high-quality audio result</p>
                </div>
            """)
            with gr.Row():
                # Left column: text input and generation settings
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Text to Convert to Speech",
                        placeholder="Enter the text you want to speak in the cloned voice...",
                        lines=5,
                        max_lines=8
                    )
                    with gr.Row():
                        # NOTE(review): this tab offers fewer languages than
                        # tab 1 (no Korean/Russian) -- presumably intentional;
                        # confirm against the multilingual model's coverage.
                        text_language = gr.Dropdown(
                            choices=[
                                ("๐บ๐ธ English", "en"),
                                ("๐ช๐ธ Spanish", "es"),
                                ("๐ซ๐ท French", "fr"),
                                ("๐ฉ๐ช German", "de"),
                                ("๐ฎ๐น Italian", "it"),
                                ("๐ง๐ท Portuguese", "pt"),
                                ("๐จ๐ณ Chinese", "zh"),
                                ("๐ฏ๐ต Japanese", "ja")
                            ],
                            value="en",
                            label="Speech Language"
                        )
                        text_exaggeration = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            value=0.5,
                            label="๐ญ Emotion Exaggeration"
                        )
                        text_cfg = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            step=0.1,
                            value=0.5,
                            label="๐๏ธ CFG Scale (Accuracy)"
                        )
                    text_clone_btn = gr.Button(
                        "๐ Generate Speech (Text โ Cloned Audio)",
                        variant="secondary",
                        size="lg"
                    )
                # Right column: generated audio + status report
                with gr.Column():
                    text_output_audio = gr.Audio(
                        label="Text-to-Speech Result",
                        type="filepath"
                    )
                    text_status = gr.Textbox(
                        label="Text-to-Speech Status",
                        lines=6,
                        interactive=False
                    )
    # Examples Section (feeds the text tab's input only)
    with gr.Accordion("๐ก Example Texts", open=False):
        examples = [
            "Hello, this is a demonstration of AI voice cloning technology using Chatterbox.",
            "The weather is beautiful today, perfect for a walk in the park with friends.",
            "Artificial intelligence is revolutionizing the way we create and share content.",
            "This advanced voice cloning system can generate natural speech in multiple languages."
        ]
        gr.Examples(
            examples=examples,
            inputs=text_input,
            label="Click to use these example texts:"
        )
    # Event Handlers - BOTH FUNCTIONS CONNECTED
    # Each click returns (audio_path_or_None, status_string), matching the
    # two outputs listed below.
    voice_clone_btn.click(
        fn=voice_to_voice_cloning,
        inputs=[reference_audio, input_audio, voice_language, voice_exaggeration, voice_cfg],
        outputs=[voice_output_audio, voice_status],
        show_progress=True
    )
    text_clone_btn.click(
        fn=text_to_voice_cloning,
        inputs=[reference_audio, text_input, text_language, text_exaggeration, text_cfg],
        outputs=[text_output_audio, text_status],
        show_progress=True
    )
# Start the Gradio server only when this file is executed directly.
if __name__ == "__main__":
    # Bind on all interfaces on the standard Spaces port; no public share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)