# Enhanced Audio Voice Cloning Tool — Gradio app (Hugging Face Space).
# Standard library
import base64
import json
import os
import shutil
import tempfile
from pathlib import Path

# Third-party
import gradio as gr
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from google import genai

# Load environment variables from a local .env file, if present.
load_dotenv()
class AudioVoiceCloner:
    """Clone a speaker's voice and reproduce spoken content.

    Uses the Gemini API for transcription and voice-characteristic analysis,
    and the ElevenLabs API for voice matching and text-to-speech synthesis.
    """

    def __init__(self):
        # Get API keys from environment variables for security.
        self.elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
        self.google_api_key = os.getenv("GOOGLE_API_KEY")
        if not self.elevenlabs_api_key:
            raise ValueError("Please set ELEVENLABS_API_KEY environment variable")
        if not self.google_api_key:
            raise ValueError("Please set GOOGLE_API_KEY environment variable")
        self.elevenlabs_client = ElevenLabs(api_key=self.elevenlabs_api_key)
        self.gemini_client = genai.Client(api_key=self.google_api_key)

    def _delete_gemini_file(self, uploaded_file):
        """Best-effort cleanup of a file previously uploaded to Gemini."""
        try:
            # NOTE: files.delete takes the resource name as a keyword-only
            # argument; the original positional call raised TypeError, which the
            # bare except swallowed, so uploads were never actually deleted.
            self.gemini_client.files.delete(name=uploaded_file.name)
        except Exception:
            pass  # If deletion fails, continue anyway.

    def transcribe_audio(self, audio_file_path):
        """Transcribe audio using Gemini API - ONLY the spoken content.

        Args:
            audio_file_path: Path to the audio file to transcribe.

        Returns:
            tuple[str, str]: (transcription, status message); transcription is
            "" on any failure.
        """
        try:
            # Upload file to Gemini.
            uploaded_file = self.gemini_client.files.upload(file=audio_file_path)
            # Generate transcription - ONLY the content, no analysis.
            response = self.gemini_client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    'Listen to the audio carefully and transcribe ONLY the spoken words. '
                    'Focus on the main speaker and ignore background noise or music. '
                    'Do NOT include any analysis, commentary, or descriptions about the voice, tone, or pace. '
                    'Do NOT mention speaking characteristics like "fast", "slow", "emphasis", etc. '
                    'Provide ONLY the exact words that were spoken, nothing else. '
                    'If parts are unclear, indicate [unclear] but do not make up content.',
                    uploaded_file
                ]
            )
            # Clean up the uploaded file (best effort).
            self._delete_gemini_file(uploaded_file)
            return response.text.strip(), "Transcription completed successfully!"
        except Exception as e:
            return "", f"Error during transcription: {str(e)}"

    def analyze_voice_characteristics(self, audio_file_path):
        """Analyze voice characteristics in the background for speech tuning.

        This is NOT transcription; the result feeds
        create_natural_speech_with_characteristics().

        Returns:
            str: Model output expected to be JSON; "{}" on any failure.
        """
        try:
            # Upload file to Gemini.
            uploaded_file = self.gemini_client.files.upload(file=audio_file_path)
            # Generate voice characteristics analysis for internal use only.
            response = self.gemini_client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    'Analyze the speaking characteristics of this audio for voice synthesis purposes. '
                    'I need technical data about the voice patterns, NOT transcription. '
                    'Focus on analyzing:\n'
                    '1. Speaking pace and rhythm patterns\n'
                    '2. Pause locations and durations\n'
                    '3. Emphasis and stress patterns\n'
                    '4. Breathing patterns\n'
                    '5. Tone variations\n'
                    '6. Pitch patterns\n\n'
                    'Provide analysis in JSON format:\n'
                    '{\n'
                    ' "pace": "fast/normal/slow with specific sections",\n'
                    ' "pause_pattern": "description of pause locations and timing",\n'
                    ' "emphasis_style": "description of how emphasis is applied",\n'
                    ' "breathing_pattern": "natural breathing breaks detected",\n'
                    ' "tone_characteristics": "description of voice tone and variations",\n'
                    ' "pitch_pattern": "description of pitch variations",\n'
                    ' "speech_timing_markers": "SSML-compatible timing instructions"\n'
                    '}\n\n'
                    'This analysis will be used to configure voice synthesis parameters, not for transcription.',
                    uploaded_file
                ]
            )
            # Clean up the uploaded file (best effort).
            self._delete_gemini_file(uploaded_file)
            return response.text.strip()
        except Exception:
            return "{}"  # Return empty JSON on error.

    def create_natural_speech_with_characteristics(self, original_text, voice_characteristics):
        """Apply analyzed voice characteristics to build SSML for natural speech.

        Args:
            original_text: Plain transcription text.
            voice_characteristics: JSON string from analyze_voice_characteristics().

        Returns:
            tuple[str, dict]: (SSML-enhanced text, parsed characteristics);
            falls back to (original_text, {}) if processing fails.
        """
        try:
            # Parse voice characteristics; the model may return invalid JSON.
            try:
                char_data = json.loads(voice_characteristics)
            except (TypeError, ValueError):
                char_data = {}
            enhanced_text = original_text
            # Insert natural pauses at sentence boundaries and logical breaks.
            if "pause_pattern" in char_data:
                enhanced_text = enhanced_text.replace(". ", ". <break time='0.5s'/>")
                enhanced_text = enhanced_text.replace("? ", "? <break time='0.5s'/>")
                enhanced_text = enhanced_text.replace("! ", "! <break time='0.5s'/>")
                enhanced_text = enhanced_text.replace(", ", ", <break time='0.2s'/>")
            # Apply pace characteristics via SSML prosody.
            if "pace" in char_data:
                pace_info = char_data["pace"].lower()
                if "slow" in pace_info:
                    enhanced_text = f'<prosody rate="slow">{enhanced_text}</prosody>'
                elif "fast" in pace_info:
                    enhanced_text = f'<prosody rate="fast">{enhanced_text}</prosody>'
            # Wrap in SSML speak tags for ElevenLabs.
            enhanced_text = f'<speak>{enhanced_text}</speak>'
            return enhanced_text, char_data
        except Exception:
            # If processing fails, return original text untouched.
            return original_text, {}

    def clone_voice_with_natural_characteristics(self, input_audio_path, text_to_speak, voice_analysis=None):
        """Clone the voice in input_audio_path and speak text_to_speak with it.

        Args:
            input_audio_path: Path to the reference audio sample.
            text_to_speak: Text to synthesize in the cloned voice.
            voice_analysis: Optional JSON string of voice characteristics; when
                given, the text is SSML-enhanced for more natural pacing.

        Returns:
            tuple[str | None, str]: (path to generated audio file or None,
            status message).
        """
        try:
            # Find the closest matching ElevenLabs voice for the uploaded sample.
            with open(input_audio_path, 'rb') as audio_file:
                similar_voices_response = self.elevenlabs_client.voices.find_similar_voices(
                    audio_file=audio_file
                )
            if not similar_voices_response or not hasattr(similar_voices_response, 'voices') or not similar_voices_response.voices:
                return None, "No similar voice found. Please try with a clearer audio sample."
            # Get the first (most similar) voice.
            voice_id = similar_voices_response.voices[0].voice_id
            # Apply voice characteristics to the text if analysis is provided.
            if voice_analysis:
                enhanced_text, _ = self.create_natural_speech_with_characteristics(text_to_speak, voice_analysis)
            else:
                enhanced_text = text_to_speak
            try:
                # Try SSML-enhanced text first for more natural speech.
                audio_generator = self.elevenlabs_client.text_to_speech.convert(
                    voice_id=voice_id,
                    text=enhanced_text,
                    model_id="eleven_multilingual_v2",  # Better model for SSML support.
                    voice_settings={
                        "stability": 0.75,
                        "similarity_boost": 0.8,
                        "style": 0.1,  # Add slight style variation.
                        "use_speaker_boost": True
                    }
                )
                # Consume the stream INSIDE the try block: convert() returns a
                # lazy generator, so API/SSML errors only surface on iteration.
                audio_bytes = b"".join(audio_generator)
            except Exception:
                # Fallback: strip SSML tags and use basic generation.
                import re
                clean_text = re.sub(r'<[^>]+>', '', enhanced_text)
                audio_generator = self.elevenlabs_client.text_to_speech.convert(
                    voice_id=voice_id,
                    text=clean_text,
                    voice_settings={
                        "stability": 0.75,
                        "similarity_boost": 0.85,
                        "style": 0.1,
                        "use_speaker_boost": True
                    }
                )
                audio_bytes = b"".join(audio_generator)
            # Save the generated audio. NamedTemporaryFile avoids the race
            # condition of the deprecated tempfile.mktemp().
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
                f.write(audio_bytes)
                output_path = f.name
            return output_path, "Voice cloned with natural speech characteristics!"
        except Exception as e:
            return None, f"Error during voice cloning: {str(e)}"

    def complete_voice_reproduction(self, input_audio_path):
        """Complete workflow: transcribe content and reproduce it in the same voice.

        Returns:
            tuple[str | None, str, str]: (output audio path, transcription,
            status message).
        """
        try:
            # Step 1: Get the spoken content (transcription).
            transcription, transcribe_msg = self.transcribe_audio(input_audio_path)
            if not transcription:
                return None, "", f"Transcription failed: {transcribe_msg}"
            # Step 2: Analyze voice characteristics in the background.
            voice_analysis = self.analyze_voice_characteristics(input_audio_path)
            # Step 3: Clone voice and generate speech with natural characteristics.
            output_audio, clone_msg = self.clone_voice_with_natural_characteristics(
                input_audio_path, transcription, voice_analysis
            )
            if output_audio:
                return output_audio, transcription, "β SUCCESS! Voice cloned with natural speech patterns preserved."
            else:
                return None, transcription, f"β Voice cloning failed: {clone_msg}"
        except Exception as e:
            return None, "", f"Unexpected error: {str(e)}"

    def save_audio_from_base64(self, audio_base64, filename):
        """Save base64 audio data to a file.

        Returns:
            bool: True on success, False on any decode/write failure.
        """
        try:
            audio_bytes = base64.b64decode(audio_base64)
            with open(filename, 'wb') as f:
                f.write(audio_bytes)
            return True
        except Exception as e:
            print(f"Error saving audio: {e}")
            return False

    def clone_voice_and_generate(self, input_audio_path, text_to_speak):
        """Clone voice from input audio and generate speech for arbitrary text.

        Returns:
            tuple[str | None, str]: (output audio path or None, status message).
        """
        try:
            # Analyze voice characteristics for better synthesis.
            voice_analysis = self.analyze_voice_characteristics(input_audio_path)
            # Clone voice with natural characteristics applied.
            output_audio, message = self.clone_voice_with_natural_characteristics(
                input_audio_path, text_to_speak, voice_analysis
            )
            return output_audio, message
        except Exception as e:
            return None, f"Unexpected error: {str(e)}"
def create_enhanced_voice_cloning_interface():
    """Create enhanced Gradio interface with transcription and voice cloning.

    Returns:
        gr.Blocks: The assembled demo; an error-only page when API keys are
        missing.
    """
    # Initialize the voice cloner.
    try:
        cloner = AudioVoiceCloner()
    except ValueError as e:
        # Capture the message immediately: the exception variable `e` is
        # unbound once this except block exits, so any closure reading it later
        # (as the old, unused error_function did) would raise NameError.
        error_message = str(e)
        # Create a simple error interface if API keys are missing.
        with gr.Blocks(title="Enhanced Audio Voice Cloning") as demo:
            gr.Markdown("# Enhanced Audio Voice Cloning Tool")
            gr.Markdown("β οΈ **Error**: Please set your API keys in environment variables")
            gr.Markdown(error_message)
            gr.Markdown("Required environment variables:")
            gr.Markdown("- `ELEVENLABS_API_KEY` from [ElevenLabs Dashboard](https://elevenlabs.io/)")
            gr.Markdown("- `GOOGLE_API_KEY` from [Google AI Studio](https://aistudio.google.com/)")
        return demo

    def transcribe_only(input_audio):
        """Transcribe audio only - just the spoken content."""
        if input_audio is None:
            return "", "Please upload an audio file."
        try:
            transcription, message = cloner.transcribe_audio(input_audio)
            return transcription, message
        except Exception as e:
            return "", f"Unexpected error: {str(e)}"

    def process_audio_with_transcription(input_audio):
        """Auto-transcribe and populate the editable text field."""
        if input_audio is None:
            return "", "Please upload an audio file."
        try:
            transcription, _ = cloner.transcribe_audio(input_audio)
            return transcription, "Auto-transcription completed! You can edit the text if needed."
        except Exception as e:
            return "", f"Error in auto-transcription: {str(e)}"

    def process_complete_reproduction(input_audio):
        """One-click workflow: transcribe and reproduce in the same voice."""
        if input_audio is None:
            return None, "", "Please upload an audio file."
        try:
            output_audio, transcription, message = cloner.complete_voice_reproduction(input_audio)
            return output_audio, transcription, message
        except Exception as e:
            return None, "", f"Unexpected error: {str(e)}"

    def process_voice_cloning(input_audio, text_input):
        """Clone the uploaded voice and speak the user-provided text."""
        if input_audio is None:
            return None, "Please upload an audio file."
        if not text_input or text_input.strip() == "":
            return None, "Please enter text to be spoken."
        try:
            # Process the audio with voice characteristic analysis.
            output_audio, message = cloner.clone_voice_and_generate(input_audio, text_input.strip())
            return output_audio, message
        except Exception as e:
            return None, f"Unexpected error: {str(e)}"

    # Create the Gradio interface.
    with gr.Blocks(title="Enhanced Audio Voice Cloning", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# ποΈ Enhanced Audio Voice Cloning Tool")
        gr.Markdown("Upload an audio file to clone the voice and generate natural-sounding speech using AI.")
        with gr.Tabs():
            # Tab 1: One-Click Reproduction
            with gr.TabItem("π― Reproduce Original Audio"):
                gr.Markdown("### Upload your audio and get AI reproduction with same voice and natural speech patterns!")
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### π€ Original Audio")
                        input_audio_reproduce = gr.Audio(
                            label="Upload Your Audio File",
                            type="filepath"
                        )
                        reproduce_btn = gr.Button("π Clone Voice & Reproduce Content", variant="primary", size="lg")
                        gr.Markdown("**What this does:**")
                        gr.Markdown("β’ π Clones your voice characteristics")
                        gr.Markdown("β’ π Extracts the spoken content")
                        gr.Markdown("β’ π€ Reproduces with natural speech patterns")
                        gr.Markdown("β’ β±οΈ Applies natural timing and pauses")
                    with gr.Column(scale=1):
                        gr.Markdown("### π₯ AI Reproduction")
                        output_audio_reproduce = gr.Audio(
                            label="π΅ Cloned Voice Output",
                            type="filepath"
                        )
                        transcribed_content = gr.Textbox(
                            label="π Extracted Content:",
                            lines=4,
                            interactive=False,
                            placeholder="The spoken content will appear here..."
                        )
                        status_reproduce = gr.Textbox(
                            label="π Status",
                            lines=3,
                            interactive=False,
                            placeholder="Upload audio and click 'Clone Voice & Reproduce Content' to start..."
                        )
            # Tab 2: Step-by-Step Workflow
            with gr.TabItem("π Step-by-Step Workflow"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Input")
                        input_audio_main = gr.Audio(
                            label="Upload Audio File (WAV, MP3, etc.)",
                            type="filepath"
                        )
                        auto_transcribe_btn = gr.Button("π Step 1: Extract Text Content", variant="secondary")
                        text_input_main = gr.Textbox(
                            label="Text Content (you can edit this)",
                            placeholder="Upload audio and click 'Step 1: Extract Text Content' first...",
                            lines=5
                        )
                        clone_btn_main = gr.Button("π― Step 2: Clone Voice & Generate", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        gr.Markdown("### Output")
                        output_audio_main = gr.Audio(
                            label="Generated Audio (Cloned Voice)",
                            type="filepath"
                        )
                        status_message_main = gr.Textbox(
                            label="Status",
                            interactive=False,
                            placeholder="Status messages will appear here..."
                        )
            # Tab 3: Text Extraction Only
            with gr.TabItem("π Text Extraction Only"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Input")
                        input_audio_transcribe = gr.Audio(
                            label="Upload Audio File for Text Extraction",
                            type="filepath"
                        )
                        transcribe_btn = gr.Button("π Extract Text Content", variant="primary")
                    with gr.Column(scale=1):
                        gr.Markdown("### Output")
                        transcription_output = gr.Textbox(
                            label="Extracted Text",
                            lines=10,
                            placeholder="Extracted text will appear here..."
                        )
                        transcribe_status = gr.Textbox(
                            label="Status",
                            interactive=False,
                            placeholder="Status messages will appear here..."
                        )
            # Tab 4: Voice Cloning Only
            with gr.TabItem("π― Voice Cloning Only"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Input")
                        input_audio_clone = gr.Audio(
                            label="Upload Audio File (WAV, MP3, etc.)",
                            type="filepath"
                        )
                        text_input_clone = gr.Textbox(
                            label="Text to Speak",
                            placeholder="Enter the text you want to be spoken in the cloned voice...",
                            lines=3,
                            value="Hello, this is a test of the voice cloning system. The AI will speak this text using the characteristics of the uploaded audio."
                        )
                        clone_btn_only = gr.Button("π― Clone Voice & Generate", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        gr.Markdown("### Output")
                        output_audio_clone = gr.Audio(
                            label="Generated Audio (Cloned Voice)",
                            type="filepath"
                        )
                        status_message_clone = gr.Textbox(
                            label="Status",
                            interactive=False,
                            placeholder="Status messages will appear here..."
                        )
        # Event handling: wire each button to its handler.
        reproduce_btn.click(
            fn=process_complete_reproduction,
            inputs=[input_audio_reproduce],
            outputs=[output_audio_reproduce, transcribed_content, status_reproduce]
        )
        auto_transcribe_btn.click(
            fn=process_audio_with_transcription,
            inputs=[input_audio_main],
            outputs=[text_input_main, status_message_main]
        )
        clone_btn_main.click(
            fn=process_voice_cloning,
            inputs=[input_audio_main, text_input_main],
            outputs=[output_audio_main, status_message_main]
        )
        transcribe_btn.click(
            fn=transcribe_only,
            inputs=[input_audio_transcribe],
            outputs=[transcription_output, transcribe_status]
        )
        clone_btn_only.click(
            fn=process_voice_cloning,
            inputs=[input_audio_clone, text_input_clone],
            outputs=[output_audio_clone, status_message_clone]
        )
        # Add tips section.
        gr.Markdown("### π‘ Natural Voice Cloning with Background Analysis")
        gr.Markdown("""
**π― "Reproduce Original Audio" Tab - Main Feature:**
- β Simply upload your audio file
- β AI extracts ONLY the spoken content (no analysis descriptions)
- β Voice characteristics are analyzed in the background for natural speech
- β Generated audio sounds natural with proper timing and tone
**Quality Tips for Best Results:**
- π€ Use clear audio with minimal background noise
- β±οΈ 15-60 seconds of natural speech works best
- π£οΈ Conversational tone works better than robotic reading
- π΅ Avoid music or overlapping voices
""")
        gr.Markdown("### π§ How It Works")
        gr.Markdown("""
**Behind the Scenes:**
1. **Content Extraction:** AI extracts only the spoken words (no descriptions)
2. **Voice Analysis:** AI analyzes pace, tone, pauses in the background
3. **Natural Synthesis:** Voice characteristics are applied to create natural-sounding speech
4. **Clean Output:** You get natural-sounding cloned voice without technical descriptions
""")
    return demo
# Create and launch the interface
if __name__ == "__main__":
    # Banner with setup guidance, printed before the UI starts.
    print("ποΈ Enhanced Audio Voice Cloning Tool")
    print("=" * 50)

    # Collect every required API key that is absent from the environment.
    missing_keys = [
        key
        for key in ("ELEVENLABS_API_KEY", "GOOGLE_API_KEY")
        if not os.getenv(key)
    ]
    if missing_keys:
        print("β οΈ WARNING: Missing environment variables!")
        for key in missing_keys:
            print(f" {key} not set")
        print("\nPlease set your API keys:")
        print(" export ELEVENLABS_API_KEY='your-elevenlabs-key-here'")
        print(" export GOOGLE_API_KEY='your-google-key-here'")
        print("Or create a .env file with:")
        print(" ELEVENLABS_API_KEY=your-elevenlabs-key-here")
        print(" GOOGLE_API_KEY=your-google-key-here")
        print()

    # Build the Gradio app and serve it.
    demo = create_enhanced_voice_cloning_interface()
    demo.launch(
        share=True,  # Set to False if you don't want to create a public link
        debug=True,
        server_name="0.0.0.0",  # Allow access from other devices on network
        server_port=7860
    )