# Audio_Voice_Cloning / 11Labs_V01.py
# (uploaded by anuj6316 — Hugging Face Space file, commit e964f21)
import base64
import json
import os
import re
import shutil
import tempfile
from pathlib import Path

import gradio as gr
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from google import genai
# Load environment variables from a local .env file (if present) so the
# API keys read below can be supplied without exporting them in the shell.
load_dotenv()
class AudioVoiceCloner:
    """Clone a speaker's voice from an audio sample and re-synthesize speech.

    Uses the Gemini API for transcription and voice-characteristic analysis,
    and the ElevenLabs API for similar-voice lookup and text-to-speech.
    """

    def __init__(self):
        """Read API keys from the environment and build both API clients.

        Raises:
            ValueError: if ELEVENLABS_API_KEY or GOOGLE_API_KEY is not set.
        """
        # Get API keys from environment variables for security.
        self.elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
        self.google_api_key = os.getenv("GOOGLE_API_KEY")
        if not self.elevenlabs_api_key:
            raise ValueError("Please set ELEVENLABS_API_KEY environment variable")
        if not self.google_api_key:
            raise ValueError("Please set GOOGLE_API_KEY environment variable")
        self.elevenlabs_client = ElevenLabs(api_key=self.elevenlabs_api_key)
        self.gemini_client = genai.Client(api_key=self.google_api_key)

    def _cleanup_gemini_file(self, uploaded_file):
        """Best-effort deletion of a file previously uploaded to Gemini."""
        try:
            # google-genai's files.delete() takes the file name keyword-only;
            # a positional call raises TypeError (previously swallowed by a
            # bare except, silently leaking remote files).
            self.gemini_client.files.delete(name=uploaded_file.name)
        except Exception:
            pass  # Deliberate best-effort: a leaked remote file is non-fatal.

    def transcribe_audio(self, audio_file_path):
        """Transcribe audio using Gemini API - ONLY the spoken content.

        Args:
            audio_file_path: Path to a local audio file.

        Returns:
            Tuple ``(transcription, status_message)``; ``transcription`` is
            the empty string on failure.
        """
        try:
            # Upload file to Gemini.
            uploaded_file = self.gemini_client.files.upload(file=audio_file_path)
            # Generate transcription - ONLY the content, no analysis.
            response = self.gemini_client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    'Listen to the audio carefully and transcribe ONLY the spoken words. '
                    'Focus on the main speaker and ignore background noise or music. '
                    'Do NOT include any analysis, commentary, or descriptions about the voice, tone, or pace. '
                    'Do NOT mention speaking characteristics like "fast", "slow", "emphasis", etc. '
                    'Provide ONLY the exact words that were spoken, nothing else. '
                    'If parts are unclear, indicate [unclear] but do not make up content.',
                    uploaded_file
                ]
            )
            # Clean up the uploaded file (best effort).
            self._cleanup_gemini_file(uploaded_file)
            return response.text.strip(), "Transcription completed successfully!"
        except Exception as e:
            return "", f"Error during transcription: {str(e)}"

    def analyze_voice_characteristics(self, audio_file_path):
        """Analyze voice characteristics in the background for speech tuning.

        Not used for transcription; the reply is expected to be JSON (possibly
        wrapped in a markdown fence) describing pace, pauses, tone, etc.

        Returns:
            The raw analysis text, or the string ``"{}"`` on any error.
        """
        try:
            # Upload file to Gemini.
            uploaded_file = self.gemini_client.files.upload(file=audio_file_path)
            # Generate voice characteristics analysis for internal use only.
            response = self.gemini_client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    'Analyze the speaking characteristics of this audio for voice synthesis purposes. '
                    'I need technical data about the voice patterns, NOT transcription. '
                    'Focus on analyzing:\n'
                    '1. Speaking pace and rhythm patterns\n'
                    '2. Pause locations and durations\n'
                    '3. Emphasis and stress patterns\n'
                    '4. Breathing patterns\n'
                    '5. Tone variations\n'
                    '6. Pitch patterns\n\n'
                    'Provide analysis in JSON format:\n'
                    '{\n'
                    '  "pace": "fast/normal/slow with specific sections",\n'
                    '  "pause_pattern": "description of pause locations and timing",\n'
                    '  "emphasis_style": "description of how emphasis is applied",\n'
                    '  "breathing_pattern": "natural breathing breaks detected",\n'
                    '  "tone_characteristics": "description of voice tone and variations",\n'
                    '  "pitch_pattern": "description of pitch variations",\n'
                    '  "speech_timing_markers": "SSML-compatible timing instructions"\n'
                    '}\n\n'
                    'This analysis will be used to configure voice synthesis parameters, not for transcription.',
                    uploaded_file
                ]
            )
            self._cleanup_gemini_file(uploaded_file)
            return response.text.strip()
        except Exception:
            return "{}"  # Return empty JSON on error.

    @staticmethod
    def _parse_characteristics_json(voice_characteristics):
        """Parse a Gemini voice-analysis reply into a dict.

        Gemini frequently wraps JSON replies in a markdown code fence
        (```json ... ```); the fence is stripped before parsing. Returns {}
        when the text is empty, not valid JSON, or not a JSON object.
        """
        if not voice_characteristics:
            return {}
        raw = voice_characteristics.strip()
        if raw.startswith("```"):
            raw = re.sub(r"^```[a-zA-Z]*\s*|\s*```$", "", raw)
        try:
            data = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return {}
        return data if isinstance(data, dict) else {}

    def create_natural_speech_with_characteristics(self, original_text, voice_characteristics):
        """Apply voice characteristics to create natural-sounding speech.

        Args:
            original_text: The plain transcription to enhance.
            voice_characteristics: JSON text from analyze_voice_characteristics.

        Returns:
            Tuple ``(ssml_text, characteristics_dict)``. Falls back to the
            original text on any processing failure.
        """
        try:
            char_data = self._parse_characteristics_json(voice_characteristics)
            enhanced_text = original_text
            # Add natural pauses at sentence boundaries and logical breaks.
            if "pause_pattern" in char_data:
                enhanced_text = enhanced_text.replace(". ", ". <break time='0.5s'/>")
                enhanced_text = enhanced_text.replace("? ", "? <break time='0.5s'/>")
                enhanced_text = enhanced_text.replace("! ", "! <break time='0.5s'/>")
                enhanced_text = enhanced_text.replace(", ", ", <break time='0.2s'/>")
            # Apply pace characteristics.
            if "pace" in char_data:
                pace_info = char_data["pace"].lower()
                if "slow" in pace_info:
                    enhanced_text = f'<prosody rate="slow">{enhanced_text}</prosody>'
                elif "fast" in pace_info:
                    enhanced_text = f'<prosody rate="fast">{enhanced_text}</prosody>'
            # Wrap in SSML speak tags for ElevenLabs.
            enhanced_text = f'<speak>{enhanced_text}</speak>'
            return enhanced_text, char_data
        except Exception:
            # If processing fails, return original text.
            return original_text, {}

    def clone_voice_with_natural_characteristics(self, input_audio_path, text_to_speak, voice_analysis=None):
        """Clone voice and apply natural speech characteristics in background.

        Args:
            input_audio_path: Path to the reference audio sample.
            text_to_speak: Text to synthesize in the cloned voice.
            voice_analysis: Optional JSON analysis used to enhance the text.

        Returns:
            Tuple ``(output_audio_path, status_message)``; the path is None
            on failure.
        """
        try:
            # Find a similar pre-made voice from the uploaded audio.
            with open(input_audio_path, 'rb') as audio_file:
                similar_voices_response = self.elevenlabs_client.voices.find_similar_voices(
                    audio_file=audio_file
                )
            if not similar_voices_response or not getattr(similar_voices_response, 'voices', None):
                return None, "No similar voice found. Please try with a clearer audio sample."
            # Get the first (most similar) voice.
            voice_id = similar_voices_response.voices[0].voice_id
            # Apply voice characteristics to the text if analysis is provided.
            if voice_analysis:
                enhanced_text, _ = self.create_natural_speech_with_characteristics(
                    text_to_speak, voice_analysis
                )
            else:
                enhanced_text = text_to_speak
            try:
                # Try with SSML for better natural speech. The generator is
                # consumed here so streaming failures also trigger the fallback.
                audio_generator = self.elevenlabs_client.text_to_speech.convert(
                    voice_id=voice_id,
                    text=enhanced_text,
                    model_id="eleven_multilingual_v2",  # Better model for SSML support
                    voice_settings={
                        "stability": 0.75,
                        "similarity_boost": 0.8,
                        "style": 0.1,  # Add slight style variation
                        "use_speaker_boost": True
                    }
                )
                audio_bytes = b"".join(audio_generator)
            except Exception:
                # Fallback: strip SSML tags and use basic generation.
                clean_text = re.sub(r'<[^>]+>', '', enhanced_text)
                audio_generator = self.elevenlabs_client.text_to_speech.convert(
                    voice_id=voice_id,
                    text=clean_text,
                    voice_settings={
                        "stability": 0.75,
                        "similarity_boost": 0.85,
                        "style": 0.1,
                        "use_speaker_boost": True
                    }
                )
                audio_bytes = b"".join(audio_generator)
            # tempfile.mktemp() is deprecated and race-prone; use a named
            # temporary file kept on disk for Gradio to serve.
            # NOTE(review): ElevenLabs convert() returns MP3 by default despite
            # the .wav suffix — confirm the output_format if playback breaks.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as out_file:
                out_file.write(audio_bytes)
                output_path = out_file.name
            return output_path, "Voice cloned with natural speech characteristics!"
        except Exception as e:
            return None, f"Error during voice cloning: {str(e)}"

    def complete_voice_reproduction(self, input_audio_path):
        """Complete workflow: transcribe content and reproduce it in the same voice.

        Returns:
            Tuple ``(output_audio_path, transcription, status_message)``.
        """
        try:
            # Step 1: Get the spoken content (transcription).
            transcription, transcribe_msg = self.transcribe_audio(input_audio_path)
            if not transcription:
                return None, "", f"Transcription failed: {transcribe_msg}"
            # Step 2: Analyze voice characteristics in background.
            voice_analysis = self.analyze_voice_characteristics(input_audio_path)
            # Step 3: Clone voice and generate speech with natural characteristics.
            output_audio, clone_msg = self.clone_voice_with_natural_characteristics(
                input_audio_path, transcription, voice_analysis
            )
            if output_audio:
                return output_audio, transcription, "βœ… SUCCESS! Voice cloned with natural speech patterns preserved."
            else:
                return None, transcription, f"❌ Voice cloning failed: {clone_msg}"
        except Exception as e:
            return None, "", f"Unexpected error: {str(e)}"

    def save_audio_from_base64(self, audio_base64, filename):
        """Save base64-encoded audio data to *filename*.

        Returns:
            True on success, False on any decode/write error.
        """
        try:
            audio_bytes = base64.b64decode(audio_base64)
            with open(filename, 'wb') as f:
                f.write(audio_bytes)
            return True
        except Exception as e:
            print(f"Error saving audio: {e}")
            return False

    def clone_voice_and_generate(self, input_audio_path, text_to_speak):
        """Clone voice from input audio and speak *text_to_speak* with it.

        Returns:
            Tuple ``(output_audio_path, status_message)``.
        """
        try:
            # Analyze voice characteristics for better synthesis.
            voice_analysis = self.analyze_voice_characteristics(input_audio_path)
            # Clone voice with natural characteristics.
            output_audio, message = self.clone_voice_with_natural_characteristics(
                input_audio_path, text_to_speak, voice_analysis
            )
            return output_audio, message
        except Exception as e:
            return None, f"Unexpected error: {str(e)}"
def create_enhanced_voice_cloning_interface():
    """Create enhanced Gradio interface with transcription and voice cloning.

    Returns:
        A ``gr.Blocks`` app. When the required API keys are missing, a minimal
        error-message interface is returned instead of the full UI.
    """
    # Initialize the voice cloner. (The previous version also defined an
    # unused error callback that captured the exception variable; Python
    # unbinds that variable when the except block exits, so calling it would
    # have raised NameError — it has been removed.)
    try:
        cloner = AudioVoiceCloner()
    except ValueError:
        # Create a simple error interface if API keys are missing.
        with gr.Blocks(title="Enhanced Audio Voice Cloning") as demo:
            gr.Markdown("# Enhanced Audio Voice Cloning Tool")
            gr.Markdown("⚠️ **Error**: Please set your API keys in environment variables")
            gr.Markdown("Required environment variables:")
            gr.Markdown("- `ELEVENLABS_API_KEY` from [ElevenLabs Dashboard](https://elevenlabs.io/)")
            gr.Markdown("- `GOOGLE_API_KEY` from [Google AI Studio](https://aistudio.google.com/)")
        return demo

    def transcribe_only(input_audio):
        """Transcribe audio only - just the spoken content."""
        if input_audio is None:
            return "", "Please upload an audio file."
        try:
            transcription, message = cloner.transcribe_audio(input_audio)
            return transcription, message
        except Exception as e:
            return "", f"Unexpected error: {str(e)}"

    def process_audio_with_transcription(input_audio):
        """Auto-transcribe and populate text field."""
        if input_audio is None:
            return "", "Please upload an audio file."
        try:
            transcription, _ = cloner.transcribe_audio(input_audio)
            return transcription, "Auto-transcription completed! You can edit the text if needed."
        except Exception as e:
            return "", f"Error in auto-transcription: {str(e)}"

    def process_complete_reproduction(input_audio):
        """Complete workflow: transcribe and reproduce with the same voice."""
        if input_audio is None:
            return None, "", "Please upload an audio file."
        try:
            output_audio, transcription, message = cloner.complete_voice_reproduction(input_audio)
            return output_audio, transcription, message
        except Exception as e:
            return None, "", f"Unexpected error: {str(e)}"

    def process_voice_cloning(input_audio, text_input):
        """Generate a cloned voice for user-supplied text."""
        if input_audio is None:
            return None, "Please upload an audio file."
        if not text_input or text_input.strip() == "":
            return None, "Please enter text to be spoken."
        try:
            # Process the audio with voice characteristic analysis.
            output_audio, message = cloner.clone_voice_and_generate(input_audio, text_input.strip())
            return output_audio, message
        except Exception as e:
            return None, f"Unexpected error: {str(e)}"

    # Create the Gradio interface.
    with gr.Blocks(title="Enhanced Audio Voice Cloning", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# πŸŽ™οΈ Enhanced Audio Voice Cloning Tool")
        gr.Markdown("Upload an audio file to clone the voice and generate natural-sounding speech using AI.")

        with gr.Tabs():
            # Tab 1: One-Click Reproduction
            with gr.TabItem("🎯 Reproduce Original Audio"):
                gr.Markdown("### Upload your audio and get AI reproduction with same voice and natural speech patterns!")
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### πŸ“€ Original Audio")
                        input_audio_reproduce = gr.Audio(
                            label="Upload Your Audio File",
                            type="filepath"
                        )
                        reproduce_btn = gr.Button("πŸ”„ Clone Voice & Reproduce Content", variant="primary", size="lg")
                        gr.Markdown("**What this does:**")
                        gr.Markdown("β€’ 🎭 Clones your voice characteristics")
                        gr.Markdown("β€’ πŸ“ Extracts the spoken content")
                        gr.Markdown("β€’ 🎀 Reproduces with natural speech patterns")
                        gr.Markdown("β€’ ⏱️ Applies natural timing and pauses")
                    with gr.Column(scale=1):
                        gr.Markdown("### πŸ“₯ AI Reproduction")
                        output_audio_reproduce = gr.Audio(
                            label="🎡 Cloned Voice Output",
                            type="filepath"
                        )
                        transcribed_content = gr.Textbox(
                            label="πŸ“ Extracted Content:",
                            lines=4,
                            interactive=False,
                            placeholder="The spoken content will appear here..."
                        )
                        status_reproduce = gr.Textbox(
                            label="πŸ”„ Status",
                            lines=3,
                            interactive=False,
                            placeholder="Upload audio and click 'Clone Voice & Reproduce Content' to start..."
                        )

            # Tab 2: Step-by-Step Workflow
            with gr.TabItem("πŸ”„ Step-by-Step Workflow"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Input")
                        input_audio_main = gr.Audio(
                            label="Upload Audio File (WAV, MP3, etc.)",
                            type="filepath"
                        )
                        auto_transcribe_btn = gr.Button("πŸ“ Step 1: Extract Text Content", variant="secondary")
                        text_input_main = gr.Textbox(
                            label="Text Content (you can edit this)",
                            placeholder="Upload audio and click 'Step 1: Extract Text Content' first...",
                            lines=5
                        )
                        clone_btn_main = gr.Button("🎯 Step 2: Clone Voice & Generate", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        gr.Markdown("### Output")
                        output_audio_main = gr.Audio(
                            label="Generated Audio (Cloned Voice)",
                            type="filepath"
                        )
                        status_message_main = gr.Textbox(
                            label="Status",
                            interactive=False,
                            placeholder="Status messages will appear here..."
                        )

            # Tab 3: Text Extraction Only
            with gr.TabItem("πŸ“ Text Extraction Only"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Input")
                        input_audio_transcribe = gr.Audio(
                            label="Upload Audio File for Text Extraction",
                            type="filepath"
                        )
                        transcribe_btn = gr.Button("πŸ“ Extract Text Content", variant="primary")
                    with gr.Column(scale=1):
                        gr.Markdown("### Output")
                        transcription_output = gr.Textbox(
                            label="Extracted Text",
                            lines=10,
                            placeholder="Extracted text will appear here..."
                        )
                        transcribe_status = gr.Textbox(
                            label="Status",
                            interactive=False,
                            placeholder="Status messages will appear here..."
                        )

            # Tab 4: Voice Cloning Only
            with gr.TabItem("🎯 Voice Cloning Only"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Input")
                        input_audio_clone = gr.Audio(
                            label="Upload Audio File (WAV, MP3, etc.)",
                            type="filepath"
                        )
                        text_input_clone = gr.Textbox(
                            label="Text to Speak",
                            placeholder="Enter the text you want to be spoken in the cloned voice...",
                            lines=3,
                            value="Hello, this is a test of the voice cloning system. The AI will speak this text using the characteristics of the uploaded audio."
                        )
                        clone_btn_only = gr.Button("🎯 Clone Voice & Generate", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        gr.Markdown("### Output")
                        output_audio_clone = gr.Audio(
                            label="Generated Audio (Cloned Voice)",
                            type="filepath"
                        )
                        status_message_clone = gr.Textbox(
                            label="Status",
                            interactive=False,
                            placeholder="Status messages will appear here..."
                        )

        # Event handling: wire each button to its callback.
        reproduce_btn.click(
            fn=process_complete_reproduction,
            inputs=[input_audio_reproduce],
            outputs=[output_audio_reproduce, transcribed_content, status_reproduce]
        )
        auto_transcribe_btn.click(
            fn=process_audio_with_transcription,
            inputs=[input_audio_main],
            outputs=[text_input_main, status_message_main]
        )
        clone_btn_main.click(
            fn=process_voice_cloning,
            inputs=[input_audio_main, text_input_main],
            outputs=[output_audio_main, status_message_main]
        )
        transcribe_btn.click(
            fn=transcribe_only,
            inputs=[input_audio_transcribe],
            outputs=[transcription_output, transcribe_status]
        )
        clone_btn_only.click(
            fn=process_voice_cloning,
            inputs=[input_audio_clone, text_input_clone],
            outputs=[output_audio_clone, status_message_clone]
        )

        # Add tips section.
        gr.Markdown("### πŸ’‘ Natural Voice Cloning with Background Analysis")
        gr.Markdown("""
        **🎯 "Reproduce Original Audio" Tab - Main Feature:**
        - βœ… Simply upload your audio file
        - βœ… AI extracts ONLY the spoken content (no analysis descriptions)
        - βœ… Voice characteristics are analyzed in the background for natural speech
        - βœ… Generated audio sounds natural with proper timing and tone
        **Quality Tips for Best Results:**
        - 🎀 Use clear audio with minimal background noise
        - ⏱️ 15-60 seconds of natural speech works best
        - πŸ—£οΈ Conversational tone works better than robotic reading
        - 🎡 Avoid music or overlapping voices
        """)
        gr.Markdown("### πŸ”§ How It Works")
        gr.Markdown("""
        **Behind the Scenes:**
        1. **Content Extraction:** AI extracts only the spoken words (no descriptions)
        2. **Voice Analysis:** AI analyzes pace, tone, pauses in the background
        3. **Natural Synthesis:** Voice characteristics are applied to create natural-sounding speech
        4. **Clean Output:** You get natural-sounding cloned voice without technical descriptions
        """)

    return demo
# Create and launch the interface
if __name__ == "__main__":
    print("πŸŽ™οΈ Enhanced Audio Voice Cloning Tool")
    print("=" * 50)

    # Collect every required API key that is absent from the environment.
    required_keys = ("ELEVENLABS_API_KEY", "GOOGLE_API_KEY")
    missing_keys = [key for key in required_keys if not os.getenv(key)]

    if missing_keys:
        print("⚠️ WARNING: Missing environment variables!")
        for key in missing_keys:
            print(f" {key} not set")
        print("\nPlease set your API keys:")
        print(" export ELEVENLABS_API_KEY='your-elevenlabs-key-here'")
        print(" export GOOGLE_API_KEY='your-google-key-here'")
        print("Or create a .env file with:")
        print(" ELEVENLABS_API_KEY=your-elevenlabs-key-here")
        print(" GOOGLE_API_KEY=your-google-key-here")
        print()

    # Build the Gradio app and serve it.
    demo = create_enhanced_voice_cloning_interface()
    demo.launch(
        share=True,  # Set to False if you don't want to create a public link
        debug=True,
        server_name="0.0.0.0",  # Allow access from other devices on network
        server_port=7860
    )