# Enhanced Audio Voice Cloning Tool — Gradio app (Hugging Face Space).
# Standard library
import base64
import json
import os
import shutil
import tempfile
from pathlib import Path

# Third-party
import gradio as gr
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from google import genai

# Load environment variables from a local .env file, if present.
load_dotenv()
class AudioVoiceCloner:
    """Clone a speaker's voice and reproduce spoken content.

    Uses the Gemini API for transcription and voice-characteristic analysis,
    and the ElevenLabs API for voice matching and text-to-speech synthesis.
    """

    def __init__(self):
        # Get API keys from environment variables for security.
        self.elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
        self.google_api_key = os.getenv("GOOGLE_API_KEY")
        if not self.elevenlabs_api_key:
            raise ValueError("Please set ELEVENLABS_API_KEY environment variable")
        if not self.google_api_key:
            raise ValueError("Please set GOOGLE_API_KEY environment variable")
        self.elevenlabs_client = ElevenLabs(api_key=self.elevenlabs_api_key)
        self.gemini_client = genai.Client(api_key=self.google_api_key)

    def _delete_gemini_file(self, uploaded_file):
        """Best-effort cleanup of a file previously uploaded to Gemini."""
        try:
            # NOTE: files.delete takes the resource name as a keyword-only
            # argument; the original positional call raised TypeError, which the
            # bare except swallowed, so uploads were never actually deleted.
            self.gemini_client.files.delete(name=uploaded_file.name)
        except Exception:
            pass  # If deletion fails, continue anyway.

    def transcribe_audio(self, audio_file_path):
        """Transcribe audio using Gemini API - ONLY the spoken content.

        Args:
            audio_file_path: Path to the audio file to transcribe.

        Returns:
            tuple[str, str]: (transcription, status message); transcription is
            "" on any failure.
        """
        try:
            # Upload file to Gemini.
            uploaded_file = self.gemini_client.files.upload(file=audio_file_path)
            # Generate transcription - ONLY the content, no analysis.
            response = self.gemini_client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    'Listen to the audio carefully and transcribe ONLY the spoken words. '
                    'Focus on the main speaker and ignore background noise or music. '
                    'Do NOT include any analysis, commentary, or descriptions about the voice, tone, or pace. '
                    'Do NOT mention speaking characteristics like "fast", "slow", "emphasis", etc. '
                    'Provide ONLY the exact words that were spoken, nothing else. '
                    'If parts are unclear, indicate [unclear] but do not make up content.',
                    uploaded_file
                ]
            )
            # Clean up the uploaded file (best effort).
            self._delete_gemini_file(uploaded_file)
            return response.text.strip(), "Transcription completed successfully!"
        except Exception as e:
            return "", f"Error during transcription: {str(e)}"

    def analyze_voice_characteristics(self, audio_file_path):
        """Analyze voice characteristics in the background for speech tuning.

        This is NOT transcription; the result feeds
        create_natural_speech_with_characteristics().

        Returns:
            str: Model output expected to be JSON; "{}" on any failure.
        """
        try:
            # Upload file to Gemini.
            uploaded_file = self.gemini_client.files.upload(file=audio_file_path)
            # Generate voice characteristics analysis for internal use only.
            response = self.gemini_client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    'Analyze the speaking characteristics of this audio for voice synthesis purposes. '
                    'I need technical data about the voice patterns, NOT transcription. '
                    'Focus on analyzing:\n'
                    '1. Speaking pace and rhythm patterns\n'
                    '2. Pause locations and durations\n'
                    '3. Emphasis and stress patterns\n'
                    '4. Breathing patterns\n'
                    '5. Tone variations\n'
                    '6. Pitch patterns\n\n'
                    'Provide analysis in JSON format:\n'
                    '{\n'
                    ' "pace": "fast/normal/slow with specific sections",\n'
                    ' "pause_pattern": "description of pause locations and timing",\n'
                    ' "emphasis_style": "description of how emphasis is applied",\n'
                    ' "breathing_pattern": "natural breathing breaks detected",\n'
                    ' "tone_characteristics": "description of voice tone and variations",\n'
                    ' "pitch_pattern": "description of pitch variations",\n'
                    ' "speech_timing_markers": "SSML-compatible timing instructions"\n'
                    '}\n\n'
                    'This analysis will be used to configure voice synthesis parameters, not for transcription.',
                    uploaded_file
                ]
            )
            # Clean up the uploaded file (best effort).
            self._delete_gemini_file(uploaded_file)
            return response.text.strip()
        except Exception:
            return "{}"  # Return empty JSON on error.

    def create_natural_speech_with_characteristics(self, original_text, voice_characteristics):
        """Apply analyzed voice characteristics to build SSML for natural speech.

        Args:
            original_text: Plain transcription text.
            voice_characteristics: JSON string from analyze_voice_characteristics().

        Returns:
            tuple[str, dict]: (SSML-enhanced text, parsed characteristics);
            falls back to (original_text, {}) if processing fails.
        """
        try:
            # Parse voice characteristics; the model may return invalid JSON.
            try:
                char_data = json.loads(voice_characteristics)
            except (TypeError, ValueError):
                char_data = {}
            enhanced_text = original_text
            # Insert natural pauses at sentence boundaries and logical breaks.
            if "pause_pattern" in char_data:
                enhanced_text = enhanced_text.replace(". ", ". <break time='0.5s'/>")
                enhanced_text = enhanced_text.replace("? ", "? <break time='0.5s'/>")
                enhanced_text = enhanced_text.replace("! ", "! <break time='0.5s'/>")
                enhanced_text = enhanced_text.replace(", ", ", <break time='0.2s'/>")
            # Apply pace characteristics via SSML prosody.
            if "pace" in char_data:
                pace_info = char_data["pace"].lower()
                if "slow" in pace_info:
                    enhanced_text = f'<prosody rate="slow">{enhanced_text}</prosody>'
                elif "fast" in pace_info:
                    enhanced_text = f'<prosody rate="fast">{enhanced_text}</prosody>'
            # Wrap in SSML speak tags for ElevenLabs.
            enhanced_text = f'<speak>{enhanced_text}</speak>'
            return enhanced_text, char_data
        except Exception:
            # If processing fails, return original text untouched.
            return original_text, {}

    def clone_voice_with_natural_characteristics(self, input_audio_path, text_to_speak, voice_analysis=None):
        """Clone the voice in input_audio_path and speak text_to_speak with it.

        Args:
            input_audio_path: Path to the reference audio sample.
            text_to_speak: Text to synthesize in the cloned voice.
            voice_analysis: Optional JSON string of voice characteristics; when
                given, the text is SSML-enhanced for more natural pacing.

        Returns:
            tuple[str | None, str]: (path to generated audio file or None,
            status message).
        """
        try:
            # Find the closest matching ElevenLabs voice for the uploaded sample.
            with open(input_audio_path, 'rb') as audio_file:
                similar_voices_response = self.elevenlabs_client.voices.find_similar_voices(
                    audio_file=audio_file
                )
            if not similar_voices_response or not hasattr(similar_voices_response, 'voices') or not similar_voices_response.voices:
                return None, "No similar voice found. Please try with a clearer audio sample."
            # Get the first (most similar) voice.
            voice_id = similar_voices_response.voices[0].voice_id
            # Apply voice characteristics to the text if analysis is provided.
            if voice_analysis:
                enhanced_text, _ = self.create_natural_speech_with_characteristics(text_to_speak, voice_analysis)
            else:
                enhanced_text = text_to_speak
            try:
                # Try SSML-enhanced text first for more natural speech.
                audio_generator = self.elevenlabs_client.text_to_speech.convert(
                    voice_id=voice_id,
                    text=enhanced_text,
                    model_id="eleven_multilingual_v2",  # Better model for SSML support.
                    voice_settings={
                        "stability": 0.75,
                        "similarity_boost": 0.8,
                        "style": 0.1,  # Add slight style variation.
                        "use_speaker_boost": True
                    }
                )
                # Consume the stream INSIDE the try block: convert() returns a
                # lazy generator, so API/SSML errors only surface on iteration.
                audio_bytes = b"".join(audio_generator)
            except Exception:
                # Fallback: strip SSML tags and use basic generation.
                import re
                clean_text = re.sub(r'<[^>]+>', '', enhanced_text)
                audio_generator = self.elevenlabs_client.text_to_speech.convert(
                    voice_id=voice_id,
                    text=clean_text,
                    voice_settings={
                        "stability": 0.75,
                        "similarity_boost": 0.85,
                        "style": 0.1,
                        "use_speaker_boost": True
                    }
                )
                audio_bytes = b"".join(audio_generator)
            # Save the generated audio. NamedTemporaryFile avoids the race
            # condition of the deprecated tempfile.mktemp().
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
                f.write(audio_bytes)
                output_path = f.name
            return output_path, "Voice cloned with natural speech characteristics!"
        except Exception as e:
            return None, f"Error during voice cloning: {str(e)}"

    def complete_voice_reproduction(self, input_audio_path):
        """Complete workflow: transcribe content and reproduce it in the same voice.

        Returns:
            tuple[str | None, str, str]: (output audio path, transcription,
            status message).
        """
        try:
            # Step 1: Get the spoken content (transcription).
            transcription, transcribe_msg = self.transcribe_audio(input_audio_path)
            if not transcription:
                return None, "", f"Transcription failed: {transcribe_msg}"
            # Step 2: Analyze voice characteristics in the background.
            voice_analysis = self.analyze_voice_characteristics(input_audio_path)
            # Step 3: Clone voice and generate speech with natural characteristics.
            output_audio, clone_msg = self.clone_voice_with_natural_characteristics(
                input_audio_path, transcription, voice_analysis
            )
            if output_audio:
                return output_audio, transcription, "β SUCCESS! Voice cloned with natural speech patterns preserved."
            else:
                return None, transcription, f"β Voice cloning failed: {clone_msg}"
        except Exception as e:
            return None, "", f"Unexpected error: {str(e)}"

    def save_audio_from_base64(self, audio_base64, filename):
        """Save base64 audio data to a file.

        Returns:
            bool: True on success, False on any decode/write failure.
        """
        try:
            audio_bytes = base64.b64decode(audio_base64)
            with open(filename, 'wb') as f:
                f.write(audio_bytes)
            return True
        except Exception as e:
            print(f"Error saving audio: {e}")
            return False

    def clone_voice_and_generate(self, input_audio_path, text_to_speak):
        """Clone voice from input audio and generate speech for arbitrary text.

        Returns:
            tuple[str | None, str]: (output audio path or None, status message).
        """
        try:
            # Analyze voice characteristics for better synthesis.
            voice_analysis = self.analyze_voice_characteristics(input_audio_path)
            # Clone voice with natural characteristics applied.
            output_audio, message = self.clone_voice_with_natural_characteristics(
                input_audio_path, text_to_speak, voice_analysis
            )
            return output_audio, message
        except Exception as e:
            return None, f"Unexpected error: {str(e)}"
def create_enhanced_voice_cloning_interface():
    """Create enhanced Gradio interface with transcription and voice cloning.

    Returns:
        gr.Blocks: The assembled demo; an error-only page when API keys are
        missing.
    """
    # Initialize the voice cloner.
    try:
        cloner = AudioVoiceCloner()
    except ValueError as e:
        # Capture the message immediately: the exception variable `e` is
        # unbound once this except block exits, so any closure reading it later
        # (as the old, unused error_function did) would raise NameError.
        error_message = str(e)
        # Create a simple error interface if API keys are missing.
        with gr.Blocks(title="Enhanced Audio Voice Cloning") as demo:
            gr.Markdown("# Enhanced Audio Voice Cloning Tool")
            gr.Markdown("β οΈ **Error**: Please set your API keys in environment variables")
            gr.Markdown(error_message)
            gr.Markdown("Required environment variables:")
            gr.Markdown("- `ELEVENLABS_API_KEY` from [ElevenLabs Dashboard](https://elevenlabs.io/)")
            gr.Markdown("- `GOOGLE_API_KEY` from [Google AI Studio](https://aistudio.google.com/)")
        return demo

    def transcribe_only(input_audio):
        """Transcribe audio only - just the spoken content."""
        if input_audio is None:
            return "", "Please upload an audio file."
        try:
            transcription, message = cloner.transcribe_audio(input_audio)
            return transcription, message
        except Exception as e:
            return "", f"Unexpected error: {str(e)}"

    def process_audio_with_transcription(input_audio):
        """Auto-transcribe and populate the editable text field."""
        if input_audio is None:
            return "", "Please upload an audio file."
        try:
            transcription, _ = cloner.transcribe_audio(input_audio)
            return transcription, "Auto-transcription completed! You can edit the text if needed."
        except Exception as e:
            return "", f"Error in auto-transcription: {str(e)}"

    def process_complete_reproduction(input_audio):
        """One-click workflow: transcribe and reproduce in the same voice."""
        if input_audio is None:
            return None, "", "Please upload an audio file."
        try:
            output_audio, transcription, message = cloner.complete_voice_reproduction(input_audio)
            return output_audio, transcription, message
        except Exception as e:
            return None, "", f"Unexpected error: {str(e)}"

    def process_voice_cloning(input_audio, text_input):
        """Clone the uploaded voice and speak the user-provided text."""
        if input_audio is None:
            return None, "Please upload an audio file."
        if not text_input or text_input.strip() == "":
            return None, "Please enter text to be spoken."
        try:
            # Process the audio with voice characteristic analysis.
            output_audio, message = cloner.clone_voice_and_generate(input_audio, text_input.strip())
            return output_audio, message
        except Exception as e:
            return None, f"Unexpected error: {str(e)}"

    # Create the Gradio interface.
    with gr.Blocks(title="Enhanced Audio Voice Cloning", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# ποΈ Enhanced Audio Voice Cloning Tool")
        gr.Markdown("Upload an audio file to clone the voice and generate natural-sounding speech using AI.")
        with gr.Tabs():
            # Tab 1: One-Click Reproduction
            with gr.TabItem("π― Reproduce Original Audio"):
                gr.Markdown("### Upload your audio and get AI reproduction with same voice and natural speech patterns!")
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### π€ Original Audio")
                        input_audio_reproduce = gr.Audio(
                            label="Upload Your Audio File",
                            type="filepath"
                        )
                        reproduce_btn = gr.Button("π Clone Voice & Reproduce Content", variant="primary", size="lg")
                        gr.Markdown("**What this does:**")
                        gr.Markdown("β’ π Clones your voice characteristics")
                        gr.Markdown("β’ π Extracts the spoken content")
                        gr.Markdown("β’ π€ Reproduces with natural speech patterns")
                        gr.Markdown("β’ β±οΈ Applies natural timing and pauses")
                    with gr.Column(scale=1):
                        gr.Markdown("### π₯ AI Reproduction")
                        output_audio_reproduce = gr.Audio(
                            label="π΅ Cloned Voice Output",
                            type="filepath"
                        )
                        transcribed_content = gr.Textbox(
                            label="π Extracted Content:",
                            lines=4,
                            interactive=False,
                            placeholder="The spoken content will appear here..."
                        )
                        status_reproduce = gr.Textbox(
                            label="π Status",
                            lines=3,
                            interactive=False,
                            placeholder="Upload audio and click 'Clone Voice & Reproduce Content' to start..."
                        )
            # Tab 2: Step-by-Step Workflow
            with gr.TabItem("π Step-by-Step Workflow"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Input")
                        input_audio_main = gr.Audio(
                            label="Upload Audio File (WAV, MP3, etc.)",
                            type="filepath"
                        )
                        auto_transcribe_btn = gr.Button("π Step 1: Extract Text Content", variant="secondary")
                        text_input_main = gr.Textbox(
                            label="Text Content (you can edit this)",
                            placeholder="Upload audio and click 'Step 1: Extract Text Content' first...",
                            lines=5
                        )
                        clone_btn_main = gr.Button("π― Step 2: Clone Voice & Generate", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        gr.Markdown("### Output")
                        output_audio_main = gr.Audio(
                            label="Generated Audio (Cloned Voice)",
                            type="filepath"
                        )
                        status_message_main = gr.Textbox(
                            label="Status",
                            interactive=False,
                            placeholder="Status messages will appear here..."
                        )
            # Tab 3: Text Extraction Only
            with gr.TabItem("π Text Extraction Only"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Input")
                        input_audio_transcribe = gr.Audio(
                            label="Upload Audio File for Text Extraction",
                            type="filepath"
                        )
                        transcribe_btn = gr.Button("π Extract Text Content", variant="primary")
                    with gr.Column(scale=1):
                        gr.Markdown("### Output")
                        transcription_output = gr.Textbox(
                            label="Extracted Text",
                            lines=10,
                            placeholder="Extracted text will appear here..."
                        )
                        transcribe_status = gr.Textbox(
                            label="Status",
                            interactive=False,
                            placeholder="Status messages will appear here..."
                        )
            # Tab 4: Voice Cloning Only
            with gr.TabItem("π― Voice Cloning Only"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Input")
                        input_audio_clone = gr.Audio(
                            label="Upload Audio File (WAV, MP3, etc.)",
                            type="filepath"
                        )
                        text_input_clone = gr.Textbox(
                            label="Text to Speak",
                            placeholder="Enter the text you want to be spoken in the cloned voice...",
                            lines=3,
                            value="Hello, this is a test of the voice cloning system. The AI will speak this text using the characteristics of the uploaded audio."
                        )
                        clone_btn_only = gr.Button("π― Clone Voice & Generate", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        gr.Markdown("### Output")
                        output_audio_clone = gr.Audio(
                            label="Generated Audio (Cloned Voice)",
                            type="filepath"
                        )
                        status_message_clone = gr.Textbox(
                            label="Status",
                            interactive=False,
                            placeholder="Status messages will appear here..."
                        )
        # Event handling: wire each button to its handler.
        reproduce_btn.click(
            fn=process_complete_reproduction,
            inputs=[input_audio_reproduce],
            outputs=[output_audio_reproduce, transcribed_content, status_reproduce]
        )
        auto_transcribe_btn.click(
            fn=process_audio_with_transcription,
            inputs=[input_audio_main],
            outputs=[text_input_main, status_message_main]
        )
        clone_btn_main.click(
            fn=process_voice_cloning,
            inputs=[input_audio_main, text_input_main],
            outputs=[output_audio_main, status_message_main]
        )
        transcribe_btn.click(
            fn=transcribe_only,
            inputs=[input_audio_transcribe],
            outputs=[transcription_output, transcribe_status]
        )
        clone_btn_only.click(
            fn=process_voice_cloning,
            inputs=[input_audio_clone, text_input_clone],
            outputs=[output_audio_clone, status_message_clone]
        )
        # Add tips section.
        gr.Markdown("### π‘ Natural Voice Cloning with Background Analysis")
        gr.Markdown("""
**π― "Reproduce Original Audio" Tab - Main Feature:**
- β Simply upload your audio file
- β AI extracts ONLY the spoken content (no analysis descriptions)
- β Voice characteristics are analyzed in the background for natural speech
- β Generated audio sounds natural with proper timing and tone
**Quality Tips for Best Results:**
- π€ Use clear audio with minimal background noise
- β±οΈ 15-60 seconds of natural speech works best
- π£οΈ Conversational tone works better than robotic reading
- π΅ Avoid music or overlapping voices
""")
        gr.Markdown("### π§ How It Works")
        gr.Markdown("""
**Behind the Scenes:**
1. **Content Extraction:** AI extracts only the spoken words (no descriptions)
2. **Voice Analysis:** AI analyzes pace, tone, pauses in the background
3. **Natural Synthesis:** Voice characteristics are applied to create natural-sounding speech
4. **Clean Output:** You get natural-sounding cloned voice without technical descriptions
""")
    return demo
# Create and launch the interface
if __name__ == "__main__":
    # Banner with setup guidance, printed before the UI starts.
    print("ποΈ Enhanced Audio Voice Cloning Tool")
    print("=" * 50)

    # Collect every required API key that is absent from the environment.
    missing_keys = [
        key
        for key in ("ELEVENLABS_API_KEY", "GOOGLE_API_KEY")
        if not os.getenv(key)
    ]
    if missing_keys:
        print("β οΈ WARNING: Missing environment variables!")
        for key in missing_keys:
            print(f" {key} not set")
        print("\nPlease set your API keys:")
        print(" export ELEVENLABS_API_KEY='your-elevenlabs-key-here'")
        print(" export GOOGLE_API_KEY='your-google-key-here'")
        print("Or create a .env file with:")
        print(" ELEVENLABS_API_KEY=your-elevenlabs-key-here")
        print(" GOOGLE_API_KEY=your-google-key-here")
        print()

    # Build the Gradio app and serve it.
    demo = create_enhanced_voice_cloning_interface()
    demo.launch(
        share=True,  # Set to False if you don't want to create a public link
        debug=True,
        server_name="0.0.0.0",  # Allow access from other devices on network
        server_port=7860
    )