Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import torch | |
| import torchaudio | |
| import numpy as np | |
| import librosa | |
| import soundfile as sf | |
| import matplotlib.pyplot as plt | |
| import plotly.graph_objects as go | |
| import plotly.express as px | |
| from scipy.signal import butter, filtfilt | |
| import tempfile | |
| import os | |
| import io | |
| import base64 | |
| from datetime import datetime | |
| import requests | |
| import zipfile | |
| from pathlib import Path | |
| import pickle | |
| import json | |
| # Import voice cloning modules | |
| from voice_cloning_engine import VoiceCloningEngine | |
| from audio_processor import AudioProcessor | |
| from voice_analyzer import VoiceAnalyzer | |
# Page configuration: browser-tab title and icon, wide layout, sidebar open.
st.set_page_config(
    page_title="AI Voice Clone Studio",
    page_icon="🎭",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS: gradient header plus the coloured boxes used by main()
# (.reference-box, .input-box, .clone-box, .result-box).
st.markdown("""
<style>
.main-header {
font-size: 3rem;
font-weight: bold;
text-align: center;
margin-bottom: 2rem;
background: linear-gradient(90deg, #ff6b6b, #4ecdc4, #45b7d1);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
.clone-box {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 2rem;
border-radius: 15px;
color: white;
margin: 1rem 0;
}
.reference-box {
background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
padding: 1.5rem;
border-radius: 10px;
color: white;
margin: 1rem 0;
}
.input-box {
background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
padding: 1.5rem;
border-radius: 10px;
color: white;
margin: 1rem 0;
}
.result-box {
background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
padding: 1.5rem;
border-radius: 10px;
color: white;
margin: 1rem 0;
}
.stAudio {
margin: 1rem 0;
}
</style>
""", unsafe_allow_html=True)

# Session-state defaults. Each key is only seeded when it is absent, so values
# written by earlier script runs are left untouched.
for _state_key, _state_default in (
    ('cloning_engine', None),
    ('reference_voice', None),
    ('cloned_audio', None),
    ('voice_profiles', {}),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
def load_cloning_engine():
    """Construct and return a new ``VoiceCloningEngine``.

    The engine is built with no arguments; callers decide where to keep it
    (``main`` stores it in ``st.session_state.cloning_engine``).
    """
    engine = VoiceCloningEngine()
    return engine
def save_uploaded_file(uploaded_file, directory="temp"):
    """Write an uploaded file into *directory* and return the path written.

    Args:
        uploaded_file: Object exposing ``.name`` and ``.getbuffer()`` (for
            example a Streamlit uploaded file), or ``None``.
        directory: Destination folder; created if it does not exist.

    Returns:
        The path of the saved file, or ``None`` when *uploaded_file* is
        ``None``.
    """
    if uploaded_file is None:
        return None

    # The file name is supplied by the client. Keep only its final component
    # so names such as "../x.wav" or "C:\\dir\\x.wav" cannot place the file
    # outside *directory*.
    safe_name = os.path.basename(str(uploaded_file.name).replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        safe_name = "uploaded_file"

    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path
def create_audio_comparison(original_audio, cloned_audio, sample_rate):
    """Plot the two waveforms stacked vertically and return the figure.

    The top panel shows *original_audio* in blue and the bottom panel shows
    *cloned_audio* in red. Both time axes are derived from *sample_rate*.
    """
    fig, axes = plt.subplots(2, 1, figsize=(12, 8))

    panels = (
        (original_audio, 'blue', 'Original Audio'),
        (cloned_audio, 'red', 'Voice Cloned Audio'),
    )
    for axis, (signal, line_color, heading) in zip(axes, panels):
        n_samples = len(signal)
        # Time axis spans 0 .. duration in seconds, one point per sample.
        seconds = np.linspace(0, n_samples / sample_rate, n_samples)
        axis.plot(seconds, signal, color=line_color, alpha=0.7)
        axis.set_title(heading, fontsize=14, fontweight='bold')
        axis.set_xlabel('Time (seconds)')
        axis.set_ylabel('Amplitude')
        axis.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig
def create_spectrogram_comparison(original_audio, cloned_audio, sample_rate):
    """Draw side-by-side dB spectrograms of both signals and return the figure.

    Each signal is run through ``librosa.stft``; its magnitude is converted
    to decibels relative to its own peak (``ref=np.max``) and rendered with
    ``librosa.display.specshow`` on a time / Hz grid.
    """
    # ``import librosa`` alone does not guarantee that the ``display``
    # submodule is loaded on every librosa release, so import it explicitly
    # before using ``librosa.display.specshow``.
    import librosa.display

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    panels = (
        (original_audio, 'Original Audio Spectrogram'),
        (cloned_audio, 'Voice Cloned Audio Spectrogram'),
    )
    for axis, (signal, heading) in zip(axes, panels):
        magnitude_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(
            magnitude_db,
            sr=sample_rate,
            x_axis='time',
            y_axis='hz',
            ax=axis,
            cmap='viridis',
        )
        axis.set_title(heading)

    plt.tight_layout()
    return fig
def main():
    """Render the Voice Clone Studio page and drive the cloning workflow.

    The page is built top to bottom on every Streamlit run:

    1. Header, and creation of the cloning engine stored in session state.
    2. Sidebar settings (method, quality, preservation flags, advanced).
    3. Two columns collecting the reference (target) voice and input audio.
    4. A "Start Voice Cloning" button, shown once both signals are loaded,
       that preprocesses, clones, optionally enhances, and stores the result
       in ``st.session_state.cloned_audio``.
    5. Results: audio players, waveform / spectrogram / similarity tabs,
       download options and "save as profile".
    6. A manager for saved voice profiles, and a footer.

    The reference voice keeps its own sample rate
    (``'reference_sample_rate'`` in the stored result) so it is played back,
    saved and compared at the correct rate even when it differs from the
    input audio's rate.
    """
    # Header
    st.markdown('<div class="main-header">🎭 AI Voice Clone Studio</div>', unsafe_allow_html=True)
    st.markdown("### Transform any voice into any other voice with advanced AI")

    # Initialize cloning engine once per session.
    if st.session_state.cloning_engine is None:
        with st.spinner("🚀 Loading Voice Cloning Engine..."):
            st.session_state.cloning_engine = load_cloning_engine()

    # Sidebar Configuration
    with st.sidebar:
        st.header("⚙️ Voice Cloning Settings")

        # Model Selection
        cloning_method = st.selectbox(
            "Cloning Method:",
            ["OpenVoice", "Real-Time VC", "SV2TTS", "Neural Voice Puppetry"],
            help="Choose the voice cloning algorithm"
        )

        # Quality Settings
        st.subheader("🎛️ Quality Settings")
        # NOTE: quality_level is collected but not consumed further down.
        quality_level = st.select_slider(
            "Quality Level:",
            options=["Fast", "Balanced", "High Quality"],
            value="Balanced"
        )
        preserve_emotion = st.checkbox("Preserve Emotion", value=True)
        preserve_accent = st.checkbox("Preserve Accent", value=True)
        preserve_pace = st.checkbox("Preserve Speaking Pace", value=True)

        # Advanced Settings
        with st.expander("🔧 Advanced Settings"):
            # NOTE: only enhance_quality is used below; the other three
            # values are collected but not yet wired into the pipeline.
            similarity_threshold = st.slider("Voice Similarity Threshold", 0.5, 1.0, 0.8)
            noise_reduction = st.checkbox("Apply Noise Reduction", value=True)
            auto_trim = st.checkbox("Auto-trim Silence", value=True)
            enhance_quality = st.checkbox("Enhance Audio Quality", value=True)

    # Main Interface
    col1, col2 = st.columns([1, 1])

    # Reference Voice Section
    with col1:
        st.markdown("""
        <div class="reference-box">
        <h3>🎤 Reference Voice (Target)</h3>
        <p>Upload or record the voice you want to clone</p>
        </div>
        """, unsafe_allow_html=True)

        reference_method = st.radio(
            "Reference Voice Input:",
            ["Upload Audio File", "Record Live", "Use Saved Profile"],
            horizontal=True
        )

        reference_audio_data = None
        reference_sr = None

        if reference_method == "Upload Audio File":
            reference_file = st.file_uploader(
                "Upload Reference Voice:",
                type=['wav', 'mp3', 'flac', 'm4a'],
                help="Upload a clear audio sample of the target voice (10+ seconds recommended)"
            )
            if reference_file:
                file_path = save_uploaded_file(reference_file, "reference_voices")
                # sr=None keeps the file's native sample rate.
                reference_audio_data, reference_sr = librosa.load(file_path, sr=None)
                st.audio(reference_file, format='audio/wav')

                # Voice Analysis
                if st.button("🔍 Analyze Reference Voice"):
                    with st.spinner("Analyzing voice characteristics..."):
                        analyzer = VoiceAnalyzer()
                        voice_features = analyzer.analyze_voice(reference_audio_data, reference_sr)
                        st.json(voice_features)

        elif reference_method == "Record Live":
            st.info("🎙️ Use the record button below to capture reference voice")
            # Audio recorder component would go here
            # For now, showing placeholder
            st.warning("Live recording feature requires additional setup")

        elif reference_method == "Use Saved Profile":
            if st.session_state.voice_profiles:
                selected_profile = st.selectbox(
                    "Select Voice Profile:",
                    list(st.session_state.voice_profiles.keys())
                )
                if selected_profile:
                    profile_data = st.session_state.voice_profiles[selected_profile]
                    reference_audio_data = profile_data['audio_data']
                    reference_sr = profile_data['sample_rate']
                    st.success(f"✅ Loaded voice profile: {selected_profile}")
            else:
                st.info("No saved voice profiles available")

    # Input Audio Section
    with col2:
        st.markdown("""
        <div class="input-box">
        <h3>📢 Input Audio (Source)</h3>
        <p>Upload the audio you want to transform</p>
        </div>
        """, unsafe_allow_html=True)

        input_method = st.radio(
            "Input Audio Method:",
            ["Upload Audio File", "Record Live", "Text-to-Speech"],
            horizontal=True
        )

        input_audio_data = None
        input_sr = None

        if input_method == "Upload Audio File":
            input_file = st.file_uploader(
                "Upload Input Audio:",
                type=['wav', 'mp3', 'flac', 'm4a'],
                help="Upload the audio you want to transform to the reference voice"
            )
            if input_file:
                file_path = save_uploaded_file(input_file, "temp")
                input_audio_data, input_sr = librosa.load(file_path, sr=None)
                st.audio(input_file, format='audio/wav')

        elif input_method == "Record Live":
            st.info("🎙️ Use the record button below to capture input audio")
            st.warning("Live recording feature requires additional setup")

        elif input_method == "Text-to-Speech":
            tts_text = st.text_area(
                "Enter text to convert:",
                height=150,
                placeholder="Type the text you want to speak in the cloned voice..."
            )
            if tts_text and st.button("🗣️ Generate TTS"):
                with st.spinner("Generating speech from text..."):
                    # Generate TTS audio (placeholder): no audio is produced
                    # here, so input_audio_data stays None on this path.
                    st.success("TTS generated! Now clone the voice.")

    # Voice Cloning Process (only offered once both signals are loaded)
    if reference_audio_data is not None and input_audio_data is not None:
        st.markdown("---")
        st.markdown("""
        <div class="clone-box">
        <h2>🎭 Voice Cloning Process</h2>
        <p>Ready to clone the reference voice and apply it to your input audio!</p>
        </div>
        """, unsafe_allow_html=True)

        # Centre the button using a wide middle column.
        _, center_col, _ = st.columns([1, 2, 1])
        with center_col:
            if st.button("🚀 Start Voice Cloning", type="primary", use_container_width=True):
                # Broad catch is deliberate: this is the UI boundary and any
                # failure is reported to the user instead of crashing the page.
                try:
                    with st.spinner("🎭 Cloning voice... This may take a few minutes"):
                        progress_bar = st.progress(0)
                        status_text = st.empty()

                        # Step 1: Preprocess audio
                        status_text.text("📊 Preprocessing audio...")
                        progress_bar.progress(20)
                        processor = AudioProcessor()
                        ref_processed = processor.preprocess_audio(reference_audio_data, reference_sr)
                        input_processed = processor.preprocess_audio(input_audio_data, input_sr)

                        # Step 2: Extract voice features
                        status_text.text("🔍 Extracting voice features...")
                        progress_bar.progress(40)

                        # Step 3: Voice cloning
                        status_text.text("🎭 Performing voice cloning...")
                        progress_bar.progress(60)
                        cloned_audio = st.session_state.cloning_engine.clone_voice(
                            reference_audio=ref_processed,
                            input_audio=input_processed,
                            method=cloning_method,
                            preserve_emotion=preserve_emotion,
                            preserve_accent=preserve_accent,
                            preserve_pace=preserve_pace
                        )

                        # Step 4: Post-processing
                        status_text.text("✨ Post-processing...")
                        progress_bar.progress(80)
                        if enhance_quality:
                            cloned_audio = processor.enhance_audio(cloned_audio)

                        progress_bar.progress(100)
                        status_text.text("✅ Voice cloning completed!")

                        # Store result.
                        # NOTE(review): 'sample_rate' assumes the cloned audio
                        # comes back at the input's sample rate — confirm
                        # against AudioProcessor / VoiceCloningEngine.
                        st.session_state.cloned_audio = {
                            'audio_data': cloned_audio,
                            'sample_rate': input_sr,
                            'original_input': input_audio_data,
                            'reference_voice': reference_audio_data,
                            # The reference clip may have a different rate
                            # than the input; keep it so playback, profile
                            # saving and similarity use the right one.
                            'reference_sample_rate': reference_sr
                        }
                        st.success("🎉 Voice cloning successful!")
                except Exception as e:
                    st.error(f"❌ Error during voice cloning: {str(e)}")

    # Results Section
    if st.session_state.cloned_audio:
        st.markdown("---")
        st.markdown("""
        <div class="result-box">
        <h2>🎵 Cloning Results</h2>
        <p>Your voice has been successfully cloned!</p>
        </div>
        """, unsafe_allow_html=True)

        cloned_data = st.session_state.cloned_audio
        output_sr = cloned_data['sample_rate']
        # Results stored without the dedicated key fall back to the shared rate.
        stored_reference_sr = cloned_data.get('reference_sample_rate', output_sr)

        # Audio Players
        st.subheader("🔊 Audio Comparison")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.markdown("**📢 Original Input:**")
            input_bytes = AudioProcessor.audio_to_bytes(cloned_data['original_input'], output_sr)
            st.audio(input_bytes, format='audio/wav')
        with col2:
            st.markdown("**🎤 Reference Voice:**")
            ref_bytes = AudioProcessor.audio_to_bytes(cloned_data['reference_voice'], stored_reference_sr)
            st.audio(ref_bytes, format='audio/wav')
        with col3:
            st.markdown("**🎭 Cloned Result:**")
            cloned_bytes = AudioProcessor.audio_to_bytes(cloned_data['audio_data'], output_sr)
            st.audio(cloned_bytes, format='audio/wav')

        # Visualizations
        st.subheader("📊 Audio Analysis")
        tab1, tab2, tab3 = st.tabs(["Waveform Comparison", "Spectrogram Analysis", "Voice Similarity"])
        with tab1:
            fig_wave = create_audio_comparison(
                cloned_data['original_input'],
                cloned_data['audio_data'],
                output_sr
            )
            st.pyplot(fig_wave)
        with tab2:
            fig_spec = create_spectrogram_comparison(
                cloned_data['original_input'],
                cloned_data['audio_data'],
                output_sr
            )
            st.pyplot(fig_spec)
        with tab3:
            # Voice similarity metrics. calculate_similarity takes a single
            # sample rate, so bring the reference to the output rate first
            # when the two differ.
            reference_for_similarity = cloned_data['reference_voice']
            if stored_reference_sr != output_sr:
                reference_for_similarity = librosa.resample(
                    reference_for_similarity,
                    orig_sr=stored_reference_sr,
                    target_sr=output_sr
                )
            analyzer = VoiceAnalyzer()
            similarity_score = analyzer.calculate_similarity(
                reference_for_similarity,
                cloned_data['audio_data'],
                output_sr
            )

            # Create similarity gauge (score shown as a percentage).
            fig_gauge = go.Figure(go.Indicator(
                mode="gauge+number+delta",
                value=similarity_score * 100,
                domain={'x': [0, 1], 'y': [0, 1]},
                title={'text': "Voice Similarity Score"},
                delta={'reference': 80},
                gauge={
                    'axis': {'range': [None, 100]},
                    'bar': {'color': "darkblue"},
                    'steps': [
                        {'range': [0, 50], 'color': "lightgray"},
                        {'range': [50, 80], 'color': "gray"}
                    ],
                    'threshold': {
                        'line': {'color': "red", 'width': 4},
                        'thickness': 0.75,
                        'value': 90
                    }
                }
            ))
            st.plotly_chart(fig_gauge, use_container_width=True)

        # Download Options
        st.subheader("💾 Download Options")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.download_button(
                label="⬇️ Download WAV",
                data=cloned_bytes,
                file_name=f"voice_cloned_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav",
                mime="audio/wav"
            )
        with col2:
            # Convert to MP3 and download
            if st.button("⬇️ Download MP3"):
                st.info("MP3 conversion feature coming soon!")
        with col3:
            # Save the reference voice as a reusable profile.
            profile_name = st.text_input("Voice Profile Name:", placeholder="My Voice Clone")
            if st.button("💾 Save Profile") and profile_name:
                st.session_state.voice_profiles[profile_name] = {
                    'audio_data': cloned_data['reference_voice'],
                    # Stored audio is the reference clip, so keep its own rate.
                    'sample_rate': stored_reference_sr,
                    'created': datetime.now().isoformat()
                }
                st.success(f"✅ Voice profile '{profile_name}' saved!")

    # Voice Profile Manager
    if st.session_state.voice_profiles:
        st.markdown("---")
        st.subheader("👤 Voice Profile Manager")
        # Iterate over a snapshot: the delete button mutates the dict.
        for saved_name, saved_profile in list(st.session_state.voice_profiles.items()):
            col1, col2, col3 = st.columns([2, 1, 1])
            with col1:
                st.write(f"**{saved_name}**")
                st.caption(f"Created: {saved_profile['created']}")
            with col2:
                audio_bytes = AudioProcessor.audio_to_bytes(
                    saved_profile['audio_data'],
                    saved_profile['sample_rate']
                )
                st.audio(audio_bytes, format='audio/wav')
            with col3:
                if st.button("🗑️ Delete", key=f"del_{saved_name}"):
                    del st.session_state.voice_profiles[saved_name]
                    st.rerun()

    # Footer
    st.markdown("---")
    st.markdown(
        """
        <div style="text-align: center; color: #666; padding: 2rem;">
        🎭 <strong>AI Voice Clone Studio</strong> - Advanced Voice Cloning Technology<br>
        Transform any voice into any other voice with state-of-the-art AI<br>
        <small>⚠️ Use responsibly and with consent from voice owners</small>
        </div>
        """,
        unsafe_allow_html=True
    )


if __name__ == "__main__":
    main()