"""AI Voice Clone Studio.

Streamlit front end for the project's voice-cloning pipeline: choose a
reference (target) voice and an input (source) audio, run the cloning
engine, then compare, download, or save the result as a voice profile.

NOTE(review): this file was recovered from a whitespace-mangled paste in
which the inline HTML of every ``st.markdown`` call had been stripped.
The HTML snippets below are reconstructions of the visible text only —
confirm against the original stylesheet/markup if it can be found.
"""

import base64
import io
import json
import os
import pickle
import tempfile
import zipfile
from datetime import datetime
from pathlib import Path

import librosa
import librosa.display  # required: ``import librosa`` does not pull in the display submodule
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import requests
import soundfile as sf
import streamlit as st
import torch
import torchaudio
from scipy.signal import butter, filtfilt

# Project-local voice-cloning modules.
from voice_cloning_engine import VoiceCloningEngine
from audio_processor import AudioProcessor
from voice_analyzer import VoiceAnalyzer

# Page configuration — must be the first Streamlit command in the script.
st.set_page_config(
    page_title="AI Voice Clone Studio",
    page_icon="🎭",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS hook. The original stylesheet was stripped from the source;
# add project styles inside the <style> tag when they are recovered.
st.markdown("<style></style>", unsafe_allow_html=True)

# Initialize session state so reruns keep the engine, audio, and profiles.
if 'cloning_engine' not in st.session_state:
    st.session_state.cloning_engine = None
if 'reference_voice' not in st.session_state:
    st.session_state.reference_voice = None
if 'cloned_audio' not in st.session_state:
    st.session_state.cloned_audio = None
if 'voice_profiles' not in st.session_state:
    st.session_state.voice_profiles = {}


@st.cache_resource
def load_cloning_engine():
    """Initialize the voice cloning engine (cached for the app's lifetime)."""
    return VoiceCloningEngine()


def save_uploaded_file(uploaded_file, directory="temp"):
    """Save a Streamlit ``UploadedFile`` to *directory* and return its path.

    Args:
        uploaded_file: The object returned by ``st.file_uploader`` (or None).
        directory: Destination directory, created if missing.

    Returns:
        The saved file's path, or ``None`` when *uploaded_file* is ``None``.
    """
    if uploaded_file is None:
        return None
    os.makedirs(directory, exist_ok=True)
    # basename() prevents a crafted upload name from escaping *directory*.
    file_path = os.path.join(directory, os.path.basename(uploaded_file.name))
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path


def create_audio_comparison(original_audio, cloned_audio, sample_rate):
    """Return a matplotlib figure with stacked original/cloned waveforms.

    Args:
        original_audio: 1-D sample array of the source audio.
        cloned_audio: 1-D sample array of the cloned result.
        sample_rate: Sample rate (Hz) shared by both signals.
    """
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

    # Original audio waveform.
    time_original = np.linspace(0, len(original_audio) / sample_rate, len(original_audio))
    ax1.plot(time_original, original_audio, color='blue', alpha=0.7)
    ax1.set_title('Original Audio', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Time (seconds)')
    ax1.set_ylabel('Amplitude')
    ax1.grid(True, alpha=0.3)

    # Cloned audio waveform.
    time_cloned = np.linspace(0, len(cloned_audio) / sample_rate, len(cloned_audio))
    ax2.plot(time_cloned, cloned_audio, color='red', alpha=0.7)
    ax2.set_title('Voice Cloned Audio', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Time (seconds)')
    ax2.set_ylabel('Amplitude')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig


def create_spectrogram_comparison(original_audio, cloned_audio, sample_rate):
    """Return a matplotlib figure with side-by-side dB spectrograms."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Original spectrogram (STFT magnitude converted to dB, max-referenced).
    D1 = librosa.amplitude_to_db(np.abs(librosa.stft(original_audio)), ref=np.max)
    librosa.display.specshow(D1, sr=sample_rate, x_axis='time', y_axis='hz',
                             ax=ax1, cmap='viridis')
    ax1.set_title('Original Audio Spectrogram')

    # Cloned spectrogram.
    D2 = librosa.amplitude_to_db(np.abs(librosa.stft(cloned_audio)), ref=np.max)
    librosa.display.specshow(D2, sr=sample_rate, x_axis='time', y_axis='hz',
                             ax=ax2, cmap='viridis')
    ax2.set_title('Voice Cloned Audio Spectrogram')

    plt.tight_layout()
    return fig


def main():
    """Render the full Voice Clone Studio page (runs once per Streamlit rerun)."""
    # ---- Header -----------------------------------------------------------
    st.markdown(
        '<h1 style="text-align:center">🎭 AI Voice Clone Studio</h1>',
        unsafe_allow_html=True,
    )
    st.markdown("### Transform any voice into any other voice with advanced AI")

    # Lazily create the (cached) cloning engine on first run.
    if st.session_state.cloning_engine is None:
        with st.spinner("🚀 Loading Voice Cloning Engine..."):
            st.session_state.cloning_engine = load_cloning_engine()

    # ---- Sidebar configuration -------------------------------------------
    with st.sidebar:
        st.header("⚙️ Voice Cloning Settings")

        cloning_method = st.selectbox(
            "Cloning Method:",
            ["OpenVoice", "Real-Time VC", "SV2TTS", "Neural Voice Puppetry"],
            help="Choose the voice cloning algorithm",
        )

        st.subheader("🎛️ Quality Settings")
        # NOTE(review): quality_level is collected but not yet forwarded to
        # the engine — wire it into clone_voice() when supported.
        quality_level = st.select_slider(
            "Quality Level:",
            options=["Fast", "Balanced", "High Quality"],
            value="Balanced",
        )
        preserve_emotion = st.checkbox("Preserve Emotion", value=True)
        preserve_accent = st.checkbox("Preserve Accent", value=True)
        preserve_pace = st.checkbox("Preserve Speaking Pace", value=True)

        with st.expander("🔧 Advanced Settings"):
            # NOTE(review): these advanced toggles (except enhance_quality)
            # are currently display-only — confirm intended behavior.
            similarity_threshold = st.slider("Voice Similarity Threshold", 0.5, 1.0, 0.8)
            noise_reduction = st.checkbox("Apply Noise Reduction", value=True)
            auto_trim = st.checkbox("Auto-trim Silence", value=True)
            enhance_quality = st.checkbox("Enhance Audio Quality", value=True)

    # ---- Main interface: reference (target) and input (source) columns ----
    col1, col2 = st.columns([1, 1])

    with col1:
        st.markdown(
            """
            <h3>🎤 Reference Voice (Target)</h3>
            <p>Upload or record the voice you want to clone</p>
            """,
            unsafe_allow_html=True,
        )

        reference_method = st.radio(
            "Reference Voice Input:",
            ["Upload Audio File", "Record Live", "Use Saved Profile"],
            horizontal=True,
        )

        reference_audio_data = None
        reference_sr = None

        if reference_method == "Upload Audio File":
            reference_file = st.file_uploader(
                "Upload Reference Voice:",
                type=['wav', 'mp3', 'flac', 'm4a'],
                help="Upload a clear audio sample of the target voice (10+ seconds recommended)",
            )
            if reference_file:
                file_path = save_uploaded_file(reference_file, "reference_voices")
                # sr=None keeps the file's native sample rate.
                reference_audio_data, reference_sr = librosa.load(file_path, sr=None)
                st.audio(reference_file, format='audio/wav')

                if st.button("🔍 Analyze Reference Voice"):
                    with st.spinner("Analyzing voice characteristics..."):
                        analyzer = VoiceAnalyzer()
                        voice_features = analyzer.analyze_voice(
                            reference_audio_data, reference_sr
                        )
                        st.json(voice_features)

        elif reference_method == "Record Live":
            st.info("🎙️ Use the record button below to capture reference voice")
            # Placeholder: an audio-recorder component would go here.
            st.warning("Live recording feature requires additional setup")

        elif reference_method == "Use Saved Profile":
            if st.session_state.voice_profiles:
                selected_profile = st.selectbox(
                    "Select Voice Profile:",
                    list(st.session_state.voice_profiles.keys()),
                )
                if selected_profile:
                    profile_data = st.session_state.voice_profiles[selected_profile]
                    reference_audio_data = profile_data['audio_data']
                    reference_sr = profile_data['sample_rate']
                    st.success(f"✅ Loaded voice profile: {selected_profile}")
            else:
                st.info("No saved voice profiles available")

    with col2:
        st.markdown(
            """
            <h3>📢 Input Audio (Source)</h3>
            <p>Upload the audio you want to transform</p>
            """,
            unsafe_allow_html=True,
        )

        input_method = st.radio(
            "Input Audio Method:",
            ["Upload Audio File", "Record Live", "Text-to-Speech"],
            horizontal=True,
        )

        input_audio_data = None
        input_sr = None

        if input_method == "Upload Audio File":
            input_file = st.file_uploader(
                "Upload Input Audio:",
                type=['wav', 'mp3', 'flac', 'm4a'],
                help="Upload the audio you want to transform to the reference voice",
            )
            if input_file:
                file_path = save_uploaded_file(input_file, "temp")
                input_audio_data, input_sr = librosa.load(file_path, sr=None)
                st.audio(input_file, format='audio/wav')

        elif input_method == "Record Live":
            st.info("🎙️ Use the record button below to capture input audio")
            st.warning("Live recording feature requires additional setup")

        elif input_method == "Text-to-Speech":
            tts_text = st.text_area(
                "Enter text to convert:",
                height=150,
                placeholder="Type the text you want to speak in the cloned voice...",
            )
            if tts_text and st.button("🗣️ Generate TTS"):
                with st.spinner("Generating speech from text..."):
                    # Placeholder: real TTS generation is not implemented yet.
                    st.success("TTS generated! Now clone the voice.")

    # ---- Voice cloning process (needs both reference and input audio) -----
    if reference_audio_data is not None and input_audio_data is not None:
        st.markdown("---")
        st.markdown(
            """
            <h3>🎭 Voice Cloning Process</h3>
            <p>Ready to clone the reference voice and apply it to your input audio!</p>
            """,
            unsafe_allow_html=True,
        )

        col1, col2, col3 = st.columns([1, 2, 1])
        with col2:
            if st.button("🚀 Start Voice Cloning", type="primary", use_container_width=True):
                try:
                    with st.spinner("🎭 Cloning voice... This may take a few minutes"):
                        progress_bar = st.progress(0)
                        status_text = st.empty()

                        # Step 1: preprocess both signals.
                        status_text.text("📊 Preprocessing audio...")
                        progress_bar.progress(20)
                        processor = AudioProcessor()
                        ref_processed = processor.preprocess_audio(
                            reference_audio_data, reference_sr
                        )
                        input_processed = processor.preprocess_audio(
                            input_audio_data, input_sr
                        )

                        # Step 2: extract voice features.
                        status_text.text("🔍 Extracting voice features...")
                        progress_bar.progress(40)

                        # Step 3: run the cloning engine.
                        status_text.text("🎭 Performing voice cloning...")
                        progress_bar.progress(60)
                        cloned_audio = st.session_state.cloning_engine.clone_voice(
                            reference_audio=ref_processed,
                            input_audio=input_processed,
                            method=cloning_method,
                            preserve_emotion=preserve_emotion,
                            preserve_accent=preserve_accent,
                            preserve_pace=preserve_pace,
                        )

                        # Step 4: optional post-processing.
                        status_text.text("✨ Post-processing...")
                        progress_bar.progress(80)
                        if enhance_quality:
                            cloned_audio = processor.enhance_audio(cloned_audio)

                        progress_bar.progress(100)
                        status_text.text("✅ Voice cloning completed!")

                        # Persist the result so it survives the next rerun.
                        st.session_state.cloned_audio = {
                            'audio_data': cloned_audio,
                            'sample_rate': input_sr,
                            'original_input': input_audio_data,
                            'reference_voice': reference_audio_data,
                        }
                        st.success("🎉 Voice cloning successful!")

                except Exception as e:
                    # Surface the failure in the UI rather than crashing the app.
                    st.error(f"❌ Error during voice cloning: {str(e)}")

    # ---- Results section --------------------------------------------------
    if st.session_state.cloned_audio:
        st.markdown("---")
        st.markdown(
            """
            <h3>🎵 Cloning Results</h3>
            <p>Your voice has been successfully cloned!</p>
            """,
            unsafe_allow_html=True,
        )

        cloned_data = st.session_state.cloned_audio

        # Audio players for input / reference / result.
        st.subheader("🔊 Audio Comparison")
        col1, col2, col3 = st.columns(3)

        with col1:
            st.markdown("**📢 Original Input:**")
            input_bytes = AudioProcessor.audio_to_bytes(
                cloned_data['original_input'], cloned_data['sample_rate']
            )
            st.audio(input_bytes, format='audio/wav')

        with col2:
            st.markdown("**🎤 Reference Voice:**")
            ref_bytes = AudioProcessor.audio_to_bytes(
                cloned_data['reference_voice'], cloned_data['sample_rate']
            )
            st.audio(ref_bytes, format='audio/wav')

        with col3:
            st.markdown("**🎭 Cloned Result:**")
            cloned_bytes = AudioProcessor.audio_to_bytes(
                cloned_data['audio_data'], cloned_data['sample_rate']
            )
            st.audio(cloned_bytes, format='audio/wav')

        # Visual analysis tabs.
        st.subheader("📊 Audio Analysis")
        tab1, tab2, tab3 = st.tabs(
            ["Waveform Comparison", "Spectrogram Analysis", "Voice Similarity"]
        )

        with tab1:
            fig_wave = create_audio_comparison(
                cloned_data['original_input'],
                cloned_data['audio_data'],
                cloned_data['sample_rate'],
            )
            st.pyplot(fig_wave)

        with tab2:
            fig_spec = create_spectrogram_comparison(
                cloned_data['original_input'],
                cloned_data['audio_data'],
                cloned_data['sample_rate'],
            )
            st.pyplot(fig_spec)

        with tab3:
            # Similarity between the reference voice and the cloned output,
            # shown as a gauge (0-100).
            analyzer = VoiceAnalyzer()
            similarity_score = analyzer.calculate_similarity(
                cloned_data['reference_voice'],
                cloned_data['audio_data'],
                cloned_data['sample_rate'],
            )

            fig_gauge = go.Figure(go.Indicator(
                mode="gauge+number+delta",
                value=similarity_score * 100,
                domain={'x': [0, 1], 'y': [0, 1]},
                title={'text': "Voice Similarity Score"},
                delta={'reference': 80},
                gauge={
                    'axis': {'range': [None, 100]},
                    'bar': {'color': "darkblue"},
                    'steps': [
                        {'range': [0, 50], 'color': "lightgray"},
                        {'range': [50, 80], 'color': "gray"},
                    ],
                    'threshold': {
                        'line': {'color': "red", 'width': 4},
                        'thickness': 0.75,
                        'value': 90,
                    },
                },
            ))
            st.plotly_chart(fig_gauge, use_container_width=True)

        # Download / save options.
        st.subheader("💾 Download Options")
        col1, col2, col3 = st.columns(3)

        with col1:
            st.download_button(
                label="⬇️ Download WAV",
                data=cloned_bytes,
                file_name=f"voice_cloned_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav",
                mime="audio/wav",
            )

        with col2:
            if st.button("⬇️ Download MP3"):
                st.info("MP3 conversion feature coming soon!")

        with col3:
            profile_name = st.text_input("Voice Profile Name:", placeholder="My Voice Clone")
            if st.button("💾 Save Profile") and profile_name:
                # A profile stores the reference voice, not the cloned output.
                st.session_state.voice_profiles[profile_name] = {
                    'audio_data': cloned_data['reference_voice'],
                    'sample_rate': cloned_data['sample_rate'],
                    'created': datetime.now().isoformat(),
                }
                st.success(f"✅ Voice profile '{profile_name}' saved!")

    # ---- Voice profile manager --------------------------------------------
    if st.session_state.voice_profiles:
        st.markdown("---")
        st.subheader("👤 Voice Profile Manager")

        # Iterate over a snapshot: deleting a profile mutates the dict, which
        # would raise RuntimeError if we iterated the live .items() view.
        for profile_name, profile_data in list(st.session_state.voice_profiles.items()):
            col1, col2, col3 = st.columns([2, 1, 1])

            with col1:
                st.write(f"**{profile_name}**")
                st.caption(f"Created: {profile_data['created']}")

            with col2:
                audio_bytes = AudioProcessor.audio_to_bytes(
                    profile_data['audio_data'],
                    profile_data['sample_rate'],
                )
                st.audio(audio_bytes, format='audio/wav')

            with col3:
                if st.button(f"🗑️ Delete", key=f"del_{profile_name}"):
                    del st.session_state.voice_profiles[profile_name]
                    st.rerun()

    # ---- Footer ------------------------------------------------------------
    st.markdown("---")
    st.markdown(
        """
        <p style="text-align:center">
        🎭 AI Voice Clone Studio - Advanced Voice Cloning Technology<br>
        Transform any voice into any other voice with state-of-the-art AI<br>
        ⚠️ Use responsibly and with consent from voice owners
        </p>
        """,
        unsafe_allow_html=True,
    )


if __name__ == "__main__":
    main()