# voiceclone-dev / app.py
# Update app.py — commit 2c8d218 (verified), 19.7 kB
import streamlit as st
import torch
import torchaudio
import numpy as np
import librosa
import soundfile as sf
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from scipy.signal import butter, filtfilt
import tempfile
import os
import io
import base64
from datetime import datetime
import requests
import zipfile
from pathlib import Path
import pickle
import json
# Import voice cloning modules
from voice_cloning_engine import VoiceCloningEngine
from audio_processor import AudioProcessor
from voice_analyzer import VoiceAnalyzer
# Streamlit page chrome: wide layout, sidebar open on load.
st.set_page_config(
    page_title="AI Voice Clone Studio",
    page_icon="🎭",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Custom CSS injected into the page: a gradient-text main header plus the
# colored "card" boxes (.clone-box / .reference-box / .input-box / .result-box)
# used by the sections rendered further down (requires unsafe_allow_html).
st.markdown("""
<style>
.main-header {
font-size: 3rem;
font-weight: bold;
text-align: center;
margin-bottom: 2rem;
background: linear-gradient(90deg, #ff6b6b, #4ecdc4, #45b7d1);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
.clone-box {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 2rem;
border-radius: 15px;
color: white;
margin: 1rem 0;
}
.reference-box {
background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
padding: 1.5rem;
border-radius: 10px;
color: white;
margin: 1rem 0;
}
.input-box {
background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
padding: 1.5rem;
border-radius: 10px;
color: white;
margin: 1rem 0;
}
.result-box {
background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
padding: 1.5rem;
border-radius: 10px;
color: white;
margin: 1rem 0;
}
.stAudio {
margin: 1rem 0;
}
</style>
""", unsafe_allow_html=True)
# Initialize the session-state keys the page reads before any user action.
# (Original paste had the `if` bodies at column 0, which does not parse.)
_SESSION_DEFAULTS = {
    'cloning_engine': None,    # replaced lazily by load_cloning_engine() in main()
    'reference_voice': None,
    'cloned_audio': None,      # dict with audio_data/sample_rate/... after a clone
    'voice_profiles': {},      # profile name -> {'audio_data', 'sample_rate', 'created'}
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
@st.cache_resource
def load_cloning_engine():
    """Create the VoiceCloningEngine once per server process.

    `st.cache_resource` makes the engine a shared singleton across reruns
    and sessions. (Indentation restored — the pasted body was at column 0.)
    """
    return VoiceCloningEngine()
def save_uploaded_file(uploaded_file, directory="temp"):
    """Persist a Streamlit UploadedFile to *directory* and return its path.

    Args:
        uploaded_file: object exposing `.name` and `.getbuffer()` (Streamlit
            UploadedFile), or None when nothing was uploaded.
        directory: destination directory, created if missing.

    Returns:
        The written file's path, or None when *uploaded_file* is None.
    """
    if uploaded_file is None:
        return None
    os.makedirs(directory, exist_ok=True)
    # Untrusted, client-supplied filename: keep only the basename so a
    # crafted name like "../../x" cannot escape *directory*.
    file_path = os.path.join(directory, os.path.basename(uploaded_file.name))
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path
def create_audio_comparison(original_audio, cloned_audio, sample_rate):
    """Plot the original and cloned waveforms as two stacked axes.

    Args:
        original_audio: 1-D sample array of the source clip.
        cloned_audio: 1-D sample array of the cloned clip.
        sample_rate: sample rate in Hz (a single value is used for both
            clips — assumes they share one rate; TODO confirm at call sites).

    Returns:
        The matplotlib Figure containing both waveform plots.
    """
    fig, (ax_orig, ax_clone) = plt.subplots(2, 1, figsize=(12, 8))
    panels = (
        (ax_orig, original_audio, 'Original Audio', 'blue'),
        (ax_clone, cloned_audio, 'Voice Cloned Audio', 'red'),
    )
    for ax, audio, title, color in panels:
        # x-axis in seconds, one point per sample.
        t = np.linspace(0, len(audio) / sample_rate, len(audio))
        ax.plot(t, audio, color=color, alpha=0.7)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Time (seconds)')
        ax.set_ylabel('Amplitude')
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
    return fig
def create_spectrogram_comparison(original_audio, cloned_audio, sample_rate):
    """Plot dB-scaled STFT spectrograms of original vs. cloned audio side by side.

    Returns the matplotlib Figure with the two spectrogram axes.
    """
    # `librosa.display` is a submodule; `import librosa` alone is not
    # guaranteed to load it (older librosa raises AttributeError otherwise),
    # so import it explicitly before calling specshow.
    import librosa.display
    fig, (ax_orig, ax_clone) = plt.subplots(1, 2, figsize=(15, 6))
    panels = (
        (ax_orig, original_audio, 'Original Audio Spectrogram'),
        (ax_clone, cloned_audio, 'Voice Cloned Audio Spectrogram'),
    )
    for ax, audio, title in panels:
        # Magnitude STFT converted to dB relative to the peak.
        db = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)
        librosa.display.specshow(db, sr=sample_rate, x_axis='time',
                                 y_axis='hz', ax=ax, cmap='viridis')
        ax.set_title(title)
    plt.tight_layout()
    return fig
def _render_sidebar():
    """Render the sidebar controls; return all chosen settings as a dict."""
    with st.sidebar:
        st.header("⚙️ Voice Cloning Settings")
        cloning_method = st.selectbox(
            "Cloning Method:",
            ["OpenVoice", "Real-Time VC", "SV2TTS", "Neural Voice Puppetry"],
            help="Choose the voice cloning algorithm"
        )
        st.subheader("🎛️ Quality Settings")
        quality_level = st.select_slider(
            "Quality Level:",
            options=["Fast", "Balanced", "High Quality"],
            value="Balanced"
        )
        preserve_emotion = st.checkbox("Preserve Emotion", value=True)
        preserve_accent = st.checkbox("Preserve Accent", value=True)
        preserve_pace = st.checkbox("Preserve Speaking Pace", value=True)
        with st.expander("🔧 Advanced Settings"):
            similarity_threshold = st.slider("Voice Similarity Threshold", 0.5, 1.0, 0.8)
            noise_reduction = st.checkbox("Apply Noise Reduction", value=True)
            auto_trim = st.checkbox("Auto-trim Silence", value=True)
            enhance_quality = st.checkbox("Enhance Audio Quality", value=True)
    # NOTE(review): quality_level, similarity_threshold, noise_reduction and
    # auto_trim are collected but never consumed by the cloning pipeline yet.
    return {
        "cloning_method": cloning_method,
        "quality_level": quality_level,
        "preserve_emotion": preserve_emotion,
        "preserve_accent": preserve_accent,
        "preserve_pace": preserve_pace,
        "similarity_threshold": similarity_threshold,
        "noise_reduction": noise_reduction,
        "auto_trim": auto_trim,
        "enhance_quality": enhance_quality,
    }


def _render_reference_section():
    """Render the reference-voice (target) column.

    Returns (audio_data, sample_rate), or (None, None) if nothing is loaded.
    """
    st.markdown("""
<div class="reference-box">
<h3>🎤 Reference Voice (Target)</h3>
<p>Upload or record the voice you want to clone</p>
</div>
""", unsafe_allow_html=True)
    reference_method = st.radio(
        "Reference Voice Input:",
        ["Upload Audio File", "Record Live", "Use Saved Profile"],
        horizontal=True
    )
    audio_data, sr = None, None
    if reference_method == "Upload Audio File":
        reference_file = st.file_uploader(
            "Upload Reference Voice:",
            type=['wav', 'mp3', 'flac', 'm4a'],
            help="Upload a clear audio sample of the target voice (10+ seconds recommended)"
        )
        if reference_file:
            file_path = save_uploaded_file(reference_file, "reference_voices")
            # sr=None keeps the file's native sample rate.
            audio_data, sr = librosa.load(file_path, sr=None)
            st.audio(reference_file, format='audio/wav')
            if st.button("🔍 Analyze Reference Voice"):
                with st.spinner("Analyzing voice characteristics..."):
                    analyzer = VoiceAnalyzer()
                    voice_features = analyzer.analyze_voice(audio_data, sr)
                    st.json(voice_features)
    elif reference_method == "Record Live":
        st.info("🎙️ Use the record button below to capture reference voice")
        # Live recording is not implemented; placeholder message only.
        st.warning("Live recording feature requires additional setup")
    elif reference_method == "Use Saved Profile":
        if st.session_state.voice_profiles:
            selected_profile = st.selectbox(
                "Select Voice Profile:",
                list(st.session_state.voice_profiles.keys())
            )
            if selected_profile:
                profile_data = st.session_state.voice_profiles[selected_profile]
                audio_data = profile_data['audio_data']
                sr = profile_data['sample_rate']
                st.success(f"✅ Loaded voice profile: {selected_profile}")
        else:
            st.info("No saved voice profiles available")
    return audio_data, sr


def _render_input_section():
    """Render the input-audio (source) column.

    Returns (audio_data, sample_rate), or (None, None) if nothing is loaded.
    """
    st.markdown("""
<div class="input-box">
<h3>📢 Input Audio (Source)</h3>
<p>Upload the audio you want to transform</p>
</div>
""", unsafe_allow_html=True)
    input_method = st.radio(
        "Input Audio Method:",
        ["Upload Audio File", "Record Live", "Text-to-Speech"],
        horizontal=True
    )
    audio_data, sr = None, None
    if input_method == "Upload Audio File":
        input_file = st.file_uploader(
            "Upload Input Audio:",
            type=['wav', 'mp3', 'flac', 'm4a'],
            help="Upload the audio you want to transform to the reference voice"
        )
        if input_file:
            file_path = save_uploaded_file(input_file, "temp")
            audio_data, sr = librosa.load(file_path, sr=None)
            st.audio(input_file, format='audio/wav')
    elif input_method == "Record Live":
        st.info("🎙️ Use the record button below to capture input audio")
        st.warning("Live recording feature requires additional setup")
    elif input_method == "Text-to-Speech":
        tts_text = st.text_area(
            "Enter text to convert:",
            height=150,
            placeholder="Type the text you want to speak in the cloned voice..."
        )
        if tts_text and st.button("🗣️ Generate TTS"):
            with st.spinner("Generating speech from text..."):
                # TTS synthesis is not implemented; no audio is produced here.
                st.success("TTS generated! Now clone the voice.")
    return audio_data, sr


def _render_cloning_section(reference_audio_data, reference_sr,
                            input_audio_data, input_sr, settings):
    """Render the cloning banner/button and run the pipeline when pressed.

    On success the result is stored in st.session_state.cloned_audio.
    """
    st.markdown("---")
    st.markdown("""
<div class="clone-box">
<h2>🎭 Voice Cloning Process</h2>
<p>Ready to clone the reference voice and apply it to your input audio!</p>
</div>
""", unsafe_allow_html=True)
    _, center, _ = st.columns([1, 2, 1])
    with center:
        if st.button("🚀 Start Voice Cloning", type="primary", use_container_width=True):
            try:
                with st.spinner("🎭 Cloning voice... This may take a few minutes"):
                    progress_bar = st.progress(0)
                    status_text = st.empty()
                    # Step 1: preprocess both clips.
                    status_text.text("📊 Preprocessing audio...")
                    progress_bar.progress(20)
                    processor = AudioProcessor()
                    ref_processed = processor.preprocess_audio(reference_audio_data, reference_sr)
                    input_processed = processor.preprocess_audio(input_audio_data, input_sr)
                    # Step 2/3: feature extraction happens inside clone_voice.
                    status_text.text("🔍 Extracting voice features...")
                    progress_bar.progress(40)
                    status_text.text("🎭 Performing voice cloning...")
                    progress_bar.progress(60)
                    cloned_audio = st.session_state.cloning_engine.clone_voice(
                        reference_audio=ref_processed,
                        input_audio=input_processed,
                        method=settings["cloning_method"],
                        preserve_emotion=settings["preserve_emotion"],
                        preserve_accent=settings["preserve_accent"],
                        preserve_pace=settings["preserve_pace"]
                    )
                    # Step 4: optional enhancement pass.
                    status_text.text("✨ Post-processing...")
                    progress_bar.progress(80)
                    if settings["enhance_quality"]:
                        cloned_audio = processor.enhance_audio(cloned_audio)
                    progress_bar.progress(100)
                    status_text.text("✅ Voice cloning completed!")
                    # Keep everything needed by the results section.
                    st.session_state.cloned_audio = {
                        'audio_data': cloned_audio,
                        'sample_rate': input_sr,
                        'original_input': input_audio_data,
                        'reference_voice': reference_audio_data
                    }
                    st.success("🎉 Voice cloning successful!")
            except Exception as e:
                # Surface any pipeline failure in the UI instead of crashing the app.
                st.error(f"❌ Error during voice cloning: {str(e)}")


def _render_results():
    """Show playback, visualizations and download/save options for the last clone."""
    st.markdown("---")
    st.markdown("""
<div class="result-box">
<h2>🎵 Cloning Results</h2>
<p>Your voice has been successfully cloned!</p>
</div>
""", unsafe_allow_html=True)
    cloned_data = st.session_state.cloned_audio
    # Three players: source, target voice, cloned output.
    st.subheader("🔊 Audio Comparison")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.markdown("**📢 Original Input:**")
        input_bytes = AudioProcessor.audio_to_bytes(cloned_data['original_input'], cloned_data['sample_rate'])
        st.audio(input_bytes, format='audio/wav')
    with col2:
        st.markdown("**🎤 Reference Voice:**")
        ref_bytes = AudioProcessor.audio_to_bytes(cloned_data['reference_voice'], cloned_data['sample_rate'])
        st.audio(ref_bytes, format='audio/wav')
    with col3:
        st.markdown("**🎭 Cloned Result:**")
        cloned_bytes = AudioProcessor.audio_to_bytes(cloned_data['audio_data'], cloned_data['sample_rate'])
        st.audio(cloned_bytes, format='audio/wav')
    st.subheader("📊 Audio Analysis")
    tab1, tab2, tab3 = st.tabs(["Waveform Comparison", "Spectrogram Analysis", "Voice Similarity"])
    with tab1:
        st.pyplot(create_audio_comparison(
            cloned_data['original_input'],
            cloned_data['audio_data'],
            cloned_data['sample_rate']
        ))
    with tab2:
        st.pyplot(create_spectrogram_comparison(
            cloned_data['original_input'],
            cloned_data['audio_data'],
            cloned_data['sample_rate']
        ))
    with tab3:
        analyzer = VoiceAnalyzer()
        similarity_score = analyzer.calculate_similarity(
            cloned_data['reference_voice'],
            cloned_data['audio_data'],
            cloned_data['sample_rate']
        )
        # Gauge scaled to 0-100 with an 80% delta reference and a 90% red line.
        fig_gauge = go.Figure(go.Indicator(
            mode="gauge+number+delta",
            value=similarity_score * 100,
            domain={'x': [0, 1], 'y': [0, 1]},
            title={'text': "Voice Similarity Score"},
            delta={'reference': 80},
            gauge={
                'axis': {'range': [None, 100]},
                'bar': {'color': "darkblue"},
                'steps': [
                    {'range': [0, 50], 'color': "lightgray"},
                    {'range': [50, 80], 'color': "gray"}
                ],
                'threshold': {
                    'line': {'color': "red", 'width': 4},
                    'thickness': 0.75,
                    'value': 90
                }
            }
        ))
        st.plotly_chart(fig_gauge, use_container_width=True)
    st.subheader("💾 Download Options")
    dcol1, dcol2, dcol3 = st.columns(3)
    with dcol1:
        st.download_button(
            label="⬇️ Download WAV",
            data=cloned_bytes,
            file_name=f"voice_cloned_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav",
            mime="audio/wav"
        )
    with dcol2:
        if st.button("⬇️ Download MP3"):
            # MP3 export is not implemented yet.
            st.info("MP3 conversion feature coming soon!")
    with dcol3:
        # Saving a profile stores the REFERENCE voice, not the cloned output.
        profile_name = st.text_input("Voice Profile Name:", placeholder="My Voice Clone")
        if st.button("💾 Save Profile") and profile_name:
            st.session_state.voice_profiles[profile_name] = {
                'audio_data': cloned_data['reference_voice'],
                'sample_rate': cloned_data['sample_rate'],
                'created': datetime.now().isoformat()
            }
            st.success(f"✅ Voice profile '{profile_name}' saved!")


def _render_profile_manager():
    """List saved voice profiles with playback and delete controls."""
    st.markdown("---")
    st.subheader("👤 Voice Profile Manager")
    # Iterate a snapshot: deleting a profile mutates the dict mid-loop otherwise.
    for profile_name, profile_data in list(st.session_state.voice_profiles.items()):
        col1, col2, col3 = st.columns([2, 1, 1])
        with col1:
            st.write(f"**{profile_name}**")
            st.caption(f"Created: {profile_data['created']}")
        with col2:
            audio_bytes = AudioProcessor.audio_to_bytes(
                profile_data['audio_data'],
                profile_data['sample_rate']
            )
            st.audio(audio_bytes, format='audio/wav')
        with col3:
            if st.button("🗑️ Delete", key=f"del_{profile_name}"):
                del st.session_state.voice_profiles[profile_name]
                st.rerun()


def _render_footer():
    """Render the static page footer with the responsible-use notice."""
    st.markdown("---")
    st.markdown(
        """
<div style="text-align: center; color: #666; padding: 2rem;">
🎭 <strong>AI Voice Clone Studio</strong> - Advanced Voice Cloning Technology<br>
Transform any voice into any other voice with state-of-the-art AI<br>
<small>⚠️ Use responsibly and with consent from voice owners</small>
</div>
""",
        unsafe_allow_html=True
    )


def main():
    """Render the full Voice Clone Studio page (header → sidebar → flow → footer)."""
    st.markdown('<div class="main-header">🎭 AI Voice Clone Studio</div>', unsafe_allow_html=True)
    st.markdown("### Transform any voice into any other voice with advanced AI")
    # Lazily create the (process-cached) cloning engine on first run.
    if st.session_state.cloning_engine is None:
        with st.spinner("🚀 Loading Voice Cloning Engine..."):
            st.session_state.cloning_engine = load_cloning_engine()
    settings = _render_sidebar()
    col1, col2 = st.columns([1, 1])
    with col1:
        reference_audio_data, reference_sr = _render_reference_section()
    with col2:
        input_audio_data, input_sr = _render_input_section()
    # The cloning controls appear only once both clips are available.
    if reference_audio_data is not None and input_audio_data is not None:
        _render_cloning_section(reference_audio_data, reference_sr,
                                input_audio_data, input_sr, settings)
    if st.session_state.cloned_audio:
        _render_results()
    if st.session_state.voice_profiles:
        _render_profile_manager()
    _render_footer()
if __name__ == "__main__":
    # Entry point when executed as a script (Streamlit runs the module top-level).
    main()