# voiceclone-dev / app.py
# Update app.py — commit 2c8d218 (verified), 19.7 kB
import streamlit as st
import torch
import torchaudio
import numpy as np
import librosa
import soundfile as sf
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from scipy.signal import butter, filtfilt
import tempfile
import os
import io
import base64
from datetime import datetime
import requests
import zipfile
from pathlib import Path
import pickle
import json
# Import voice cloning modules
from voice_cloning_engine import VoiceCloningEngine
from audio_processor import AudioProcessor
from voice_analyzer import VoiceAnalyzer
# Streamlit page chrome: wide layout, sidebar open on load.
st.set_page_config(
    page_title="AI Voice Clone Studio",
    page_icon="🎭",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Custom CSS injected into the page: a gradient-text main header plus the
# colored "card" boxes (.clone-box / .reference-box / .input-box / .result-box)
# used by the sections rendered further down (requires unsafe_allow_html).
st.markdown("""
<style>
.main-header {
font-size: 3rem;
font-weight: bold;
text-align: center;
margin-bottom: 2rem;
background: linear-gradient(90deg, #ff6b6b, #4ecdc4, #45b7d1);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
}
.clone-box {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 2rem;
border-radius: 15px;
color: white;
margin: 1rem 0;
}
.reference-box {
background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
padding: 1.5rem;
border-radius: 10px;
color: white;
margin: 1rem 0;
}
.input-box {
background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
padding: 1.5rem;
border-radius: 10px;
color: white;
margin: 1rem 0;
}
.result-box {
background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
padding: 1.5rem;
border-radius: 10px;
color: white;
margin: 1rem 0;
}
.stAudio {
margin: 1rem 0;
}
</style>
""", unsafe_allow_html=True)
# Initialize the session-state keys the page reads before any user action.
# (Original paste had the `if` bodies at column 0, which does not parse.)
_SESSION_DEFAULTS = {
    'cloning_engine': None,    # replaced lazily by load_cloning_engine() in main()
    'reference_voice': None,
    'cloned_audio': None,      # dict with audio_data/sample_rate/... after a clone
    'voice_profiles': {},      # profile name -> {'audio_data', 'sample_rate', 'created'}
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
@st.cache_resource
def load_cloning_engine():
    """Create the VoiceCloningEngine once per server process.

    `st.cache_resource` makes the engine a shared singleton across reruns
    and sessions. (Indentation restored — the pasted body was at column 0.)
    """
    return VoiceCloningEngine()
def save_uploaded_file(uploaded_file, directory="temp"):
    """Persist a Streamlit UploadedFile to *directory* and return its path.

    Args:
        uploaded_file: object exposing `.name` and `.getbuffer()` (Streamlit
            UploadedFile), or None when nothing was uploaded.
        directory: destination directory, created if missing.

    Returns:
        The written file's path, or None when *uploaded_file* is None.
    """
    if uploaded_file is None:
        return None
    os.makedirs(directory, exist_ok=True)
    # Untrusted, client-supplied filename: keep only the basename so a
    # crafted name like "../../x" cannot escape *directory*.
    file_path = os.path.join(directory, os.path.basename(uploaded_file.name))
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path
def create_audio_comparison(original_audio, cloned_audio, sample_rate):
    """Plot the original and cloned waveforms as two stacked axes.

    Args:
        original_audio: 1-D sample array of the source clip.
        cloned_audio: 1-D sample array of the cloned clip.
        sample_rate: sample rate in Hz (a single value is used for both
            clips — assumes they share one rate; TODO confirm at call sites).

    Returns:
        The matplotlib Figure containing both waveform plots.
    """
    fig, (ax_orig, ax_clone) = plt.subplots(2, 1, figsize=(12, 8))
    panels = (
        (ax_orig, original_audio, 'Original Audio', 'blue'),
        (ax_clone, cloned_audio, 'Voice Cloned Audio', 'red'),
    )
    for ax, audio, title, color in panels:
        # x-axis in seconds, one point per sample.
        t = np.linspace(0, len(audio) / sample_rate, len(audio))
        ax.plot(t, audio, color=color, alpha=0.7)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Time (seconds)')
        ax.set_ylabel('Amplitude')
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
    return fig
def create_spectrogram_comparison(original_audio, cloned_audio, sample_rate):
    """Plot dB-scaled STFT spectrograms of original vs. cloned audio side by side.

    Returns the matplotlib Figure with the two spectrogram axes.
    """
    # `librosa.display` is a submodule; `import librosa` alone is not
    # guaranteed to load it (older librosa raises AttributeError otherwise),
    # so import it explicitly before calling specshow.
    import librosa.display
    fig, (ax_orig, ax_clone) = plt.subplots(1, 2, figsize=(15, 6))
    panels = (
        (ax_orig, original_audio, 'Original Audio Spectrogram'),
        (ax_clone, cloned_audio, 'Voice Cloned Audio Spectrogram'),
    )
    for ax, audio, title in panels:
        # Magnitude STFT converted to dB relative to the peak.
        db = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)
        librosa.display.specshow(db, sr=sample_rate, x_axis='time',
                                 y_axis='hz', ax=ax, cmap='viridis')
        ax.set_title(title)
    plt.tight_layout()
    return fig
def _render_sidebar():
    """Render the sidebar controls; return all chosen settings as a dict."""
    with st.sidebar:
        st.header("⚙️ Voice Cloning Settings")
        cloning_method = st.selectbox(
            "Cloning Method:",
            ["OpenVoice", "Real-Time VC", "SV2TTS", "Neural Voice Puppetry"],
            help="Choose the voice cloning algorithm"
        )
        st.subheader("🎛️ Quality Settings")
        quality_level = st.select_slider(
            "Quality Level:",
            options=["Fast", "Balanced", "High Quality"],
            value="Balanced"
        )
        preserve_emotion = st.checkbox("Preserve Emotion", value=True)
        preserve_accent = st.checkbox("Preserve Accent", value=True)
        preserve_pace = st.checkbox("Preserve Speaking Pace", value=True)
        with st.expander("🔧 Advanced Settings"):
            similarity_threshold = st.slider("Voice Similarity Threshold", 0.5, 1.0, 0.8)
            noise_reduction = st.checkbox("Apply Noise Reduction", value=True)
            auto_trim = st.checkbox("Auto-trim Silence", value=True)
            enhance_quality = st.checkbox("Enhance Audio Quality", value=True)
    # NOTE(review): quality_level, similarity_threshold, noise_reduction and
    # auto_trim are collected but never consumed by the cloning pipeline yet.
    return {
        "cloning_method": cloning_method,
        "quality_level": quality_level,
        "preserve_emotion": preserve_emotion,
        "preserve_accent": preserve_accent,
        "preserve_pace": preserve_pace,
        "similarity_threshold": similarity_threshold,
        "noise_reduction": noise_reduction,
        "auto_trim": auto_trim,
        "enhance_quality": enhance_quality,
    }


def _render_reference_section():
    """Render the reference-voice (target) column.

    Returns (audio_data, sample_rate), or (None, None) if nothing is loaded.
    """
    st.markdown("""
<div class="reference-box">
<h3>🎤 Reference Voice (Target)</h3>
<p>Upload or record the voice you want to clone</p>
</div>
""", unsafe_allow_html=True)
    reference_method = st.radio(
        "Reference Voice Input:",
        ["Upload Audio File", "Record Live", "Use Saved Profile"],
        horizontal=True
    )
    audio_data, sr = None, None
    if reference_method == "Upload Audio File":
        reference_file = st.file_uploader(
            "Upload Reference Voice:",
            type=['wav', 'mp3', 'flac', 'm4a'],
            help="Upload a clear audio sample of the target voice (10+ seconds recommended)"
        )
        if reference_file:
            file_path = save_uploaded_file(reference_file, "reference_voices")
            # sr=None keeps the file's native sample rate.
            audio_data, sr = librosa.load(file_path, sr=None)
            st.audio(reference_file, format='audio/wav')
            if st.button("🔍 Analyze Reference Voice"):
                with st.spinner("Analyzing voice characteristics..."):
                    analyzer = VoiceAnalyzer()
                    voice_features = analyzer.analyze_voice(audio_data, sr)
                    st.json(voice_features)
    elif reference_method == "Record Live":
        st.info("🎙️ Use the record button below to capture reference voice")
        # Live recording is not implemented; placeholder message only.
        st.warning("Live recording feature requires additional setup")
    elif reference_method == "Use Saved Profile":
        if st.session_state.voice_profiles:
            selected_profile = st.selectbox(
                "Select Voice Profile:",
                list(st.session_state.voice_profiles.keys())
            )
            if selected_profile:
                profile_data = st.session_state.voice_profiles[selected_profile]
                audio_data = profile_data['audio_data']
                sr = profile_data['sample_rate']
                st.success(f"✅ Loaded voice profile: {selected_profile}")
        else:
            st.info("No saved voice profiles available")
    return audio_data, sr


def _render_input_section():
    """Render the input-audio (source) column.

    Returns (audio_data, sample_rate), or (None, None) if nothing is loaded.
    """
    st.markdown("""
<div class="input-box">
<h3>📢 Input Audio (Source)</h3>
<p>Upload the audio you want to transform</p>
</div>
""", unsafe_allow_html=True)
    input_method = st.radio(
        "Input Audio Method:",
        ["Upload Audio File", "Record Live", "Text-to-Speech"],
        horizontal=True
    )
    audio_data, sr = None, None
    if input_method == "Upload Audio File":
        input_file = st.file_uploader(
            "Upload Input Audio:",
            type=['wav', 'mp3', 'flac', 'm4a'],
            help="Upload the audio you want to transform to the reference voice"
        )
        if input_file:
            file_path = save_uploaded_file(input_file, "temp")
            audio_data, sr = librosa.load(file_path, sr=None)
            st.audio(input_file, format='audio/wav')
    elif input_method == "Record Live":
        st.info("🎙️ Use the record button below to capture input audio")
        st.warning("Live recording feature requires additional setup")
    elif input_method == "Text-to-Speech":
        tts_text = st.text_area(
            "Enter text to convert:",
            height=150,
            placeholder="Type the text you want to speak in the cloned voice..."
        )
        if tts_text and st.button("🗣️ Generate TTS"):
            with st.spinner("Generating speech from text..."):
                # TTS synthesis is not implemented; no audio is produced here.
                st.success("TTS generated! Now clone the voice.")
    return audio_data, sr


def _render_cloning_section(reference_audio_data, reference_sr,
                            input_audio_data, input_sr, settings):
    """Render the cloning banner/button and run the pipeline when pressed.

    On success the result is stored in st.session_state.cloned_audio.
    """
    st.markdown("---")
    st.markdown("""
<div class="clone-box">
<h2>🎭 Voice Cloning Process</h2>
<p>Ready to clone the reference voice and apply it to your input audio!</p>
</div>
""", unsafe_allow_html=True)
    _, center, _ = st.columns([1, 2, 1])
    with center:
        if st.button("🚀 Start Voice Cloning", type="primary", use_container_width=True):
            try:
                with st.spinner("🎭 Cloning voice... This may take a few minutes"):
                    progress_bar = st.progress(0)
                    status_text = st.empty()
                    # Step 1: preprocess both clips.
                    status_text.text("📊 Preprocessing audio...")
                    progress_bar.progress(20)
                    processor = AudioProcessor()
                    ref_processed = processor.preprocess_audio(reference_audio_data, reference_sr)
                    input_processed = processor.preprocess_audio(input_audio_data, input_sr)
                    # Step 2/3: feature extraction happens inside clone_voice.
                    status_text.text("🔍 Extracting voice features...")
                    progress_bar.progress(40)
                    status_text.text("🎭 Performing voice cloning...")
                    progress_bar.progress(60)
                    cloned_audio = st.session_state.cloning_engine.clone_voice(
                        reference_audio=ref_processed,
                        input_audio=input_processed,
                        method=settings["cloning_method"],
                        preserve_emotion=settings["preserve_emotion"],
                        preserve_accent=settings["preserve_accent"],
                        preserve_pace=settings["preserve_pace"]
                    )
                    # Step 4: optional enhancement pass.
                    status_text.text("✨ Post-processing...")
                    progress_bar.progress(80)
                    if settings["enhance_quality"]:
                        cloned_audio = processor.enhance_audio(cloned_audio)
                    progress_bar.progress(100)
                    status_text.text("✅ Voice cloning completed!")
                    # Keep everything needed by the results section.
                    st.session_state.cloned_audio = {
                        'audio_data': cloned_audio,
                        'sample_rate': input_sr,
                        'original_input': input_audio_data,
                        'reference_voice': reference_audio_data
                    }
                    st.success("🎉 Voice cloning successful!")
            except Exception as e:
                # Surface any pipeline failure in the UI instead of crashing the app.
                st.error(f"❌ Error during voice cloning: {str(e)}")


def _render_results():
    """Show playback, visualizations and download/save options for the last clone."""
    st.markdown("---")
    st.markdown("""
<div class="result-box">
<h2>🎵 Cloning Results</h2>
<p>Your voice has been successfully cloned!</p>
</div>
""", unsafe_allow_html=True)
    cloned_data = st.session_state.cloned_audio
    # Three players: source, target voice, cloned output.
    st.subheader("🔊 Audio Comparison")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.markdown("**📢 Original Input:**")
        input_bytes = AudioProcessor.audio_to_bytes(cloned_data['original_input'], cloned_data['sample_rate'])
        st.audio(input_bytes, format='audio/wav')
    with col2:
        st.markdown("**🎤 Reference Voice:**")
        ref_bytes = AudioProcessor.audio_to_bytes(cloned_data['reference_voice'], cloned_data['sample_rate'])
        st.audio(ref_bytes, format='audio/wav')
    with col3:
        st.markdown("**🎭 Cloned Result:**")
        cloned_bytes = AudioProcessor.audio_to_bytes(cloned_data['audio_data'], cloned_data['sample_rate'])
        st.audio(cloned_bytes, format='audio/wav')
    st.subheader("📊 Audio Analysis")
    tab1, tab2, tab3 = st.tabs(["Waveform Comparison", "Spectrogram Analysis", "Voice Similarity"])
    with tab1:
        st.pyplot(create_audio_comparison(
            cloned_data['original_input'],
            cloned_data['audio_data'],
            cloned_data['sample_rate']
        ))
    with tab2:
        st.pyplot(create_spectrogram_comparison(
            cloned_data['original_input'],
            cloned_data['audio_data'],
            cloned_data['sample_rate']
        ))
    with tab3:
        analyzer = VoiceAnalyzer()
        similarity_score = analyzer.calculate_similarity(
            cloned_data['reference_voice'],
            cloned_data['audio_data'],
            cloned_data['sample_rate']
        )
        # Gauge scaled to 0-100 with an 80% delta reference and a 90% red line.
        fig_gauge = go.Figure(go.Indicator(
            mode="gauge+number+delta",
            value=similarity_score * 100,
            domain={'x': [0, 1], 'y': [0, 1]},
            title={'text': "Voice Similarity Score"},
            delta={'reference': 80},
            gauge={
                'axis': {'range': [None, 100]},
                'bar': {'color': "darkblue"},
                'steps': [
                    {'range': [0, 50], 'color': "lightgray"},
                    {'range': [50, 80], 'color': "gray"}
                ],
                'threshold': {
                    'line': {'color': "red", 'width': 4},
                    'thickness': 0.75,
                    'value': 90
                }
            }
        ))
        st.plotly_chart(fig_gauge, use_container_width=True)
    st.subheader("💾 Download Options")
    dcol1, dcol2, dcol3 = st.columns(3)
    with dcol1:
        st.download_button(
            label="⬇️ Download WAV",
            data=cloned_bytes,
            file_name=f"voice_cloned_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav",
            mime="audio/wav"
        )
    with dcol2:
        if st.button("⬇️ Download MP3"):
            # MP3 export is not implemented yet.
            st.info("MP3 conversion feature coming soon!")
    with dcol3:
        # Saving a profile stores the REFERENCE voice, not the cloned output.
        profile_name = st.text_input("Voice Profile Name:", placeholder="My Voice Clone")
        if st.button("💾 Save Profile") and profile_name:
            st.session_state.voice_profiles[profile_name] = {
                'audio_data': cloned_data['reference_voice'],
                'sample_rate': cloned_data['sample_rate'],
                'created': datetime.now().isoformat()
            }
            st.success(f"✅ Voice profile '{profile_name}' saved!")


def _render_profile_manager():
    """List saved voice profiles with playback and delete controls."""
    st.markdown("---")
    st.subheader("👤 Voice Profile Manager")
    # Iterate a snapshot: deleting a profile mutates the dict mid-loop otherwise.
    for profile_name, profile_data in list(st.session_state.voice_profiles.items()):
        col1, col2, col3 = st.columns([2, 1, 1])
        with col1:
            st.write(f"**{profile_name}**")
            st.caption(f"Created: {profile_data['created']}")
        with col2:
            audio_bytes = AudioProcessor.audio_to_bytes(
                profile_data['audio_data'],
                profile_data['sample_rate']
            )
            st.audio(audio_bytes, format='audio/wav')
        with col3:
            if st.button("🗑️ Delete", key=f"del_{profile_name}"):
                del st.session_state.voice_profiles[profile_name]
                st.rerun()


def _render_footer():
    """Render the static page footer with the responsible-use notice."""
    st.markdown("---")
    st.markdown(
        """
<div style="text-align: center; color: #666; padding: 2rem;">
🎭 <strong>AI Voice Clone Studio</strong> - Advanced Voice Cloning Technology<br>
Transform any voice into any other voice with state-of-the-art AI<br>
<small>⚠️ Use responsibly and with consent from voice owners</small>
</div>
""",
        unsafe_allow_html=True
    )


def main():
    """Render the full Voice Clone Studio page (header → sidebar → flow → footer)."""
    st.markdown('<div class="main-header">🎭 AI Voice Clone Studio</div>', unsafe_allow_html=True)
    st.markdown("### Transform any voice into any other voice with advanced AI")
    # Lazily create the (process-cached) cloning engine on first run.
    if st.session_state.cloning_engine is None:
        with st.spinner("🚀 Loading Voice Cloning Engine..."):
            st.session_state.cloning_engine = load_cloning_engine()
    settings = _render_sidebar()
    col1, col2 = st.columns([1, 1])
    with col1:
        reference_audio_data, reference_sr = _render_reference_section()
    with col2:
        input_audio_data, input_sr = _render_input_section()
    # The cloning controls appear only once both clips are available.
    if reference_audio_data is not None and input_audio_data is not None:
        _render_cloning_section(reference_audio_data, reference_sr,
                                input_audio_data, input_sr, settings)
    if st.session_state.cloned_audio:
        _render_results()
    if st.session_state.voice_profiles:
        _render_profile_manager()
    _render_footer()
if __name__ == "__main__":
    # Entry point when executed as a script (Streamlit runs the module top-level).
    main()