Spaces:

crackuser
/

voiceclone-dev

Sleeping

App Files Files Community

crackuser committed on Sep 10, 2025

Commit

f758d08

verified ·

1 Parent(s): 8581048

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -500

app.py CHANGED Viewed

@@ -1,515 +1,65 @@
-import streamlit as st
 import torch
 import torchaudio
 import numpy as np
-import librosa
-import soundfile as sf
-import matplotlib.pyplot as plt
-import plotly.graph_objects as go
-import plotly.express as px
-from scipy.signal import butter, filtfilt
 import tempfile
 import os
-import io
-import base64
-from datetime import datetime
-import requests
-import zipfile
-from pathlib import Path
-import pickle
-import json
-# Import voice cloning modules
-from voice_cloning_engine import VoiceCloningEngine
-from audio_processor import AudioProcessor
-from voice_analyzer import VoiceAnalyzer
-# Page configuration
-st.set_page_config(
-    page_title="AI Voice Clone Studio",
-    page_icon="🎭",
-    layout="wide",
-    initial_sidebar_state="expanded"
-)
-# Custom CSS
-st.markdown("""
-<style>
-    .main-header {
-        font-size: 3rem;
-        font-weight: bold;
-        text-align: center;
-        margin-bottom: 2rem;
-        background: linear-gradient(90deg, #ff6b6b, #4ecdc4, #45b7d1);
-        -webkit-background-clip: text;
-        -webkit-text-fill-color: transparent;
-        background-clip: text;
-    }
-    .clone-box {
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-        padding: 2rem;
-        border-radius: 15px;
-        color: white;
-        margin: 1rem 0;
-    }
-    .reference-box {
-        background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
-        padding: 1.5rem;
-        border-radius: 10px;
-        color: white;
-        margin: 1rem 0;
-    }
-    .input-box {
-        background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
-        padding: 1.5rem;
-        border-radius: 10px;
-        color: white;
-        margin: 1rem 0;
-    }
-    .result-box {
-        background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
-        padding: 1.5rem;
-        border-radius: 10px;
-        color: white;
-        margin: 1rem 0;
-    }
-    .stAudio {
-        margin: 1rem 0;
-    }
-</style>
-""", unsafe_allow_html=True)
-# Initialize session state
-if 'cloning_engine' not in st.session_state:
-    st.session_state.cloning_engine = None
-if 'reference_voice' not in st.session_state:
-    st.session_state.reference_voice = None
-if 'cloned_audio' not in st.session_state:
-    st.session_state.cloned_audio = None
-if 'voice_profiles' not in st.session_state:
-    st.session_state.voice_profiles = {}
-@st.cache_resource
-def load_cloning_engine():
-    """Initialize the voice cloning engine"""
-    return VoiceCloningEngine()
-def save_uploaded_file(uploaded_file, directory="temp"):
-    """Save uploaded file to directory"""
-    if uploaded_file is not None:
-        os.makedirs(directory, exist_ok=True)
-        file_path = os.path.join(directory, uploaded_file.name)
-        with open(file_path, "wb") as f:
-            f.write(uploaded_file.getbuffer())
-        return file_path
-    return None
-def create_audio_comparison(original_audio, cloned_audio, sample_rate):
-    """Create side-by-side audio comparison"""
-    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
-    # Original audio
-    time_original = np.linspace(0, len(original_audio) / sample_rate, len(original_audio))
-    ax1.plot(time_original, original_audio, color='blue', alpha=0.7)
-    ax1.set_title('Original Audio', fontsize=14, fontweight='bold')
-    ax1.set_xlabel('Time (seconds)')
-    ax1.set_ylabel('Amplitude')
-    ax1.grid(True, alpha=0.3)
-    # Cloned audio
-    time_cloned = np.linspace(0, len(cloned_audio) / sample_rate, len(cloned_audio))
-    ax2.plot(time_cloned, cloned_audio, color='red', alpha=0.7)
-    ax2.set_title('Voice Cloned Audio', fontsize=14, fontweight='bold')
-    ax2.set_xlabel('Time (seconds)')
-    ax2.set_ylabel('Amplitude')
-    ax2.grid(True, alpha=0.3)
-    plt.tight_layout()
-    return fig
-def create_spectrogram_comparison(original_audio, cloned_audio, sample_rate):
-    """Create spectrogram comparison"""
-    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
-    # Original spectrogram
-    D1 = librosa.amplitude_to_db(np.abs(librosa.stft(original_audio)), ref=np.max)
-    librosa.display.specshow(D1, sr=sample_rate, x_axis='time', y_axis='hz', ax=ax1, cmap='viridis')
-    ax1.set_title('Original Audio Spectrogram')
-    # Cloned spectrogram
-    D2 = librosa.amplitude_to_db(np.abs(librosa.stft(cloned_audio)), ref=np.max)
-    librosa.display.specshow(D2, sr=sample_rate, x_axis='time', y_axis='hz', ax=ax2, cmap='viridis')
-    ax2.set_title('Voice Cloned Audio Spectrogram')
-    plt.tight_layout()
-    return fig
-def main():
-    # Header
-    st.markdown('<div class="main-header">🎭 AI Voice Clone Studio</div>', unsafe_allow_html=True)
-    st.markdown("### Transform any voice into any other voice with advanced AI")
-    # Initialize cloning engine
-    if st.session_state.cloning_engine is None:
-        with st.spinner("🚀 Loading Voice Cloning Engine..."):
-            st.session_state.cloning_engine = load_cloning_engine()
-    # Sidebar Configuration
-    with st.sidebar:
-        st.header("⚙️ Voice Cloning Settings")
-        # Model Selection
-        cloning_method = st.selectbox(
-            "Cloning Method:",
-            ["OpenVoice", "Real-Time VC", "SV2TTS", "Neural Voice Puppetry"],
-            help="Choose the voice cloning algorithm"
-        )
-        # Quality Settings
-        st.subheader("🎛️ Quality Settings")
-        quality_level = st.select_slider(
-            "Quality Level:",
-            options=["Fast", "Balanced", "High Quality"],
-            value="Balanced"
-        )
-        preserve_emotion = st.checkbox("Preserve Emotion", value=True)
-        preserve_accent = st.checkbox("Preserve Accent", value=True)
-        preserve_pace = st.checkbox("Preserve Speaking Pace", value=True)
-        # Advanced Settings
-        with st.expander("🔧 Advanced Settings"):
-            similarity_threshold = st.slider("Voice Similarity Threshold", 0.5, 1.0, 0.8)
-            noise_reduction = st.checkbox("Apply Noise Reduction", value=True)
-            auto_trim = st.checkbox("Auto-trim Silence", value=True)
-            enhance_quality = st.checkbox("Enhance Audio Quality", value=True)
-    # Main Interface
-    col1, col2 = st.columns([1, 1])
-    # Reference Voice Section
-    with col1:
-        st.markdown("""
-        <div class="reference-box">
-            <h3>🎤 Reference Voice (Target)</h3>
-            <p>Upload or record the voice you want to clone</p>
-        </div>
-        """, unsafe_allow_html=True)
-        reference_method = st.radio(
-            "Reference Voice Input:",
-            ["Upload Audio File", "Record Live", "Use Saved Profile"],
-            horizontal=True
-        )
-        reference_audio_data = None
-        reference_sr = None
-        if reference_method == "Upload Audio File":
-            reference_file = st.file_uploader(
-                "Upload Reference Voice:",
-                type=['wav', 'mp3', 'flac', 'm4a'],
-                help="Upload a clear audio sample of the target voice (10+ seconds recommended)"
             )
-            if reference_file:
-                file_path = save_uploaded_file(reference_file, "reference_voices")
-                reference_audio_data, reference_sr = librosa.load(file_path, sr=None)
-                st.audio(reference_file, format='audio/wav')
-                # Voice Analysis
-                if st.button("🔍 Analyze Reference Voice"):
-                    with st.spinner("Analyzing voice characteristics..."):
-                        analyzer = VoiceAnalyzer()
-                        voice_features = analyzer.analyze_voice(reference_audio_data, reference_sr)
-                        st.json(voice_features)
-        elif reference_method == "Record Live":
-            st.info("🎙️ Use the record button below to capture reference voice")
-            # Audio recorder component would go here
-            # For now, showing placeholder
-            st.warning("Live recording feature requires additional setup")
-        elif reference_method == "Use Saved Profile":
-            if st.session_state.voice_profiles:
-                selected_profile = st.selectbox(
-                    "Select Voice Profile:",
-                    list(st.session_state.voice_profiles.keys())
-                )
-                if selected_profile:
-                    profile_data = st.session_state.voice_profiles[selected_profile]
-                    reference_audio_data = profile_data['audio_data']
-                    reference_sr = profile_data['sample_rate']
-                    st.success(f"✅ Loaded voice profile: {selected_profile}")
-            else:
-                st.info("No saved voice profiles available")
-    # Input Audio Section
-    with col2:
-        st.markdown("""
-        <div class="input-box">
-            <h3>📢 Input Audio (Source)</h3>
-            <p>Upload the audio you want to transform</p>
-        </div>
-        """, unsafe_allow_html=True)
-        input_method = st.radio(
-            "Input Audio Method:",
-            ["Upload Audio File", "Record Live", "Text-to-Speech"],
-            horizontal=True
-        )
-        input_audio_data = None
-        input_sr = None
-        if input_method == "Upload Audio File":
-            input_file = st.file_uploader(
-                "Upload Input Audio:",
-                type=['wav', 'mp3', 'flac', 'm4a'],
-                help="Upload the audio you want to transform to the reference voice"
-            )
-            if input_file:
-                file_path = save_uploaded_file(input_file, "temp")
-                input_audio_data, input_sr = librosa.load(file_path, sr=None)
-                st.audio(input_file, format='audio/wav')
-        elif input_method == "Record Live":
-            st.info("🎙️ Use the record button below to capture input audio")
-            st.warning("Live recording feature requires additional setup")
-        elif input_method == "Text-to-Speech":
-            tts_text = st.text_area(
-                "Enter text to convert:",
-                height=150,
-                placeholder="Type the text you want to speak in the cloned voice..."
             )
-            if tts_text and st.button("🗣️ Generate TTS"):
-                with st.spinner("Generating speech from text..."):
-                    # Generate TTS audio (placeholder)
-                    st.success("TTS generated! Now clone the voice.")
-    # Voice Cloning Process
-    if reference_audio_data is not None and input_audio_data is not None:
-        st.markdown("---")
-        st.markdown("""
-        <div class="clone-box">
-            <h2>🎭 Voice Cloning Process</h2>
-            <p>Ready to clone the reference voice and apply it to your input audio!</p>
-        </div>
-        """, unsafe_allow_html=True)
-        col1, col2, col3 = st.columns([1, 2, 1])
-        with col2:
-            if st.button("🚀 Start Voice Cloning", type="primary", use_container_width=True):
-                try:
-                    with st.spinner("🎭 Cloning voice... This may take a few minutes"):
-                        progress_bar = st.progress(0)
-                        status_text = st.empty()
-                        # Step 1: Preprocess audio
-                        status_text.text("📊 Preprocessing audio...")
-                        progress_bar.progress(20)
-                        processor = AudioProcessor()
-                        ref_processed = processor.preprocess_audio(reference_audio_data, reference_sr)
-                        input_processed = processor.preprocess_audio(input_audio_data, input_sr)
-                        # Step 2: Extract voice features
-                        status_text.text("🔍 Extracting voice features...")
-                        progress_bar.progress(40)
-                        # Step 3: Voice cloning
-                        status_text.text("🎭 Performing voice cloning...")
-                        progress_bar.progress(60)
-                        cloned_audio = st.session_state.cloning_engine.clone_voice(
-                            reference_audio=ref_processed,
-                            input_audio=input_processed,
-                            method=cloning_method,
-                            preserve_emotion=preserve_emotion,
-                            preserve_accent=preserve_accent,
-                            preserve_pace=preserve_pace
-                        )
-                        # Step 4: Post-processing
-                        status_text.text("✨ Post-processing...")
-                        progress_bar.progress(80)
-                        if enhance_quality:
-                            cloned_audio = processor.enhance_audio(cloned_audio)
-                        progress_bar.progress(100)
-                        status_text.text("✅ Voice cloning completed!")
-                        # Store result
-                        st.session_state.cloned_audio = {
-                            'audio_data': cloned_audio,
-                            'sample_rate': input_sr,
-                            'original_input': input_audio_data,
-                            'reference_voice': reference_audio_data
-                        }
-                        st.success("🎉 Voice cloning successful!")
-                except Exception as e:
-                    st.error(f"❌ Error during voice cloning: {str(e)}")
-    # Results Section
-    if st.session_state.cloned_audio:
-        st.markdown("---")
-        st.markdown("""
-        <div class="result-box">
-            <h2>🎵 Cloning Results</h2>
-            <p>Your voice has been successfully cloned!</p>
-        </div>
-        """, unsafe_allow_html=True)
-        cloned_data = st.session_state.cloned_audio
-        # Audio Players
-        st.subheader("🔊 Audio Comparison")
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            st.markdown("**📢 Original Input:**")
-            input_bytes = AudioProcessor.audio_to_bytes(cloned_data['original_input'], cloned_data['sample_rate'])
-            st.audio(input_bytes, format='audio/wav')
-        with col2:
-            st.markdown("**🎤 Reference Voice:**")
-            ref_bytes = AudioProcessor.audio_to_bytes(cloned_data['reference_voice'], cloned_data['sample_rate'])
-            st.audio(ref_bytes, format='audio/wav')
-        with col3:
-            st.markdown("**🎭 Cloned Result:**")
-            cloned_bytes = AudioProcessor.audio_to_bytes(cloned_data['audio_data'], cloned_data['sample_rate'])
-            st.audio(cloned_bytes, format='audio/wav')
-        # Visualizations
-        st.subheader("📊 Audio Analysis")
-        tab1, tab2, tab3 = st.tabs(["Waveform Comparison", "Spectrogram Analysis", "Voice Similarity"])
-        with tab1:
-            fig_wave = create_audio_comparison(
-                cloned_data['original_input'],
-                cloned_data['audio_data'],
-                cloned_data['sample_rate']
-            )
-            st.pyplot(fig_wave)
-        with tab2:
-            fig_spec = create_spectrogram_comparison(
-                cloned_data['original_input'],
-                cloned_data['audio_data'],
-                cloned_data['sample_rate']
-            )
-            st.pyplot(fig_spec)
-        with tab3:
-            # Voice similarity metrics
-            analyzer = VoiceAnalyzer()
-            similarity_score = analyzer.calculate_similarity(
-                cloned_data['reference_voice'],
-                cloned_data['audio_data'],
-                cloned_data['sample_rate']
-            )
-            # Create similarity gauge
-            fig_gauge = go.Figure(go.Indicator(
-                mode = "gauge+number+delta",
-                value = similarity_score * 100,
-                domain = {'x': [0, 1], 'y': [0, 1]},
-                title = {'text': "Voice Similarity Score"},
-                delta = {'reference': 80},
-                gauge = {
-                    'axis': {'range': [None, 100]},
-                    'bar': {'color': "darkblue"},
-                    'steps': [
-                        {'range': [0, 50], 'color': "lightgray"},
-                        {'range': [50, 80], 'color': "gray"}
-                    ],
-                    'threshold': {
-                        'line': {'color': "red", 'width': 4},
-                        'thickness': 0.75,
-                        'value': 90
-                    }
-                }
-            ))
-            st.plotly_chart(fig_gauge, use_container_width=True)
-        # Download Options
-        st.subheader("💾 Download Options")
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            st.download_button(
-                label="⬇️ Download WAV",
-                data=cloned_bytes,
-                file_name=f"voice_cloned_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav",
-                mime="audio/wav"
-            )
-        with col2:
-            # Convert to MP3 and download
-            if st.button("⬇️ Download MP3"):
-                st.info("MP3 conversion feature coming soon!")
-        with col3:
-            # Save as voice profile
-            profile_name = st.text_input("Voice Profile Name:", placeholder="My Voice Clone")
-            if st.button("💾 Save Profile") and profile_name:
-                st.session_state.voice_profiles[profile_name] = {
-                    'audio_data': cloned_data['reference_voice'],
-                    'sample_rate': cloned_data['sample_rate'],
-                    'created': datetime.now().isoformat()
-                }
-                st.success(f"✅ Voice profile '{profile_name}' saved!")
-    # Voice Profile Manager
-    if st.session_state.voice_profiles:
-        st.markdown("---")
-        st.subheader("👤 Voice Profile Manager")
-        for profile_name, profile_data in st.session_state.voice_profiles.items():
-            col1, col2, col3 = st.columns([2, 1, 1])
-            with col1:
-                st.write(f"**{profile_name}**")
-                st.caption(f"Created: {profile_data['created']}")
-            with col2:
-                audio_bytes = AudioProcessor.audio_to_bytes(
-                    profile_data['audio_data'],
-                    profile_data['sample_rate']
-                )
-                st.audio(audio_bytes, format='audio/wav')
-            with col3:
-                if st.button(f"🗑️ Delete", key=f"del_{profile_name}"):
-                    del st.session_state.voice_profiles[profile_name]
-                    st.rerun()
-    # Footer
-    st.markdown("---")
-    st.markdown(
-        """
-        <div style="text-align: center; color: #666; padding: 2rem;">
-            🎭 <strong>AI Voice Clone Studio</strong> - Advanced Voice Cloning Technology<br>
-            Transform any voice into any other voice with state-of-the-art AI<br>
-            <small>⚠️ Use responsibly and with consent from voice owners</small>
-        </div>
-        """,
-        unsafe_allow_html=True
     )
 if __name__ == "__main__":
-    main()

+import gradio as gr
 import torch
 import torchaudio
 import numpy as np
+from transformers import AutoModel, AutoTokenizer
 import tempfile
 import os
+def clone_voice(reference_audio, input_text):
+    """Test-mode stub: return (audio, status), echoing reference_audio back; input_text is unused."""
+    try:
+        # Template only: no model is loaded and no speech is synthesized here.
+        # NOTE(review): input_text is accepted but never read by this stub.
+        # To use a real model, replace the commented-out example below:
+        # model = AutoModel.from_pretrained("your-model-name")
+        # Guard: without reference audio there is nothing to return.
+        if reference_audio is None:
+            return None, "Please upload reference audio"
+        # Simple echo for testing (replace with actual voice cloning).
+        # A real implementation would:
+        # 1. Process reference_audio to extract voice features
+        # 2. Generate speech from input_text using those features
+        # 3. Return the generated audio
+        # For now, hand the reference audio back unchanged as the output.
+        return reference_audio, "Voice cloning completed (test mode)"
+    except Exception as e:
+        return None, f"Error: {str(e)}"
+# Build the Gradio UI: inputs in the first column, results in the second column
+with gr.Blocks(title="Voice Cloning") as app:
+    gr.Markdown("# 🎭 AI Voice Cloning")
+    gr.Markdown("Upload reference audio and enter text to clone the voice.")
+    with gr.Row():
+        with gr.Column():
+            reference_audio = gr.Audio(
+                label="Reference Voice (10+ seconds)",
+                type="filepath"
             )
+            input_text = gr.Textbox(
+                label="Text to Convert",
+                placeholder="Enter the text you want to speak in the cloned voice...",
+                lines=3
             )
+            clone_btn = gr.Button("🎤 Clone Voice", variant="primary")
+        with gr.Column():
+            output_audio = gr.Audio(label="Cloned Voice Output")
+            status_text = gr.Textbox(label="Status", interactive=False)
+    # On click, call clone_voice with both inputs; its two return values fill output_audio and status_text
+    clone_btn.click(
+        fn=clone_voice,
+        inputs=[reference_audio, input_text],
+        outputs=[output_audio, status_text]
     )
+# Start the Gradio app only when this file is run as a script
 if __name__ == "__main__":
+    app.launch()