Spaces:

crackuser
/

voiceclone-dev

Running

App Files Files Community

crackuser committed on Sep 10, 2025

Commit

187313d

verified ·

1 Parent(s): 3a79786

Update app.py

Browse files

Files changed (1) hide show

app.py +309 -53

app.py CHANGED Viewed

@@ -4,10 +4,59 @@ import numpy as np
 import soundfile as sf
 import tempfile
 import os
-def voice_clone_demo(reference_audio, input_text):
     """
-    Demo voice cloning function
     """
     try:
         if not reference_audio:
@@ -16,67 +65,274 @@ def voice_clone_demo(reference_audio, input_text):
         if not input_text or not input_text.strip():
             return None, "❌ Please enter text to convert!"
-        # For demo purposes, return the reference audio
-        # In production, this would call actual voice cloning APIs
-        return reference_audio, f"✅ Demo: Would clone '{input_text[:50]}...' using uploaded voice"
     except Exception as e:
-        return None, f"❌ Error: {str(e)}"
-# Create Gradio interface
-with gr.Blocks(
-    title="🎭 Voice Cloning Studio",
-    theme=gr.themes.Soft(primary_hue="blue")
-) as demo:
-    gr.HTML("""
-    <div style="text-align: center; padding: 20px;">
-        <h1 style="color: #2E86AB;">🎭 AI Voice Cloning Studio</h1>
-        <p style="color: #666; font-size: 18px;">Clone any voice with AI technology</p>
-    </div>
-    """)
-    with gr.Row():
-        with gr.Column():
-            gr.HTML("<h3>📤 Upload Reference Voice</h3>")
-            reference_audio = gr.Audio(
-                label="Reference Audio (10+ seconds)",
-                type="filepath"
-            )
-            gr.HTML("<h3>📝 Enter Text</h3>")
-            text_input = gr.Textbox(
-                label="Text to Convert",
-                placeholder="Enter text to speak in the cloned voice...",
-                lines=4
             )
-            clone_button = gr.Button("🎤 Clone Voice", variant="primary")
-        with gr.Column():
-            gr.HTML("<h3>🎵 Output</h3>")
-            audio_output = gr.Audio(label="Cloned Voice")
-            status_output = gr.Textbox(label="Status", interactive=False)
-    # Examples
-    examples = [
-        "Hello, this is a demonstration of voice cloning technology.",
-        "Welcome to the future of AI-powered speech synthesis.",
-        "This voice was generated using advanced machine learning."
-    ]
-    gr.Examples(
-        examples=examples,
-        inputs=text_input
-    )
-    # Event handler
-    clone_button.click(
-        fn=voice_clone_demo,
-        inputs=[reference_audio, text_input],
-        outputs=[audio_output, status_output]
-    )
 if __name__ == "__main__":
-    demo.launch()

 import soundfile as sf
 import tempfile
 import os
+from scipy.io import wavfile
+import librosa
+def extract_audio_features(audio_path):
+    """Load an audio file as a 16 kHz waveform for the cloning pipeline.
+
+    Returns the ``(samples, sample_rate)`` pair produced by ``librosa.load``.
+    Any failure while loading is printed and reported as ``(None, None)``.
+    """
+    try:
+        # Ask librosa for a fixed 16 kHz rate so all clips are comparable
+        waveform, rate = librosa.load(audio_path, sr=16000)
+    except Exception as e:
+        print(f"Error processing audio: {e}")
+        return None, None
+    return waveform, rate
+def voice_clone_with_audio(reference_audio, input_audio, enhance_quality=True):
+    """
+    Voice-to-voice demo: re-voice ``input_audio`` using ``reference_audio``.
+
+    Both arguments are audio file paths. Returns ``(output_path, status)``;
+    on any failure ``output_path`` is None and ``status`` carries the message.
+    """
+    try:
+        # Reject missing uploads before doing any work
+        if not reference_audio:
+            return None, "❌ Please upload reference audio!"
+        if not input_audio:
+            return None, "❌ Please upload input audio to transform!"
+        # Load both recordings; the loader signals failure with None
+        ref_wave, ref_rate = extract_audio_features(reference_audio)
+        if ref_wave is None:
+            return None, "❌ Error processing reference audio!"
+        src_wave, src_rate = extract_audio_features(input_audio)
+        if src_wave is None:
+            return None, "❌ Error processing input audio!"
+        # Demo-only transformation; a real cloning model would be plugged in here
+        result_wave = apply_voice_transformation(
+            reference_audio=ref_wave,
+            input_audio=src_wave,
+            enhance_quality=enhance_quality,
+        )
+        # The output is written at the reference clip's sample rate
+        result_path = save_audio_output(result_wave, ref_rate)
+        seconds = len(src_wave) / src_rate
+        status = f"✅ Voice cloning complete!\n🎵 Transformed {seconds:.1f}s of audio using reference voice"
+        return result_path, status
+    except Exception as e:
+        return None, f"❌ Error in voice cloning: {str(e)}"
+def voice_clone_with_text(reference_audio, input_text, language="en", speed=1.0):
     """
+    Text-to-Voice cloning: Generate speech from text using reference voice
     """
     try:
         if not reference_audio:
         if not input_text or not input_text.strip():
             return None, "❌ Please enter text to convert!"
+        # Process reference audio
+        ref_audio, ref_sr = extract_audio_features(reference_audio)
+        if ref_audio is None:
+            return None, "❌ Error processing reference audio!"
+        # Generate speech from text (demo implementation)
+        generated_audio = text_to_speech_with_voice(
+            text=input_text,
+            reference_voice=ref_audio,
+            language=language,
+            speed=speed
+        )
+        # Save output audio
+        output_path = save_audio_output(generated_audio, ref_sr)
+        return output_path, f"✅ Text-to-speech complete!\n📝 Generated speech for: '{input_text[:100]}{'...' if len(input_text) > 100 else ''}'"
     except Exception as e:
+        return None, f"❌ Error in text-to-speech: {str(e)}"
+def apply_voice_transformation(reference_audio, input_audio, enhance_quality=True):
+    """
+    Blend the input waveform with the reference waveform (demo placeholder).
+
+    This is not real voice cloning: both signals are cut to the shorter
+    length and mixed 70% input / 30% reference, then optionally passed
+    through ``enhance_audio_quality``. When either signal has no samples,
+    ``input_audio`` is returned untouched.
+    In production, this would use models like XTTS, OpenVoice, etc.
+    """
+    input_weight = 0.7  # share taken from the input audio
+    reference_weight = 0.3  # share taken from the reference audio
+    common_length = min(len(reference_audio), len(input_audio))
+    # Nothing to blend when one side is empty
+    if common_length <= 0:
+        return input_audio
+    # Truncate both signals so they can be added sample by sample
+    blended = (
+        input_weight * input_audio[:common_length]
+        + reference_weight * reference_audio[:common_length]
+    )
+    if not enhance_quality:
+        return blended
+    return enhance_audio_quality(blended)
+def text_to_speech_with_voice(text, reference_voice, language="en", speed=1.0):
+    """
+    Generate placeholder "speech" for ``text`` (demo implementation).
+
+    This is not a TTS model: it synthesises a mix of sine tones at 16 kHz
+    whose base pitch depends on the mean absolute level of
+    ``reference_voice`` and whose extra tones depend on the first ten
+    characters of ``text``.
+    In production, this would use TTS models with voice cloning.
+
+    Args:
+        text: Text to "speak"; about 0.1 s of audio per character at speed 1.
+        reference_voice: Reference waveform samples (may be empty).
+        language: Accepted for interface compatibility; unused by the demo.
+        speed: Speech-rate multiplier; larger values give shorter audio.
+
+    Returns:
+        1-D NumPy array of samples with peak magnitude at most 1.0.
+
+    Raises:
+        ValueError: If ``speed`` is not positive.
+    """
+    if speed <= 0:
+        raise ValueError("speed must be positive")
+    sr = 16000
+    # Faster speech must be shorter, so divide (not multiply) by the speed
+    duration = len(text) * 0.1 / speed  # Rough duration estimate
+    samples = int(duration * sr)
+    # Exact sample timestamps at the output rate (spacing of 1 / sr)
+    t = np.arange(samples) / sr
+    # An empty reference would make np.mean return NaN and poison every sample
+    reference_voice = np.asarray(reference_voice)
+    ref_level = float(np.mean(np.abs(reference_voice))) if reference_voice.size else 0.0
+    frequency = 200 + ref_level * 100  # Use ref voice characteristics
+    # Generate simple sine wave pattern (placeholder)
+    synthetic_speech = 0.3 * np.sin(2 * np.pi * frequency * t)
+    # Add some variation based on the leading characters of the text
+    for char in text[:10]:
+        freq_mod = 200 + ord(char) % 100
+        synthetic_speech += 0.1 * np.sin(2 * np.pi * freq_mod * t)
+    # Up to eleven tones are summed (amplitude up to 1.3); rescale so the
+    # samples stay within [-1.0, 1.0]
+    peak = np.max(np.abs(synthetic_speech)) if samples else 0.0
+    if peak > 1.0:
+        synthetic_speech = synthetic_speech / peak
+    return synthetic_speech
+def enhance_audio_quality(audio):
+    """Peak-normalize the audio and leave 20% headroom.
+
+    Scales the signal so its largest absolute sample becomes 0.8. No noise
+    reduction is performed. Empty or completely silent input is returned
+    unscaled: dividing by a zero peak would fill the signal with NaN, and
+    ``np.max`` raises on an empty array.
+    """
+    audio = np.asarray(audio)
+    peak = np.max(np.abs(audio)) if audio.size else 0.0
+    if peak == 0:
+        return audio
+    audio = audio / peak  # Normalize
+    audio = audio * 0.8  # Reduce volume slightly
+    return audio
+def save_audio_output(audio_data, sample_rate):
+    """Write ``audio_data`` to a new temporary ``.wav`` file and return its path.
+
+    The file is created with ``delete=False``, so it stays on disk after
+    this function returns.
+    """
+    # Reserve a unique file name; the handle is closed before writing
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
+        wav_path = handle.name
+    # Hand the samples to soundfile as 32-bit floats
+    samples = np.array(audio_data, dtype=np.float32)
+    sf.write(wav_path, samples, sample_rate)
+    return wav_path
+# Create Gradio interface with tabs
+def create_interface():
+    """Build the Gradio Blocks UI and return it without launching it.
+
+    The layout has a reference-voice upload, a tabbed input area (audio or
+    text), one shared output area, and two collapsed help accordions. The
+    audio button is wired to ``voice_clone_with_audio``; the text button and
+    the text box's submit event are wired to ``voice_clone_with_text``.
+    """
+    with gr.Blocks(
+        title="🎭 Voice Cloning Studio",
+        theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
+    ) as demo:
+        # Header
+        gr.HTML("""
+        <div style="text-align: center; padding: 20px;">
+            <h1 style="color: #2E86AB; margin-bottom: 10px;">🎭 AI Voice Cloning Studio</h1>
+            <p style="color: #666; font-size: 18px;">Clone any voice with AI technology - Support for both Audio and Text input</p>
+        </div>
+        """)
+        # Input row: two equal-scale columns (reference voice, input method)
+        with gr.Row():
+            with gr.Column(scale=1):
+                # Reference Voice Section
+                gr.HTML("<h3 style='color: #2E86AB;'>🎤 Upload Reference Voice</h3>")
+                # type="filepath" means the handlers receive a path, not raw samples
+                reference_audio = gr.Audio(
+                    label="Reference Audio (10+ seconds recommended)",
+                    type="filepath",
+                    sources=["upload", "microphone"]
+                )
+                gr.HTML("<p style='color: #666; font-size: 14px;'>This is the voice you want to clone. Upload clear, high-quality audio.</p>")
+            with gr.Column(scale=1):
+                # Input Method Selection
+                gr.HTML("<h3 style='color: #2E86AB;'>📥 Choose Input Method</h3>")
+                with gr.Tabs():
+                    with gr.TabItem("🎵 Audio Input"):
+                        gr.HTML("<p>Upload audio to transform into the reference voice</p>")
+                        input_audio = gr.Audio(
+                            label="Input Audio to Transform",
+                            type="filepath",
+                            sources=["upload", "microphone"]
+                        )
+                        enhance_audio = gr.Checkbox(
+                            label="🎚️ Enhance Audio Quality",
+                            value=True
+                        )
+                        audio_clone_btn = gr.Button(
+                            "🎤 Clone Voice from Audio",
+                            variant="primary",
+                            size="lg"
+                        )
+                    with gr.TabItem("📝 Text Input"):
+                        gr.HTML("<p>Enter text to speak in the reference voice</p>")
+                        text_input = gr.Textbox(
+                            label="Text to Convert",
+                            placeholder="Enter the text you want to speak in the cloned voice...",
+                            lines=4,
+                            max_lines=6
+                        )
+                        with gr.Row():
+                            # Each choice is a (display label, value) pair; the value is passed to the handler
+                            language_select = gr.Dropdown(
+                                choices=[
+                                    ("🇺🇸 English", "en"),
+                                    ("🇪🇸 Spanish", "es"),
+                                    ("🇫🇷 French", "fr"),
+                                    ("🇩🇪 German", "de"),
+                                    ("🇮🇹 Italian", "it"),
+                                    ("🇧🇷 Portuguese", "pt"),
+                                    ("🇨🇳 Chinese", "zh"),
+                                    ("🇯🇵 Japanese", "ja")
+                                ],
+                                value="en",
+                                label="Language"
+                            )
+                            speed_control = gr.Slider(
+                                minimum=0.5,
+                                maximum=2.0,
+                                step=0.1,
+                                value=1.0,
+                                label="Speech Speed"
+                            )
+                        text_clone_btn = gr.Button(
+                            "📝 Generate Speech from Text",
+                            variant="secondary",
+                            size="lg"
+                        )
+        # Output Section
+        # Both input modes write their result into these same two components
+        with gr.Row():
+            with gr.Column():
+                gr.HTML("<h3 style='color: #2E86AB;'>🎵 Cloned Voice Output</h3>")
+                audio_output = gr.Audio(
+                    label="Generated Audio",
+                    type="filepath"
+                )
+                status_output = gr.Textbox(
+                    label="Status",
+                    lines=3,
+                    interactive=False
+                )
+        # Examples Section
+        with gr.Accordion("💡 Example Texts", open=False):
+            examples = [
+                "Hello, this is a demonstration of AI voice cloning technology.",
+                "Welcome to the future of artificial intelligence and speech synthesis.",
+                "This voice was generated using advanced machine learning models.",
+                "Experience the power of AI-driven voice generation with natural speech patterns."
+            ]
+            # Selecting an example fills the text box only
+            gr.Examples(
+                examples=examples,
+                inputs=text_input,
+                label="Click to try these examples:"
             )
+        # How it works section
+        with gr.Accordion("🔍 How Voice Cloning Works", open=False):
+            gr.Markdown("""
+            ### Voice-to-Voice Cloning Process
+            1. **🎤 Reference Voice**: Upload 10+ seconds of clear speech
+            2. **📥 Input Audio**: Upload audio you want to transform
+            3. **🧠 AI Analysis**: Extract voice characteristics and features
+            4. **🎵 Voice Synthesis**: Apply reference voice to input content
+            ### Text-to-Speech Process
+            1. **🎤 Reference Voice**: Upload voice sample to clone
+            2. **📝 Text Input**: Enter text to convert to speech
+            3. **🗣️ Speech Generation**: Generate speech in the cloned voice
+            4. **🎵 Audio Output**: Download your cloned speech
+            ### Tips for Best Results
+            - **Reference Audio**: Use 10+ seconds of clear, single-speaker audio
+            - **Input Audio**: Ensure good quality with minimal background noise
+            - **Language**: Match reference voice language when possible
+            - **Length**: Shorter inputs (under 30 seconds) work better
+            """)
+        # Event handlers
+        # Input order must match the handler's positional parameters
+        audio_clone_btn.click(
+            fn=voice_clone_with_audio,
+            inputs=[reference_audio, input_audio, enhance_audio],
+            outputs=[audio_output, status_output],
+            show_progress=True
+        )
+        text_clone_btn.click(
+            fn=voice_clone_with_text,
+            inputs=[reference_audio, text_input, language_select, speed_control],
+            outputs=[audio_output, status_output],
+            show_progress=True
+        )
+        # Auto-generate on Enter for text
+        # NOTE(review): text_input is multi-line (lines=4); confirm which key
+        # press Gradio treats as "submit" for such a Textbox
+        text_input.submit(
+            fn=voice_clone_with_text,
+            inputs=[reference_audio, text_input, language_select, speed_control],
+            outputs=[audio_output, status_output],
+            show_progress=True
+        )
+    return demo
+# Launch the app
 if __name__ == "__main__":
+    # Bind to every network interface on port 7860; share is disabled
+    launch_options = {
+        "server_name": "0.0.0.0",
+        "server_port": 7860,
+        "share": False,
+    }
+    demo = create_interface()
+    demo.launch(**launch_options)