# Hugging Face Spaces app — real-time voice cloning demo
# (Space status badge text removed from this copy of the file.)
| import gradio as gr | |
| from encoder import inference as encoder_inference | |
| from synthesizer.inference import Synthesizer | |
| from vocoder import inference as vocoder_inference | |
| import librosa | |
| import soundfile as sf | |
| import numpy as np | |
| import os | |
# ---------------------------------------------------------------------------
# Model loading (runs once at module import / app startup).
# Each model loads independently so one failure does not prevent the others
# from loading; errors are printed so they show up in the Spaces log.
# ---------------------------------------------------------------------------
print("Loading models...")

try:
    encoder_inference.load_model("saved_models/encoder.pt")
    print("✓ Encoder loaded!")
except Exception as e:  # broad by design: startup must not crash the UI
    print(f"Encoder load error: {e}")

# Pre-bind to None so a failed load surfaces later as a clear AttributeError
# in clone_voice() instead of a NameError on an unbound global.
synthesizer = None
try:
    synthesizer = Synthesizer("saved_models/synthesizer.pt")
    print("✓ Synthesizer loaded!")
except Exception as e:  # broad by design: see above
    print(f"Synthesizer load error: {e}")

try:
    vocoder_inference.load_model("saved_models/vocoder.pt")
    print("✓ Vocoder loaded!")
except Exception as e:  # broad by design: see above
    print(f"Vocoder load error: {e}")

print("Ready for voice cloning!")
def clone_voice(voice_sample, text):
    """Clone a speaker's voice and synthesize *text* in that voice.

    Args:
        voice_sample: Either a ``(sample_rate, np.ndarray)`` tuple as produced
            by the Gradio ``Audio(type="numpy")`` component, or a filepath
            string to an audio file.
        text: The sentence to synthesize.

    Returns:
        ``((sample_rate, int16 ndarray), status_message)`` on success, or
        ``(None, error_message)`` on failure.
    """
    try:
        # --- input validation ------------------------------------------
        if voice_sample is None:
            return None, "❌ Error: No voice sample provided"
        if not text or len(text.strip()) == 0:
            return None, "❌ Error: No text provided"
        print(f"Processing: text='{text}', voice_sample={voice_sample}")

        # --- load reference audio --------------------------------------
        if isinstance(voice_sample, tuple):
            sr, audio_data = voice_sample
            # Gradio may deliver stereo as (n_samples, n_channels); the
            # encoder pipeline expects mono, so downmix by averaging.
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)
            if np.issubdtype(audio_data.dtype, np.integer):
                # Integer PCM (typically int16) -> float32 in [-1, 1].
                wav = audio_data.astype(np.float32) / 32768.0
            else:
                # Float input is assumed to already be normalized.
                wav = audio_data.astype(np.float32)
        else:
            wav, sr = librosa.load(voice_sample, sr=16000)
        print(f"Audio loaded: sr={sr}, shape={wav.shape}")

        # The encoder expects 16 kHz input; resample microphone audio
        # (often 44.1/48 kHz) down to it.
        if sr != 16000:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)

        # Trim silence / normalize as the encoder requires.
        wav = encoder_inference.preprocess_wav(wav)
        print(f"Preprocessed audio: {wav.shape}")

        # Fixed-size speaker embedding that conditions the synthesizer.
        embed = encoder_inference.embed_utterance(wav)
        print(f"Speaker embedding: {embed.shape}")

        # Text + embedding -> mel spectrogram (batch of one).
        mels = synthesizer.synthesize_spectrograms([text], [embed])
        print(f"Mel-spectrogram: {mels[0].shape}")

        # Mel spectrogram -> waveform.
        wav_generated = vocoder_inference.vocoder(mels[0])
        print(f"Generated audio: {wav_generated.shape}")

        # Clip before the int16 cast: peaks at or above 1.0 would otherwise
        # wrap around (1.0 * 32768 overflows int16).
        pcm = (np.clip(wav_generated, -1.0, 1.0) * 32767).astype(np.int16)
        # NOTE(review): 22050 Hz is assumed to be the vocoder output rate —
        # confirm against the synthesizer/vocoder configuration.
        return (22050, pcm), "✅ Success! Your voice has been cloned!"
    except Exception as e:
        # Boundary handler: report the failure in the UI instead of crashing.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"
# ---------------------------------------------------------------------------
# Gradio interface. Component creation order inside the context managers
# determines on-screen layout, so statements here must not be reordered.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Voice Cloning - Real-Time Test", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 Real-Time Voice Cloning")
    gr.Markdown("**Record your voice, enter text, and hear it synthesized in your voice!**")
    gr.Markdown("---")
    with gr.Row():
        # Left column: inputs (voice sample + text to synthesize).
        with gr.Column():
            gr.Markdown("### 📝 Step 1: Record Your Voice")
            gr.Markdown("Record 5-10 seconds of clear audio in **Hindi or Kannada**")
            # type="numpy" delivers a (sample_rate, ndarray) tuple to
            # clone_voice(); a file upload arrives as a filepath instead.
            voice_input = gr.Audio(
                label="🎙️ Voice Sample (Microphone or Upload)",
                type="numpy",
                sources=["microphone", "upload"]
            )
            gr.Markdown("### ✍️ Step 2: Enter Text")
            text_input = gr.Textbox(
                label="📄 Text to Synthesize (Hindi or Kannada)",
                placeholder="नमस्ते, यह एक परीक्षण है",
                lines=3
            )
        # Right column: outputs (generated audio + status message).
        with gr.Column():
            gr.Markdown("### 🔊 Step 3: Generated Speech")
            audio_output = gr.Audio(label="🎧 Cloned Voice Output", type="numpy")
            status_output = gr.Textbox(label="📊 Status", interactive=False, lines=2)
    clone_button = gr.Button("🎯 Clone Voice & Generate Speech", variant="primary", size="lg")
    # Wire the button to the inference function: (audio, text) in,
    # (audio, status string) out.
    clone_button.click(
        clone_voice,
        inputs=[voice_input, text_input],
        outputs=[audio_output, status_output]
    )
    # Static help text rendered below the controls.
    gr.Markdown("""
    ---
    ### 📋 Instructions:
    1. **Record your voice** using the microphone (5-10 seconds) OR upload a WAV/OGG file
       - Speak clearly in Hindi or Kannada
       - Avoid background noise
    2. **Enter text** you want to generate in your voice (same language as recording)
    3. **Click "Clone Voice & Generate Speech"**
    4. **Wait** (10-30 seconds on CPU) and hear the result!
    ### 💡 Tips for Best Results:
    - **Clear voice samples** = better results
    - **10+ seconds** = better voice cloning accuracy
    - **Same language** as input voice works best
    - **Patience** - CPU processing takes time (GPU would be 2-3x faster)
    - **Quality audio** - minimize background noise
    ### ⚠️ Limitations:
    - CPU processing is slower (~10-30 seconds per request)
    - Long texts (500+ characters) may timeout
    - Best results with 10+ second voice samples
    """)
if __name__ == "__main__":
    # Bind on all interfaces so the app is reachable from inside a container.
    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)