"""Gradio demo: record a voice sample, enter text, and synthesize it in that voice.

The encoder, synthesizer and vocoder checkpoints are loaded once at import
time; the web UI is only launched when the file is run as a script.
"""

import os
import traceback

import gradio as gr
import librosa
import numpy as np
import soundfile as sf

from encoder import inference as encoder_inference
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder_inference

# Sample rate the reference audio is resampled to before it reaches the encoder.
ENCODER_SAMPLE_RATE = 16000
# Sample rate reported to Gradio for the generated audio.
# NOTE(review): kept at the original hard-coded value; confirm it matches the
# rate the synthesizer/vocoder checkpoints were trained at, otherwise playback
# will be pitched/sped up or down.
OUTPUT_SAMPLE_RATE = 22050

INSTRUCTIONS_MD = """
---
### 📋 Instructions:
1. **Record your voice** using the microphone (5-10 seconds) OR upload a WAV/OGG file
   - Speak clearly in Hindi or Kannada
   - Avoid background noise
2. **Enter text** you want to generate in your voice (same language as recording)
3. **Click "Clone Voice & Generate Speech"**
4. **Wait** (10-30 seconds on CPU) and hear the result!

### 💡 Tips for Best Results:
- **Clear voice samples** = better results
- **10+ seconds** = better voice cloning accuracy
- **Same language** as input voice works best
- **Patience** - CPU processing takes time (GPU would be 2-3x faster)
- **Quality audio** - minimize background noise

### ⚠️ Limitations:
- CPU processing is slower (~10-30 seconds per request)
- Long texts (500+ characters) may timeout
- Best results with 10+ second voice samples
"""

# Load models at startup. Failures are reported but do not abort the import,
# so the UI can still start and show a clear error per request.
synthesizer = None  # stays None if the checkpoint fails to load
_load_failures = []

print("Loading models...")
try:
    encoder_inference.load_model("saved_models/encoder.pt")
    print("✓ Encoder loaded!")
except Exception as e:
    print(f"Encoder load error: {e}")
    _load_failures.append("encoder")

try:
    synthesizer = Synthesizer("saved_models/synthesizer.pt")
    print("✓ Synthesizer loaded!")
except Exception as e:
    print(f"Synthesizer load error: {e}")
    _load_failures.append("synthesizer")

try:
    vocoder_inference.load_model("saved_models/vocoder.pt")
    print("✓ Vocoder loaded!")
except Exception as e:
    print(f"Vocoder load error: {e}")
    _load_failures.append("vocoder")

if _load_failures:
    print(f"⚠ Models failed to load: {', '.join(_load_failures)}")
else:
    print("Ready for voice cloning!")


def _to_mono_float32(audio_data):
    """Return *audio_data* as a 1-D float32 array scaled to roughly [-1, 1]."""
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.integer):
        # Scale by the integer type's full range (32768 for int16, matching
        # the original behaviour) instead of assuming 16-bit input.
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        # Float input is taken as already normalised.
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Multi-channel audio arrives as (samples, channels); down-mix to mono.
        samples = samples.mean(axis=1)
    return samples


def _float_to_int16(wav):
    """Convert a float waveform to int16 PCM, clipping instead of wrapping."""
    scaled = np.asarray(wav, dtype=np.float32) * 32768
    # Without the clip, samples >= 1.0 overflow int16 and wrap to large
    # negative values, which is audible as clicks.
    return np.clip(scaled, -32768, 32767).astype(np.int16)


def clone_voice(voice_sample, text):
    """Clone the voice in *voice_sample* and speak *text* with it.

    Args:
        voice_sample: Either a ``(sample_rate, samples)`` tuple as produced by
            a Gradio ``Audio`` component with ``type="numpy"``, or a path to
            an audio file readable by ``librosa.load``.
        text: The text to synthesize.

    Returns:
        A ``(audio, status)`` pair. ``audio`` is ``(sample_rate, int16 array)``
        on success and ``None`` on failure; ``status`` is a user-facing
        message. Exceptions are caught and reported through ``status``.
    """
    try:
        if voice_sample is None:
            return None, "❌ Error: No voice sample provided"

        if not text or not text.strip():
            return None, "❌ Error: No text provided"

        if synthesizer is None:
            return None, "❌ Error: Synthesizer model is not loaded"

        print(f"Processing: text='{text}', voice_sample={voice_sample}")

        # Extract audio data and sample rate
        if isinstance(voice_sample, tuple):
            sr, audio_data = voice_sample
            wav = _to_mono_float32(audio_data)
        else:
            wav, sr = librosa.load(voice_sample, sr=ENCODER_SAMPLE_RATE)

        print(f"Audio loaded: sr={sr}, shape={wav.shape}")

        if wav.size == 0:
            return None, "❌ Error: Voice sample is empty"

        # Resample if needed
        if sr != ENCODER_SAMPLE_RATE:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=ENCODER_SAMPLE_RATE)

        # Preprocess audio
        wav = encoder_inference.preprocess_wav(wav)
        print(f"Preprocessed audio: {wav.shape}")

        # Generate speaker embedding
        embed = encoder_inference.embed_utterance(wav)
        print(f"Speaker embedding: {embed.shape}")

        # Synthesize
        mels = synthesizer.synthesize_spectrograms([text], [embed])
        print(f"Mel-spectrogram: {mels[0].shape}")

        # Vocode to audio
        # NOTE(review): confirm `vocoder` is the right entry point of the
        # project's vocoder.inference module — not verifiable from this file.
        wav_generated = vocoder_inference.vocoder(mels[0])
        print(f"Generated audio: {wav_generated.shape}")

        return (
            (OUTPUT_SAMPLE_RATE, _float_to_int16(wav_generated)),
            "✅ Success! Your voice has been cloned!",
        )

    except Exception as e:
        # Top-level UI boundary: log the traceback and surface the message
        # to the user rather than crashing the request.
        print(f"Error: {e}")
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Voice Cloning - Real-Time Test", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 Real-Time Voice Cloning")
    gr.Markdown("**Record your voice, enter text, and hear it synthesized in your voice!**")
    gr.Markdown("---")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 📝 Step 1: Record Your Voice")
            gr.Markdown("Record 5-10 seconds of clear audio in **Hindi or Kannada**")
            voice_input = gr.Audio(
                label="🎙️ Voice Sample (Microphone or Upload)",
                type="numpy",
                sources=["microphone", "upload"],
            )

            gr.Markdown("### ✍️ Step 2: Enter Text")
            text_input = gr.Textbox(
                label="📄 Text to Synthesize (Hindi or Kannada)",
                placeholder="नमस्ते, यह एक परीक्षण है",
                lines=3,
            )

        with gr.Column():
            gr.Markdown("### 🔊 Step 3: Generated Speech")
            audio_output = gr.Audio(label="🎧 Cloned Voice Output", type="numpy")
            status_output = gr.Textbox(label="📊 Status", interactive=False, lines=2)

    clone_button = gr.Button("🎯 Clone Voice & Generate Speech", variant="primary", size="lg")
    clone_button.click(
        clone_voice,
        inputs=[voice_input, text_input],
        outputs=[audio_output, status_output],
    )

    gr.Markdown(INSTRUCTIONS_MD)


if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)