import gradio as gr
from encoder import inference as encoder_inference
from synthesizer.inference import Synthesizer
import librosa
import soundfile as sf
import numpy as np
import os
import torch

# Try to load HiFi-GAN vocoder
vocoder = None
try:
    from speechbrain.inference.vocoders import HIFIGAN
    vocoder = HIFIGAN.from_hparams(
        source="speechbrain/tts-hifigan-ljspeech",
        savedir="pretrained_models/hifigan",
        run_opts={"device": "cpu"},
    )
    print("✓ HiFi-GAN vocoder loaded!")
except Exception as e:
    print(f"HiFi-GAN load error: {e}, will use Griffin-Lim fallback")
    vocoder = None

# Load models at startup
print("Loading models...")
encoder_path = "saved_models/encoder.pt"
synthesizer_path = "saved_models/synthesizer.pt"

try:
    encoder_inference.load_model(encoder_path)
    print("✓ Encoder loaded!")
except Exception as e:
    print(f"Encoder load error: {e}")

try:
    synthesizer = Synthesizer(synthesizer_path)
    print("✓ Synthesizer loaded!")
except Exception as e:
    print(f"Synthesizer load error: {e}")

print("Ready for voice cloning!")


def clone_voice(voice_sample, text):
    """Clone a voice sample and synthesize the given text with it."""
    try:
        if voice_sample is None:
            return None, "❌ Error: No voice sample provided"
        if not text or len(text.strip()) == 0:
            return None, "❌ Error: No text provided"

        print(f"Processing: text='{text}'")

        # Extract audio data and sample rate
        if isinstance(voice_sample, tuple):
            sr, audio_data = voice_sample
            # Gradio microphone input is int16 PCM; scale to [-1, 1].
            # Guard against float input, which is already normalized.
            if np.issubdtype(audio_data.dtype, np.integer):
                wav = audio_data.astype(np.float32) / 32768.0
            else:
                wav = audio_data.astype(np.float32)
        else:
            wav, sr = librosa.load(voice_sample, sr=None)

        # Mix stereo recordings down to mono
        if wav.ndim > 1:
            wav = wav.mean(axis=1)

        print(f"Audio loaded: sr={sr}, shape={wav.shape}, duration={len(wav)/sr:.2f}s")

        if len(wav) < 2 * sr:
            return None, f"❌ Error: Audio too short ({len(wav)/sr:.2f}s). Please record at least 2 seconds."

        # Resample to the encoder's expected 16 kHz if needed
        if sr != 16000 and len(wav) > 100:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
        elif sr != 16000:
            return None, f"❌ Error: Audio file corrupted or invalid (only {len(wav)} samples)"

        # Preprocess audio
        wav = encoder_inference.preprocess_wav(wav)
        print(f"Preprocessed audio: {wav.shape}")

        # Generate speaker embedding
        embed = encoder_inference.embed_utterance(wav)
        print(f"Speaker embedding: {embed.shape}")

        # Synthesize mel-spectrogram conditioned on the speaker embedding
        mels = synthesizer.synthesize_spectrograms([text], [embed])
        print(f"Mel-spectrogram: {mels[0].shape}")

        # Vocode to audio
        if vocoder is not None:
            try:
                # Use HiFi-GAN
                mel_spec_tensor = torch.from_numpy(mels[0]).unsqueeze(0).float()
                with torch.no_grad():
                    wav_generated = vocoder.decode_batch(mel_spec_tensor)
                wav_generated = wav_generated.squeeze().cpu().numpy()
                print(f"Generated audio with HiFi-GAN: {wav_generated.shape}")
            except Exception as e:
                print(f"HiFi-GAN failed: {e}, using Griffin-Lim fallback")
                wav_generated = librosa.feature.inverse.mel_to_audio(mels[0], sr=22050, n_iter=32)
        else:
            # Use Griffin-Lim as fallback
            print("Using Griffin-Lim vocoder (fallback)")
            wav_generated = librosa.feature.inverse.mel_to_audio(mels[0], sr=22050, n_iter=32)

        # Normalize audio to just below full scale
        if np.max(np.abs(wav_generated)) > 0:
            wav_generated = wav_generated / np.max(np.abs(wav_generated)) * 0.95

        print(f"Generated audio: {wav_generated.shape}, range: {np.min(wav_generated):.4f} to {np.max(np.abs(wav_generated)):.4f}")
        # Scale by 32767 (not 32768) to avoid int16 overflow at full scale
        return (22050, (wav_generated * 32767).astype(np.int16)), "✅ Success! Your voice has been cloned!"

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"
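
# Optional local smoke test before bringing up the UI. A minimal sketch: the
# sample path below is a hypothetical placeholder, and the call is skipped
# when no such file exists, so the script still starts cleanly without it.
if os.path.exists("samples/reference.wav"):
    _result, _status = clone_voice("samples/reference.wav", "नमस्ते, यह एक परीक्षण है")
    print(f"Smoke test: {_status}")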
# Create Gradio interface
with gr.Blocks(title="Voice Cloning - Real-Time Test", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 Real-Time Voice Cloning")
    gr.Markdown("**Record your voice, enter text, and hear it synthesized in your voice!**")
    gr.Markdown("---")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 📝 Step 1: Record Your Voice")
            gr.Markdown("Record 5-10 seconds of clear audio in **Hindi or Kannada**")
            voice_input = gr.Audio(
                label="🎙️ Voice Sample (Microphone or Upload)",
                type="numpy",
                sources=["microphone", "upload"],
            )

            gr.Markdown("### ✍️ Step 2: Enter Text")
            text_input = gr.Textbox(
                label="📄 Text to Synthesize (Hindi or Kannada)",
                placeholder="नमस्ते, यह एक परीक्षण है",
                lines=3,
            )

        with gr.Column():
            gr.Markdown("### 🔊 Step 3: Generated Speech")
            audio_output = gr.Audio(label="🎧 Cloned Voice Output", type="numpy")
            status_output = gr.Textbox(label="📊 Status", interactive=False, lines=2)

    clone_button = gr.Button("🎯 Clone Voice & Generate Speech", variant="primary", size="lg")
    clone_button.click(
        clone_voice,
        inputs=[voice_input, text_input],
        outputs=[audio_output, status_output],
    )

    gr.Markdown("""
    ---
    ### 📋 Instructions:
    1. **Record your voice** using the microphone (5-10 seconds) OR upload a WAV/OGG file
       - Speak clearly in Hindi or Kannada
       - Avoid background noise
    2. **Enter text** you want to generate in your voice (same language as recording)
    3. **Click "Clone Voice & Generate Speech"**
    4. **Wait** (10-30 seconds on CPU) and hear the result!

    ### 💡 Tips for Best Results:
    - **Clear voice samples** = better results
    - **10+ seconds** = better voice cloning accuracy
    - **Same language** as input voice works best
    - **Patience** - CPU processing takes time (GPU would be 2-3x faster)
    - **Quality audio** - minimize background noise

    ### ⚠️ Limitations:
    - CPU processing is slower (~10-30 seconds per request)
    - Long texts (500+ characters) may time out
    - Best results with 10+ second voice samples
    """)

if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)
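
# Note: demo.launch() blocks until the server stops. If overlapping requests
# are expected, Gradio's built-in request queue can serialize them; a sketch
# with default queue settings (adjust as needed):
#
#     demo.queue().launch(share=False, server_name="0.0.0.0", server_port=7860)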