Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from encoder import inference as encoder_inference | |
| from synthesizer.inference import Synthesizer | |
| import librosa | |
| import soundfile as sf | |
| import numpy as np | |
| import os | |
| import torch | |
# ---------------------------------------------------------------------------
# Model startup: load the vocoder, speaker encoder and synthesizer once at
# import time so every request reuses the same instances.
# ---------------------------------------------------------------------------

# Try to load the HiFi-GAN neural vocoder; Griffin-Lim is used as a fallback
# later if speechbrain is missing or the pretrained download fails.
vocoder = None
try:
    from speechbrain.inference.vocoders import HIFIGAN
    vocoder = HIFIGAN.from_hparams(
        source="speechbrain/tts-hifigan-ljspeech",
        savedir="pretrained_models/hifigan",
        run_opts={"device": "cpu"},
    )
    print("✓ HiFi-GAN vocoder loaded!")
except Exception as e:
    print(f"HiFi-GAN load error: {e}, will use Griffin-Lim fallback")
    vocoder = None

# Load models at startup
print("Loading models...")
encoder_path = "saved_models/encoder.pt"
synthesizer_path = "saved_models/synthesizer.pt"

try:
    encoder_inference.load_model(encoder_path)
    print("✓ Encoder loaded!")
except Exception as e:
    print(f"Encoder load error: {e}")

# Pre-bind `synthesizer` so a failed load leaves a well-defined None instead
# of a NameError on first use inside clone_voice() (the original only bound
# the name inside the try block).
synthesizer = None
try:
    synthesizer = Synthesizer(synthesizer_path)
    print("✓ Synthesizer loaded!")
except Exception as e:
    print(f"Synthesizer load error: {e}")

print("Ready for voice cloning!")
| def clone_voice(voice_sample, text): | |
| """Clone voice and generate speech""" | |
| try: | |
| if voice_sample is None: | |
| return None, "❌ Error: No voice sample provided" | |
| if not text or len(text.strip()) == 0: | |
| return None, "❌ Error: No text provided" | |
| print(f"Processing: text='{text}'") | |
| # Extract audio data and sample rate | |
| if isinstance(voice_sample, tuple): | |
| sr, audio_data = voice_sample | |
| wav = audio_data.astype(np.float32) / 32768.0 | |
| else: | |
| wav, sr = librosa.load(voice_sample, sr=None) | |
| print(f"Audio loaded: sr={sr}, shape={wav.shape}, duration={len(wav)/sr:.2f}s") | |
| if len(wav) < sr: | |
| return None, f"❌ Error: Audio too short ({len(wav)/sr:.2f}s). Please record at least 2 seconds." | |
| # Resample if needed | |
| if sr != 16000 and len(wav) > 100: | |
| wav = librosa.resample(wav, orig_sr=sr, target_sr=16000) | |
| elif sr != 16000: | |
| return None, f"❌ Error: Audio file corrupted or invalid (only {len(wav)} samples)" | |
| # Preprocess audio | |
| wav = encoder_inference.preprocess_wav(wav) | |
| print(f"Preprocessed audio: {wav.shape}") | |
| # Generate speaker embedding | |
| embed = encoder_inference.embed_utterance(wav) | |
| print(f"Speaker embedding: {embed.shape}") | |
| # Synthesize | |
| mels = synthesizer.synthesize_spectrograms([text], [embed]) | |
| print(f"Mel-spectrogram: {mels[0].shape}") | |
| # Vocode to audio | |
| if vocoder is not None: | |
| try: | |
| # Use HiFi-GAN | |
| mel_spec_tensor = torch.from_numpy(mels[0]).unsqueeze(0).float() | |
| with torch.no_grad(): | |
| wav_generated = vocoder.decode_batch(mel_spec_tensor) | |
| wav_generated = wav_generated.squeeze().cpu().numpy() | |
| print(f"Generated audio with HiFi-GAN: {wav_generated.shape}") | |
| except Exception as e: | |
| print(f"HiFi-GAN failed: {e}, using Griffin-Lim fallback") | |
| wav_generated = librosa.feature.inverse.mel_to_audio(mels[0], sr=22050, n_iter=32) | |
| else: | |
| # Use Griffin-Lim as fallback | |
| print("Using Griffin-Lim vocoder (fallback)") | |
| wav_generated = librosa.feature.inverse.mel_to_audio(mels[0], sr=22050, n_iter=32) | |
| # Normalize audio | |
| if np.max(np.abs(wav_generated)) > 0: | |
| wav_generated = wav_generated / np.max(np.abs(wav_generated)) * 0.95 | |
| print(f"Generated audio: {wav_generated.shape}, range: {np.min(wav_generated):.4f} to {np.max(np.abs(wav_generated)):.4f}") | |
| return (22050, (wav_generated * 32768).astype(np.int16)), "✅ Success! Your voice has been cloned!" | |
| except Exception as e: | |
| print(f"Error: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| return None, f"❌ Error: {str(e)}" | |
# ---------------------------------------------------------------------------
# Gradio UI: the left column collects the reference recording and target
# text, the right column shows the synthesized audio and a status line.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Voice Cloning - Real-Time Test", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 Real-Time Voice Cloning")
    gr.Markdown("**Record your voice, enter text, and hear it synthesized in your voice!**")
    gr.Markdown("---")
    with gr.Row():
        with gr.Column():
            # Input side: microphone/upload widget plus the text to speak.
            gr.Markdown("### 📝 Step 1: Record Your Voice")
            gr.Markdown("Record 5-10 seconds of clear audio in **Hindi or Kannada**")
            voice_input = gr.Audio(
                label="🎙️ Voice Sample (Microphone or Upload)",
                type="numpy",  # delivers (sample_rate, np.ndarray) to clone_voice
                sources=["microphone", "upload"]
            )
            gr.Markdown("### ✍️ Step 2: Enter Text")
            text_input = gr.Textbox(
                label="📄 Text to Synthesize (Hindi or Kannada)",
                placeholder="नमस्ते, यह एक परीक्षण है",
                lines=3
            )
        with gr.Column():
            # Output side: generated audio plus a human-readable status.
            gr.Markdown("### 🔊 Step 3: Generated Speech")
            audio_output = gr.Audio(label="🎧 Cloned Voice Output", type="numpy")
            status_output = gr.Textbox(label="📊 Status", interactive=False, lines=2)
    clone_button = gr.Button("🎯 Clone Voice & Generate Speech", variant="primary", size="lg")
    # Wire the button to the synthesis function; outputs map 1:1 to the
    # (audio, status) tuple clone_voice returns.
    clone_button.click(
        clone_voice,
        inputs=[voice_input, text_input],
        outputs=[audio_output, status_output]
    )
    gr.Markdown("""
---
### 📋 Instructions:
1. **Record your voice** using the microphone (5-10 seconds) OR upload a WAV/OGG file
   - Speak clearly in Hindi or Kannada
   - Avoid background noise
2. **Enter text** you want to generate in your voice (same language as recording)
3. **Click "Clone Voice & Generate Speech"**
4. **Wait** (10-30 seconds on CPU) and hear the result!
### 💡 Tips for Best Results:
- **Clear voice samples** = better results
- **10+ seconds** = better voice cloning accuracy
- **Same language** as input voice works best
- **Patience** - CPU processing takes time (GPU would be 2-3x faster)
- **Quality audio** - minimize background noise
### ⚠️ Limitations:
- CPU processing is slower (~10-30 seconds per request)
- Long texts (500+ characters) may timeout
- Best results with 10+ second voice samples
""")
# Script entry point: serve the demo on every interface at port 7860
# without creating a public Gradio share link.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)