# NOTE: removed HuggingFace Spaces file-viewer residue (git blame hashes and
# line-number gutter) that was accidentally captured with the source.
import gradio as gr
from encoder import inference as encoder_inference
from synthesizer.inference import Synthesizer
import librosa
import soundfile as sf
import numpy as np
import os
import torch
# --- Vocoder setup: prefer HiFi-GAN, fall back to Griffin-Lim at synthesis time ---
# `vocoder` stays None on any failure; clone_voice() checks for None and uses
# librosa's Griffin-Lim inversion instead.
vocoder = None
try:
    from speechbrain.inference.vocoders import HIFIGAN
    vocoder = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="pretrained_models/hifigan", run_opts={"device":"cpu"})
    print("✓ HiFi-GAN vocoder loaded!")
except Exception as e:
    # Broad catch is deliberate: the app must still start without HiFi-GAN.
    print(f"HiFi-GAN load error: {e}, will use Griffin-Lim fallback")
    vocoder = None
# --- Load speaker encoder and synthesizer at startup ---
print("Loading models...")
encoder_path = "saved_models/encoder.pt"
synthesizer_path = "saved_models/synthesizer.pt"
# Pre-bind so a failed load below leaves `synthesizer` as None rather than
# unbound — otherwise clone_voice() would raise a confusing NameError.
synthesizer = None
try:
    encoder_inference.load_model(encoder_path)
    print("✓ Encoder loaded!")
except Exception as e:
    print(f"Encoder load error: {e}")
try:
    synthesizer = Synthesizer(synthesizer_path)
    print("✓ Synthesizer loaded!")
except Exception as e:
    print(f"Synthesizer load error: {e}")
print("Ready for voice cloning!")
def clone_voice(voice_sample, text):
    """Clone the speaker's voice from a sample and synthesize `text` with it.

    Parameters
    ----------
    voice_sample : tuple[int, np.ndarray] | str | None
        Either a Gradio numpy audio tuple ``(sample_rate, samples)`` or a
        filepath to an audio file. ``None`` means nothing was recorded.
    text : str
        The text to synthesize in the cloned voice.

    Returns
    -------
    tuple
        ``((22050, int16_audio), status_message)`` on success, or
        ``(None, error_message)`` on any failure. Never raises: all
        exceptions are caught and reported in the status string.
    """
    try:
        # --- Input validation (early returns, no model work yet) ---
        if voice_sample is None:
            return None, "❌ Error: No voice sample provided"
        if not text or len(text.strip()) == 0:
            return None, "❌ Error: No text provided"
        print(f"Processing: text='{text}'")
        # --- Extract audio data and sample rate ---
        if isinstance(voice_sample, tuple):
            sr, audio_data = voice_sample
            if np.issubdtype(audio_data.dtype, np.integer):
                # Integer PCM (Gradio default is int16): scale to [-1, 1).
                # iinfo(...).max + 1 gives 32768.0 for int16, matching the
                # original hard-coded divisor while also handling int32.
                scale = float(np.iinfo(audio_data.dtype).max) + 1.0
                wav = audio_data.astype(np.float32) / scale
            else:
                # Already floating point — do not rescale.
                wav = audio_data.astype(np.float32)
            if wav.ndim > 1:
                # Downmix stereo/multi-channel to mono; downstream
                # processing expects a 1-D waveform.
                wav = wav.mean(axis=1)
        else:
            # Filepath input: keep native sample rate; resampled below.
            wav, sr = librosa.load(voice_sample, sr=None)
        print(f"Audio loaded: sr={sr}, shape={wav.shape}, duration={len(wav)/sr:.2f}s")
        # Reject clips shorter than one second (too little speaker signal).
        if len(wav) < sr:
            return None, f"❌ Error: Audio too short ({len(wav)/sr:.2f}s). Please record at least 1 second."
        # --- Resample to the encoder's expected 16 kHz if needed ---
        if sr != 16000 and len(wav) > 100:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
        elif sr != 16000:
            return None, f"❌ Error: Audio file corrupted or invalid (only {len(wav)} samples)"
        # --- Preprocess audio (trimming/normalization per encoder rules) ---
        wav = encoder_inference.preprocess_wav(wav)
        print(f"Preprocessed audio: {wav.shape}")
        # --- Speaker embedding from the reference audio ---
        embed = encoder_inference.embed_utterance(wav)
        print(f"Speaker embedding: {embed.shape}")
        # --- Text + embedding -> mel spectrogram ---
        mels = synthesizer.synthesize_spectrograms([text], [embed])
        print(f"Mel-spectrogram: {mels[0].shape}")
        # --- Mel -> waveform ---
        if vocoder is not None:
            try:
                # HiFi-GAN expects a batched float tensor: (1, n_mels, frames).
                mel_spec_tensor = torch.from_numpy(mels[0]).unsqueeze(0).float()
                with torch.no_grad():
                    wav_generated = vocoder.decode_batch(mel_spec_tensor)
                wav_generated = wav_generated.squeeze().cpu().numpy()
                print(f"Generated audio with HiFi-GAN: {wav_generated.shape}")
            except Exception as e:
                # Degrade gracefully instead of failing the request.
                print(f"HiFi-GAN failed: {e}, using Griffin-Lim fallback")
                wav_generated = librosa.feature.inverse.mel_to_audio(mels[0], sr=22050, n_iter=32)
        else:
            # Use Griffin-Lim as fallback
            print("Using Griffin-Lim vocoder (fallback)")
            wav_generated = librosa.feature.inverse.mel_to_audio(mels[0], sr=22050, n_iter=32)
        # --- Peak-normalize to 95% full scale (guard against silent output) ---
        if np.max(np.abs(wav_generated)) > 0:
            wav_generated = wav_generated / np.max(np.abs(wav_generated)) * 0.95
        print(f"Generated audio: {wav_generated.shape}, range: {np.min(wav_generated):.4f} to {np.max(np.abs(wav_generated)):.4f}")
        # Gradio numpy audio output: (sample_rate, int16 samples).
        return (22050, (wav_generated * 32768).astype(np.int16)), "✅ Success! Your voice has been cloned!"
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"
# --- Gradio interface: two-column layout (input left, output right) ---
with gr.Blocks(title="Voice Cloning - Real-Time Test", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 Real-Time Voice Cloning")
    gr.Markdown("**Record your voice, enter text, and hear it synthesized in your voice!**")
    gr.Markdown("---")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 📝 Step 1: Record Your Voice")
            gr.Markdown("Record 5-10 seconds of clear audio in **Hindi or Kannada**")
            # type="numpy" delivers (sample_rate, ndarray) to clone_voice.
            voice_input = gr.Audio(
                label="🎙️ Voice Sample (Microphone or Upload)",
                type="numpy",
                sources=["microphone", "upload"]
            )
            gr.Markdown("### ✍️ Step 2: Enter Text")
            text_input = gr.Textbox(
                label="📄 Text to Synthesize (Hindi or Kannada)",
                placeholder="नमस्ते, यह एक परीक्षण है",
                lines=3
            )
        with gr.Column():
            gr.Markdown("### 🔊 Step 3: Generated Speech")
            audio_output = gr.Audio(label="🎧 Cloned Voice Output", type="numpy")
            status_output = gr.Textbox(label="📊 Status", interactive=False, lines=2)
    clone_button = gr.Button("🎯 Clone Voice & Generate Speech", variant="primary", size="lg")
    # Wire the button to clone_voice: (audio, text) in, (audio, status) out.
    clone_button.click(
        clone_voice,
        inputs=[voice_input, text_input],
        outputs=[audio_output, status_output]
    )
    gr.Markdown("""
    ---
    ### 📋 Instructions:
    1. **Record your voice** using the microphone (5-10 seconds) OR upload a WAV/OGG file
    - Speak clearly in Hindi or Kannada
    - Avoid background noise
    2. **Enter text** you want to generate in your voice (same language as recording)
    3. **Click "Clone Voice & Generate Speech"**
    4. **Wait** (10-30 seconds on CPU) and hear the result!
    ### 💡 Tips for Best Results:
    - **Clear voice samples** = better results
    - **10+ seconds** = better voice cloning accuracy
    - **Same language** as input voice works best
    - **Patience** - CPU processing takes time (GPU would be 2-3x faster)
    - **Quality audio** - minimize background noise
    ### ⚠️ Limitations:
    - CPU processing is slower (~10-30 seconds per request)
    - Long texts (500+ characters) may timeout
    - Best results with 10+ second voice samples
    """)
# Bind to all interfaces on port 7860 (the HF Spaces convention).
if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)