Spaces:
Sleeping
Sleeping
File size: 5,053 Bytes
f42952b 784779a f42952b 784779a f42952b 784779a 2d146cc 784779a f42952b 784779a f42952b 2d146cc f42952b 2d146cc 784779a f42952b 784779a f42952b 784779a f42952b 784779a f42952b 784779a f42952b 784779a f42952b 784779a f42952b 784779a f42952b 784779a 2d146cc 784779a f42952b 2d146cc 784779a f42952b 2d146cc f42952b 2d146cc f42952b 2d146cc f42952b 2d146cc f42952b 2d146cc f42952b 2d146cc f42952b 2d146cc f42952b 2d146cc f42952b 784779a f42952b 2d146cc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import gradio as gr
from encoder import inference as encoder_inference
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder_inference
import librosa
import soundfile as sf
import numpy as np
import os
# Load models at startup.
# Each model is loaded in its own try/except so that one failure is reported
# without hiding the status of the others, and the app can still start and
# surface the problem to the user instead of crashing on import.
print("Loading models...")

# Bind the name up front: if the synthesizer fails to load, later code sees
# None rather than an unbound module-level name.
synthesizer = None
_failed_models = []

try:
    encoder_inference.load_model("saved_models/encoder.pt")
    print("✓ Encoder loaded!")
except Exception as e:
    print(f"Encoder load error: {e}")
    _failed_models.append("encoder")

try:
    synthesizer = Synthesizer("saved_models/synthesizer.pt")
    print("✓ Synthesizer loaded!")
except Exception as e:
    print(f"Synthesizer load error: {e}")
    _failed_models.append("synthesizer")

try:
    vocoder_inference.load_model("saved_models/vocoder.pt")
    print("✓ Vocoder loaded!")
except Exception as e:
    print(f"Vocoder load error: {e}")
    _failed_models.append("vocoder")

# Only claim readiness when every model actually loaded.
if _failed_models:
    print(
        "WARNING: failed to load: "
        + ", ".join(_failed_models)
        + ". Voice cloning will not work until this is fixed."
    )
else:
    print("Ready for voice cloning!")
def _to_mono_float32(audio_data):
    """Return ``audio_data`` as a 1-D float32 waveform.

    Signed-integer PCM is divided by its dtype's full-scale value, so 16-bit
    input is scaled by 32768 and wider integer input is scaled accordingly.
    Any other dtype is only cast to float32. Arrays with more than one
    dimension are assumed to be laid out as (samples, channels) and are
    averaged down to a single channel.
    """
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.signedinteger):
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    return samples


def _to_int16_pcm(waveform):
    """Convert a float waveform to int16 PCM, clipping instead of wrapping."""
    clipped = np.clip(waveform, -1.0, 1.0)
    # 32767 (not 32768) keeps a sample of exactly +1.0 inside the int16 range.
    return (clipped * 32767).astype(np.int16)


def clone_voice(voice_sample, text):
    """Clone the speaker of ``voice_sample`` and speak ``text`` in that voice.

    Args:
        voice_sample: Either a ``(sample_rate, samples)`` tuple or any value
            accepted by ``librosa.load`` (for example a file path).
        text: The text to synthesize.

    Returns:
        An ``(audio, status)`` pair. On success ``audio`` is a
        ``(22050, int16_samples)`` tuple; on failure it is ``None``.
        ``status`` is a human-readable message. Exceptions raised while
        processing are caught, logged with a traceback, and reported through
        ``status`` rather than propagated.
    """
    encoder_rate = 16000  # rate the sample is resampled to before embedding

    try:
        if voice_sample is None:
            return None, "❌ Error: No voice sample provided"
        if not text or not text.strip():
            return None, "❌ Error: No text provided"

        print(f"Processing: text='{text}', voice_sample={voice_sample}")

        # Extract audio data and sample rate
        if isinstance(voice_sample, tuple):
            sr, audio_data = voice_sample
            # Down-mix and normalize first: the resampler and the encoder
            # below are both fed a single 1-D float waveform.
            wav = _to_mono_float32(audio_data)
        else:
            wav, sr = librosa.load(voice_sample, sr=encoder_rate)
        print(f"Audio loaded: sr={sr}, shape={wav.shape}")

        # Resample if needed
        if sr != encoder_rate:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=encoder_rate)

        # Preprocess audio
        wav = encoder_inference.preprocess_wav(wav)
        print(f"Preprocessed audio: {wav.shape}")

        # Generate speaker embedding
        embed = encoder_inference.embed_utterance(wav)
        print(f"Speaker embedding: {embed.shape}")

        # Synthesize
        mels = synthesizer.synthesize_spectrograms([text], [embed])
        print(f"Mel-spectrogram: {mels[0].shape}")

        # Vocode to audio
        wav_generated = vocoder_inference.vocoder(mels[0])
        print(f"Generated audio: {wav_generated.shape}")

        # NOTE(review): the 22050 Hz output rate is hard-coded; confirm it
        # matches the rate the vocoder produces, otherwise playback speed and
        # pitch will be off.
        return (22050, _to_int16_pcm(wav_generated)), "✅ Success! Your voice has been cloned!"
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"
# Static help text rendered at the bottom of the page.
_USAGE_NOTES_MD = """
---
### 📋 Instructions:
1. **Record your voice** using the microphone (5-10 seconds) OR upload a WAV/OGG file
- Speak clearly in Hindi or Kannada
- Avoid background noise
2. **Enter text** you want to generate in your voice (same language as recording)
3. **Click "Clone Voice & Generate Speech"**
4. **Wait** (10-30 seconds on CPU) and hear the result!
### 💡 Tips for Best Results:
- **Clear voice samples** = better results
- **10+ seconds** = better voice cloning accuracy
- **Same language** as input voice works best
- **Patience** - CPU processing takes time (GPU would be 2-3x faster)
- **Quality audio** - minimize background noise
### ⚠️ Limitations:
- CPU processing is slower (~10-30 seconds per request)
- Long texts (500+ characters) may timeout
- Best results with 10+ second voice samples
"""

# Create Gradio interface: inputs are declared in the first column, outputs in
# the second, followed by the action button and the help text.
# NOTE(review): the nesting is reconstructed from statement order; confirm the
# button and help text are meant to sit below the two-column row.
demo = gr.Blocks(title="Voice Cloning - Real-Time Test", theme=gr.themes.Soft())
with demo:
    # Page header.
    for heading in (
        "# 🎤 Real-Time Voice Cloning",
        "**Record your voice, enter text, and hear it synthesized in your voice!**",
        "---",
    ):
        gr.Markdown(heading)

    with gr.Row():
        # Input column: reference recording plus the text to speak.
        with gr.Column():
            gr.Markdown("### 📝 Step 1: Record Your Voice")
            gr.Markdown("Record 5-10 seconds of clear audio in **Hindi or Kannada**")
            voice_input = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="🎙️ Voice Sample (Microphone or Upload)",
            )

            gr.Markdown("### ✍️ Step 2: Enter Text")
            text_input = gr.Textbox(
                lines=3,
                placeholder="नमस्ते, यह एक परीक्षण है",
                label="📄 Text to Synthesize (Hindi or Kannada)",
            )

        # Output column: generated audio and a status line.
        with gr.Column():
            gr.Markdown("### 🔊 Step 3: Generated Speech")
            audio_output = gr.Audio(type="numpy", label="🎧 Cloned Voice Output")
            status_output = gr.Textbox(lines=2, interactive=False, label="📊 Status")

    clone_button = gr.Button(
        "🎯 Clone Voice & Generate Speech",
        size="lg",
        variant="primary",
    )
    # Wire the button to the cloning function.
    clone_button.click(
        fn=clone_voice,
        inputs=[voice_input, text_input],
        outputs=[audio_output, status_output],
    )

    gr.Markdown(_USAGE_NOTES_MD)
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )