# callentrin_test / app.py
# rishidahiya's picture
# Update app.py
# 2d146cc verified
import gradio as gr
from encoder import inference as encoder_inference
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder_inference
import librosa
import soundfile as sf
import numpy as np
import os
# Load models at startup.
#
# Each model is loaded independently so one failure does not prevent the
# others from being attempted. Failures are recorded so the final status
# line reflects what actually happened instead of always claiming readiness.
print("Loading models...")

# Bound up front: if the synthesizer fails to load, the name still exists
# (as None) rather than being left undefined for the request handler.
synthesizer = None
_failed_models = []

try:
    encoder_inference.load_model("saved_models/encoder.pt")
    print("✓ Encoder loaded!")
except Exception as e:
    _failed_models.append("encoder")
    print(f"Encoder load error: {e}")

try:
    synthesizer = Synthesizer("saved_models/synthesizer.pt")
    print("✓ Synthesizer loaded!")
except Exception as e:
    _failed_models.append("synthesizer")
    print(f"Synthesizer load error: {e}")

try:
    vocoder_inference.load_model("saved_models/vocoder.pt")
    print("✓ Vocoder loaded!")
except Exception as e:
    _failed_models.append("vocoder")
    print(f"Vocoder load error: {e}")

if _failed_models:
    # Keep the app running (best effort, as before) but do not claim success.
    print(
        "Warning: failed to load "
        + ", ".join(_failed_models)
        + "; voice cloning requests will fail until this is fixed."
    )
else:
    print("Ready for voice cloning!")
# Sample rate (Hz) that the reference audio is converted to before it is
# handed to the encoder.
_ENCODER_SAMPLE_RATE = 16000

# Sample rate (Hz) reported to Gradio for the generated audio.
# NOTE(review): hard-coded; confirm it matches the rate the vocoder actually
# produces, otherwise playback speed and pitch will be off.
_OUTPUT_SAMPLE_RATE = 22050


def _to_mono_float32(audio_data):
    """Return *audio_data* as a mono float32 waveform.

    Signed-integer PCM is scaled by its own dtype's full-scale value (32768
    for int16), so int32 input is not mis-scaled. Other dtypes are only cast
    to float32 and are presumed to already be normalised.
    """
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.signedinteger):
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # assumes multi-channel audio is laid out as (samples, channels),
        # per the Gradio Audio docs — TODO confirm for the installed version
        samples = samples.mean(axis=1)
    return samples


def _float_to_int16(wav):
    """Convert a float waveform to int16 PCM, clipping instead of wrapping."""
    clipped = np.clip(np.asarray(wav, dtype=np.float32), -1.0, 1.0)
    # 32767 keeps +1.0 inside the int16 range; 32768 would overflow.
    return (clipped * 32767.0).astype(np.int16)


def clone_voice(voice_sample, text):
    """Clone the voice in *voice_sample* and speak *text* with it.

    Args:
        voice_sample: Either a ``(sample_rate, samples)`` tuple or a value
            that ``librosa.load`` accepts (e.g. a file path). ``None`` is
            rejected.
        text: Text to synthesize. Empty or whitespace-only text is rejected.

    Returns:
        A ``(audio, status)`` pair. On success ``audio`` is
        ``(sample_rate, int16_samples)`` and ``status`` is a success message.
        On any failure ``audio`` is ``None`` and ``status`` describes the
        error; exceptions are caught and reported rather than raised.
    """
    try:
        if voice_sample is None:
            return None, "❌ Error: No voice sample provided"
        if not text or not text.strip():
            return None, "❌ Error: No text provided"

        print(f"Processing: text='{text}', voice_sample={voice_sample}")

        # Extract audio data and sample rate
        if isinstance(voice_sample, tuple):
            sr, audio_data = voice_sample
            wav = _to_mono_float32(audio_data)
        else:
            wav, sr = librosa.load(voice_sample, sr=_ENCODER_SAMPLE_RATE)

        if wav.size == 0:
            return None, "❌ Error: Voice sample is empty"
        print(f"Audio loaded: sr={sr}, shape={wav.shape}")

        # Resample if needed
        if sr != _ENCODER_SAMPLE_RATE:
            wav = librosa.resample(
                wav, orig_sr=sr, target_sr=_ENCODER_SAMPLE_RATE
            )

        # Preprocess audio
        wav = encoder_inference.preprocess_wav(wav)
        print(f"Preprocessed audio: {wav.shape}")

        # Generate speaker embedding
        embed = encoder_inference.embed_utterance(wav)
        print(f"Speaker embedding: {embed.shape}")

        # Synthesize: one text with one embedding; the first result is used
        mels = synthesizer.synthesize_spectrograms([text], [embed])
        print(f"Mel-spectrogram: {mels[0].shape}")

        # Vocode to audio
        wav_generated = vocoder_inference.vocoder(mels[0])
        print(f"Generated audio: {wav_generated.shape}")

        return (
            (_OUTPUT_SAMPLE_RATE, _float_to_int16(wav_generated)),
            "✅ Success! Your voice has been cloned!",
        )
    except Exception as e:
        # Top-level UI boundary: log the full traceback to the console and
        # surface a short message in the status box instead of crashing.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"
# Build the Gradio UI: inputs on the left, generated output on the right,
# with the action button and usage notes underneath.
with gr.Blocks(title="Voice Cloning - Real-Time Test", theme=gr.themes.Soft()) as demo:
    # Page header
    gr.Markdown("# 🎤 Real-Time Voice Cloning")
    gr.Markdown("**Record your voice, enter text, and hear it synthesized in your voice!**")
    gr.Markdown("---")

    with gr.Row():
        # Left column: reference voice and the text to speak
        with gr.Column():
            gr.Markdown("### 📝 Step 1: Record Your Voice")
            gr.Markdown("Record 5-10 seconds of clear audio in **Hindi or Kannada**")
            voice_input = gr.Audio(
                label="🎙️ Voice Sample (Microphone or Upload)",
                type="numpy",
                sources=["microphone", "upload"],
            )

            gr.Markdown("### ✍️ Step 2: Enter Text")
            text_input = gr.Textbox(
                label="📄 Text to Synthesize (Hindi or Kannada)",
                placeholder="नमस्ते, यह एक परीक्षण है",
                lines=3,
            )

        # Right column: synthesized audio plus a read-only status line
        with gr.Column():
            gr.Markdown("### 🔊 Step 3: Generated Speech")
            audio_output = gr.Audio(
                label="🎧 Cloned Voice Output",
                type="numpy",
            )
            status_output = gr.Textbox(
                label="📊 Status",
                interactive=False,
                lines=2,
            )

    # The button feeds both inputs to clone_voice and routes its two return
    # values to the audio player and the status box, in that order.
    clone_button = gr.Button(
        "🎯 Clone Voice & Generate Speech",
        variant="primary",
        size="lg",
    )
    clone_button.click(
        fn=clone_voice,
        inputs=[voice_input, text_input],
        outputs=[audio_output, status_output],
    )

    # Static usage notes shown at the bottom of the page
    gr.Markdown("""
---
### 📋 Instructions:
1. **Record your voice** using the microphone (5-10 seconds) OR upload a WAV/OGG file
- Speak clearly in Hindi or Kannada
- Avoid background noise
2. **Enter text** you want to generate in your voice (same language as recording)
3. **Click "Clone Voice & Generate Speech"**
4. **Wait** (10-30 seconds on CPU) and hear the result!
### 💡 Tips for Best Results:
- **Clear voice samples** = better results
- **10+ seconds** = better voice cloning accuracy
- **Same language** as input voice works best
- **Patience** - CPU processing takes time (GPU would be 2-3x faster)
- **Quality audio** - minimize background noise
### ⚠️ Limitations:
- CPU processing is slower (~10-30 seconds per request)
- Long texts (500+ characters) may timeout
- Best results with 10+ second voice samples
""")
if __name__ == "__main__":
    # Serve on all network interfaces at port 7860, with no public share link.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )