File size: 5,053 Bytes
f42952b
784779a
 
 
 
 
f42952b
784779a
 
f42952b
784779a
2d146cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784779a
f42952b
 
784779a
f42952b
2d146cc
f42952b
 
2d146cc
 
 
784779a
f42952b
 
 
 
 
 
784779a
f42952b
784779a
f42952b
 
 
 
 
784779a
f42952b
784779a
 
 
f42952b
784779a
f42952b
784779a
f42952b
784779a
 
f42952b
 
784779a
2d146cc
784779a
 
f42952b
 
 
2d146cc
784779a
f42952b
2d146cc
 
 
 
f42952b
 
 
2d146cc
 
f42952b
2d146cc
f42952b
 
 
 
2d146cc
f42952b
2d146cc
f42952b
 
 
 
 
2d146cc
 
 
f42952b
 
 
 
 
 
 
 
 
2d146cc
 
 
 
 
 
f42952b
 
 
2d146cc
 
 
 
 
 
 
 
 
 
 
f42952b
784779a
f42952b
2d146cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import gradio as gr
from encoder import inference as encoder_inference
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder_inference
import librosa
import soundfile as sf
import numpy as np
import os

# Load all three models once at import time so every request reuses them.
# A failed load is reported but does not abort start-up: the UI still comes
# up and the failure surfaces as an error message when cloning is attempted.
#
# NOTE(review): checkpoint paths are passed as plain strings; upstream
# Real-Time-Voice-Cloning annotates these parameters as pathlib.Path —
# confirm this project's loaders accept str.
print("Loading models...")

# Bind the name up front. If construction fails below, `synthesizer` would
# otherwise never be defined and later use would raise a NameError instead
# of a clear "model not loaded" style failure.
synthesizer = None
_models_ok = True

try:
    encoder_inference.load_model("saved_models/encoder.pt")
    print("✓ Encoder loaded!")
except Exception as e:
    _models_ok = False
    print(f"Encoder load error: {e}")

try:
    synthesizer = Synthesizer("saved_models/synthesizer.pt")
    print("✓ Synthesizer loaded!")
except Exception as e:
    _models_ok = False
    print(f"Synthesizer load error: {e}")

try:
    vocoder_inference.load_model("saved_models/vocoder.pt")
    print("✓ Vocoder loaded!")
except Exception as e:
    _models_ok = False
    print(f"Vocoder load error: {e}")

# Only claim readiness when every model actually loaded.
if _models_ok:
    print("Ready for voice cloning!")
else:
    print("Started with model load errors - voice cloning may fail (see messages above).")

def _to_mono_float32(samples):
    """Convert raw audio samples to a mono float32 waveform.

    Integer PCM is scaled by the full-scale value of its own dtype (32768 for
    int16, 2**31 for int32), and multi-channel input is averaged down to a
    single channel.
    """
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        wav = samples.astype(np.float32) / full_scale
    else:
        # assumes float samples are already normalised to [-1, 1] — TODO confirm
        wav = samples.astype(np.float32)
    if wav.ndim > 1:
        # assumes the (samples, channels) layout Gradio documents for numpy
        # audio; without down-mixing, resampling would run along the channel
        # axis instead of time.
        wav = wav.mean(axis=1)
    return wav


def _to_int16_pcm(wav):
    """Convert a float waveform to int16 PCM, clipping instead of wrapping."""
    clipped = np.clip(np.asarray(wav, dtype=np.float32), -1.0, 1.0)
    # Scale by 32767 so a sample of exactly 1.0 stays inside the int16 range.
    return (clipped * 32767).astype(np.int16)


def clone_voice(voice_sample, text):
    """Clone the speaker of ``voice_sample`` and speak ``text`` in that voice.

    Args:
        voice_sample: ``None``, a ``(sample_rate, samples)`` tuple, or any
            other value, which is handed to ``librosa.load`` as-is.
        text: Text to synthesize.

    Returns:
        A 2-tuple ``(audio, status)``. On success ``audio`` is
        ``(22050, int16_samples)`` and ``status`` is a success message; on any
        failure ``audio`` is ``None`` and ``status`` describes the error.
        Exceptions are caught, printed with a traceback, and reported through
        ``status`` rather than raised.
    """
    try:
        if voice_sample is None:
            return None, "❌ Error: No voice sample provided"

        if not text or not text.strip():
            return None, "❌ Error: No text provided"

        print(f"Processing: text='{text}', voice_sample={voice_sample}")

        # Extract audio data and sample rate
        if isinstance(voice_sample, tuple):
            sr, audio_data = voice_sample
            wav = _to_mono_float32(audio_data)
        else:
            wav, sr = librosa.load(voice_sample, sr=16000)

        print(f"Audio loaded: sr={sr}, shape={wav.shape}")

        # Fail early with a readable message instead of an opaque model error.
        if wav.size == 0:
            return None, "❌ Error: Voice sample is empty"

        # Everything downstream is fed 16 kHz audio.
        if sr != 16000:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)

        # Preprocess audio
        wav = encoder_inference.preprocess_wav(wav)
        print(f"Preprocessed audio: {wav.shape}")

        # Generate speaker embedding
        embed = encoder_inference.embed_utterance(wav)
        print(f"Speaker embedding: {embed.shape}")

        # Synthesize (one text, one embedding -> one mel-spectrogram)
        mels = synthesizer.synthesize_spectrograms([text], [embed])
        print(f"Mel-spectrogram: {mels[0].shape}")

        # Vocode to audio
        # NOTE(review): upstream Real-Time-Voice-Cloning exposes
        # `infer_waveform` on vocoder.inference; confirm this project's module
        # really provides `vocoder`.
        wav_generated = vocoder_inference.vocoder(mels[0])
        print(f"Generated audio: {wav_generated.shape}")

        # NOTE(review): 22050 Hz is hard-coded; confirm it matches the actual
        # output rate of the synthesizer/vocoder.
        return (22050, _to_int16_pcm(wav_generated)), "✅ Success! Your voice has been cloned!"

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"

# Build the web UI: inputs in the left column, results in the right column,
# then the action button and a static help section underneath.
with gr.Blocks(theme=gr.themes.Soft(), title="Voice Cloning - Real-Time Test") as demo:
    # Page header
    gr.Markdown("# 🎤 Real-Time Voice Cloning")
    gr.Markdown("**Record your voice, enter text, and hear it synthesized in your voice!**")
    gr.Markdown("---")

    with gr.Row():
        # Left column: reference voice and the text to speak
        with gr.Column():
            gr.Markdown("### 📝 Step 1: Record Your Voice")
            gr.Markdown("Record 5-10 seconds of clear audio in **Hindi or Kannada**")
            voice_input = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="🎙️ Voice Sample (Microphone or Upload)",
            )

            gr.Markdown("### ✍️ Step 2: Enter Text")
            text_input = gr.Textbox(
                lines=3,
                placeholder="नमस्ते, यह एक परीक्षण है",
                label="📄 Text to Synthesize (Hindi or Kannada)",
            )

        # Right column: synthesized audio and a status line
        with gr.Column():
            gr.Markdown("### 🔊 Step 3: Generated Speech")
            audio_output = gr.Audio(type="numpy", label="🎧 Cloned Voice Output")
            status_output = gr.Textbox(lines=2, interactive=False, label="📊 Status")

    # The button feeds both inputs to clone_voice and shows its two results.
    clone_button = gr.Button(
        "🎯 Clone Voice & Generate Speech",
        size="lg",
        variant="primary",
    )
    clone_button.click(
        fn=clone_voice,
        inputs=[voice_input, text_input],
        outputs=[audio_output, status_output],
    )

    # Static usage notes shown below the controls
    gr.Markdown("""
    ---
    ### 📋 Instructions:
    1. **Record your voice** using the microphone (5-10 seconds) OR upload a WAV/OGG file
       - Speak clearly in Hindi or Kannada
       - Avoid background noise
    2. **Enter text** you want to generate in your voice (same language as recording)
    3. **Click "Clone Voice & Generate Speech"**
    4. **Wait** (10-30 seconds on CPU) and hear the result!
    
    ### 💡 Tips for Best Results:
    - **Clear voice samples** = better results
    - **10+ seconds** = better voice cloning accuracy
    - **Same language** as input voice works best
    - **Patience** - CPU processing takes time (GPU would be 2-3x faster)
    - **Quality audio** - minimize background noise
    
    ### ⚠️ Limitations:
    - CPU processing is slower (~10-30 seconds per request)
    - Long texts (500+ characters) may timeout
    - Best results with 10+ second voice samples
    """)

if __name__ == "__main__":
    # Serve on all interfaces at port 7860, without a public share link.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )