File size: 6,638 Bytes
f42952b
784779a
 
 
 
f42952b
784779a
1fae304
 
 
 
 
 
 
 
 
 
 
784779a
f42952b
784779a
3fb66d9
1fae304
 
3fb66d9
2d146cc
1fae304
2d146cc
 
 
 
 
3fb66d9
2d146cc
 
 
 
 
784779a
f42952b
 
784779a
f42952b
2d146cc
f42952b
 
2d146cc
 
d0850d2
784779a
f42952b
 
 
 
 
2c37cc3
784779a
2c37cc3
 
 
 
784779a
f42952b
2c37cc3
f42952b
2c37cc3
 
f42952b
 
784779a
f42952b
784779a
 
 
f42952b
784779a
f42952b
784779a
f42952b
784779a
1fae304
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784779a
2d146cc
784779a
 
f42952b
 
 
2d146cc
784779a
f42952b
2d146cc
 
 
 
f42952b
 
 
2d146cc
 
f42952b
2d146cc
f42952b
 
 
 
2d146cc
f42952b
2d146cc
f42952b
 
 
 
 
2d146cc
 
 
f42952b
 
 
 
 
 
 
 
 
2d146cc
 
 
 
 
 
f42952b
 
 
2d146cc
 
 
 
 
 
 
 
 
 
 
f42952b
784779a
f42952b
2d146cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import gradio as gr
from encoder import inference as encoder_inference
from synthesizer.inference import Synthesizer
import librosa
import soundfile as sf
import numpy as np
import os
import torch

# Optional neural vocoder: prefer HiFi-GAN, fall back to Griffin-Lim when
# speechbrain or the pretrained weights are unavailable.
vocoder = None
try:
    from speechbrain.inference.vocoders import HIFIGAN

    vocoder = HIFIGAN.from_hparams(
        source="speechbrain/tts-hifigan-ljspeech",
        savedir="pretrained_models/hifigan",
        run_opts={"device": "cpu"},
    )
    print("✓ HiFi-GAN vocoder loaded!")
except Exception as e:
    # Any failure (missing package, no network, bad weights) downgrades to
    # the Griffin-Lim path inside clone_voice.
    print(f"HiFi-GAN load error: {e}, will use Griffin-Lim fallback")
    vocoder = None

# Load the speaker encoder and the text-to-mel synthesizer once at startup.
print("Loading models...")

encoder_path = "saved_models/encoder.pt"
synthesizer_path = "saved_models/synthesizer.pt"

# Pre-bind so a failed load leaves a well-defined None instead of an unbound
# name (originally a failed Synthesizer(...) call meant clone_voice later
# raised a confusing NameError instead of a clear error).
synthesizer = None

try:
    encoder_inference.load_model(encoder_path)
    print("✓ Encoder loaded!")
except Exception as e:
    print(f"Encoder load error: {e}")

try:
    synthesizer = Synthesizer(synthesizer_path)
    print("✓ Synthesizer loaded!")
except Exception as e:
    print(f"Synthesizer load error: {e}")

print("Ready for voice cloning!")

def clone_voice(voice_sample, text):
    """Clone the speaker of *voice_sample* and synthesize *text* in that voice.

    Args:
        voice_sample: Either a ``(sample_rate, np.ndarray)`` tuple (what a
            Gradio ``Audio(type="numpy")`` component emits) or a filepath
            string loadable by librosa.
        text: Text to synthesize (expected in the same language as the sample).

    Returns:
        ``((22050, int16 ndarray), success_message)`` on success, or
        ``(None, error_message)`` on any failure (errors never propagate —
        this is the UI boundary handler).
    """
    try:
        if voice_sample is None:
            return None, "❌ Error: No voice sample provided"

        if not text or len(text.strip()) == 0:
            return None, "❌ Error: No text provided"

        print(f"Processing: text='{text}'")

        # Extract audio data and sample rate.
        if isinstance(voice_sample, tuple):
            sr, audio_data = voice_sample
            # Scale by the actual integer dtype's range instead of assuming
            # int16 (Gradio may deliver int32 or float arrays depending on
            # the source); float input is already in [-1, 1].
            if np.issubdtype(audio_data.dtype, np.integer):
                wav = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
            else:
                wav = audio_data.astype(np.float32)
        else:
            wav, sr = librosa.load(voice_sample, sr=None)

        # Downmix stereo recordings to mono — the encoder expects 1-D audio,
        # and a 2-D array would break resampling/preprocessing below.
        if wav.ndim > 1:
            wav = wav.mean(axis=1)

        print(f"Audio loaded: sr={sr}, shape={wav.shape}, duration={len(wav)/sr:.2f}s")

        if len(wav) < sr:
            # Threshold is 1 second; the message now matches the check
            # (it previously asked for "at least 2 seconds").
            return None, f"❌ Error: Audio too short ({len(wav)/sr:.2f}s). Please record at least 1 second."

        # Resample to the encoder's 16 kHz; a buffer of <=100 samples at a
        # non-16k rate is treated as corrupt rather than resampled.
        if sr != 16000 and len(wav) > 100:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
        elif sr != 16000:
            return None, f"❌ Error: Audio file corrupted or invalid (only {len(wav)} samples)"

        # Trim/normalize for the speaker encoder.
        wav = encoder_inference.preprocess_wav(wav)
        print(f"Preprocessed audio: {wav.shape}")

        # Fixed-size speaker embedding conditioning the synthesizer.
        embed = encoder_inference.embed_utterance(wav)
        print(f"Speaker embedding: {embed.shape}")

        # Text + embedding -> mel-spectrogram.
        mels = synthesizer.synthesize_spectrograms([text], [embed])
        print(f"Mel-spectrogram: {mels[0].shape}")

        # Mel -> waveform: HiFi-GAN when available, Griffin-Lim otherwise.
        if vocoder is not None:
            try:
                mel_spec_tensor = torch.from_numpy(mels[0]).unsqueeze(0).float()
                with torch.no_grad():
                    wav_generated = vocoder.decode_batch(mel_spec_tensor)
                wav_generated = wav_generated.squeeze().cpu().numpy()
                print(f"Generated audio with HiFi-GAN: {wav_generated.shape}")
            except Exception as e:
                # Best-effort: a vocoder failure degrades to Griffin-Lim
                # rather than failing the request.
                print(f"HiFi-GAN failed: {e}, using Griffin-Lim fallback")
                wav_generated = librosa.feature.inverse.mel_to_audio(mels[0], sr=22050, n_iter=32)
        else:
            print("Using Griffin-Lim vocoder (fallback)")
            wav_generated = librosa.feature.inverse.mel_to_audio(mels[0], sr=22050, n_iter=32)

        # Peak-normalize to 95% full scale (guard against all-zero output).
        if np.max(np.abs(wav_generated)) > 0:
            wav_generated = wav_generated / np.max(np.abs(wav_generated)) * 0.95

        print(f"Generated audio: {wav_generated.shape}, range: {np.min(wav_generated):.4f} to {np.max(np.abs(wav_generated)):.4f}")

        return (22050, (wav_generated * 32768).astype(np.int16)), "✅ Success! Your voice has been cloned!"

    except Exception as e:
        # UI boundary: report instead of raising so Gradio shows the message.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"

# --- Gradio UI -------------------------------------------------------------
# Component creation order defines the on-page layout, so the structure below
# mirrors the rendered page top-to-bottom.
with gr.Blocks(title="Voice Cloning - Real-Time Test", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown("# 🎤 Real-Time Voice Cloning")
    gr.Markdown("**Record your voice, enter text, and hear it synthesized in your voice!**")
    gr.Markdown("---")

    with gr.Row():
        # Left column: inputs (reference audio + text to synthesize).
        with gr.Column():
            gr.Markdown("### 📝 Step 1: Record Your Voice")
            gr.Markdown("Record 5-10 seconds of clear audio in **Hindi or Kannada**")
            reference_audio = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="🎙️ Voice Sample (Microphone or Upload)",
            )

            gr.Markdown("### ✍️ Step 2: Enter Text")
            target_text = gr.Textbox(
                lines=3,
                placeholder="नमस्ते, यह एक परीक्षण है",
                label="📄 Text to Synthesize (Hindi or Kannada)",
            )

        # Right column: outputs (generated audio + status line).
        with gr.Column():
            gr.Markdown("### 🔊 Step 3: Generated Speech")
            generated_audio = gr.Audio(label="🎧 Cloned Voice Output", type="numpy")
            status_box = gr.Textbox(label="📊 Status", interactive=False, lines=2)

    # Single action button wiring both inputs into clone_voice.
    generate_btn = gr.Button("🎯 Clone Voice & Generate Speech", variant="primary", size="lg")
    generate_btn.click(
        clone_voice,
        inputs=[reference_audio, target_text],
        outputs=[generated_audio, status_box],
    )

    gr.Markdown("""
    ---
    ### 📋 Instructions:
    1. **Record your voice** using the microphone (5-10 seconds) OR upload a WAV/OGG file
       - Speak clearly in Hindi or Kannada
       - Avoid background noise
    2. **Enter text** you want to generate in your voice (same language as recording)
    3. **Click "Clone Voice & Generate Speech"**
    4. **Wait** (10-30 seconds on CPU) and hear the result!
    
    ### 💡 Tips for Best Results:
    - **Clear voice samples** = better results
    - **10+ seconds** = better voice cloning accuracy
    - **Same language** as input voice works best
    - **Patience** - CPU processing takes time (GPU would be 2-3x faster)
    - **Quality audio** - minimize background noise
    
    ### ⚠️ Limitations:
    - CPU processing is slower (~10-30 seconds per request)
    - Long texts (500+ characters) may timeout
    - Best results with 10+ second voice samples
    """)

if __name__ == "__main__":
    # Bind on all interfaces so the app is reachable inside containers/VMs.
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)