File size: 9,805 Bytes
f758d08
2c8d218
9a34a5d
19173b4
 
b3986a9
962aa9c
3ad5343
5f03eaa
 
bba9fab
b3986a9
27e1662
ba703e9
962aa9c
 
75fb8ef
962aa9c
75fb8ef
962aa9c
 
75fb8ef
962aa9c
 
 
 
b44fd2c
5f03eaa
825c475
3ad5343
 
5f03eaa
4857e6a
6465ea7
75fb8ef
95bd2d0
3ad5343
75fb8ef
 
7d67cb5
75fb8ef
 
 
5f03eaa
75fb8ef
 
3e9e2ab
75fb8ef
 
5f03eaa
 
75fb8ef
 
 
5f03eaa
75fb8ef
1879a3e
3ad5343
75fb8ef
 
 
1879a3e
75fb8ef
3ad5343
5f03eaa
75fb8ef
1879a3e
75fb8ef
 
7d67cb5
ba703e9
5280410
5f03eaa
 
 
 
3e9e2ab
3ad5343
 
 
5f03eaa
3ad5343
5f03eaa
3ad5343
5f03eaa
3ad5343
 
 
 
 
 
5f03eaa
ba703e9
3ad5343
5f03eaa
 
 
ba703e9
5f03eaa
 
 
 
 
 
 
 
 
 
 
 
 
ba703e9
5f03eaa
 
 
 
 
ba703e9
5f03eaa
 
 
 
 
 
ba703e9
 
5f03eaa
 
 
 
ba703e9
5f03eaa
 
71d678c
5f03eaa
ba703e9
5f03eaa
 
ba703e9
5f03eaa
 
 
ba703e9
5f03eaa
ba703e9
5f03eaa
 
ba703e9
71d678c
3ad5343
 
ba703e9
5f03eaa
3ad5343
71d678c
5f03eaa
 
 
ba703e9
71d678c
5f03eaa
 
1879a3e
 
5f03eaa
3ad5343
 
3e9e2ab
 
3ad5343
 
5f03eaa
 
ba703e9
5f03eaa
1879a3e
5f03eaa
 
 
 
71d678c
962aa9c
 
71d678c
5f03eaa
 
 
 
 
 
 
 
 
 
 
 
 
 
3ad5343
ba703e9
3ad5343
 
 
 
ba703e9
3ad5343
5f03eaa
 
 
 
ba703e9
 
 
 
5f03eaa
 
ba703e9
 
5f03eaa
3ad5343
af41746
ba703e9
71d678c
e6e0279
5f03eaa
71d678c
5f03eaa
 
71d678c
962aa9c
ba703e9
71d678c
ba703e9
71d678c
 
5f03eaa
ba703e9
71d678c
 
5f03eaa
ba703e9
71d678c
 
3ad5343
71d678c
 
 
 
 
5f03eaa
 
ba703e9
5f03eaa
 
 
ba703e9
5f03eaa
71d678c
ba703e9
 
71d678c
 
ba703e9
71d678c
a1bb412
75fb8ef
ba703e9
3e9e2ab
ba703e9
3ad5343
ba703e9
5f03eaa
 
3ad5343
71d678c
 
 
3ad5343
ba703e9
71d678c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
import gradio as gr
import torch
import torchaudio
import tempfile
import os
import warnings
from contextlib import contextmanager
import gc
import librosa
import soundfile as sf

# Install a blanket "ignore" filter: no Python warnings are shown from here on.
warnings.filterwarnings("ignore")
# Must be set before TTS is imported (the import happens lazily inside
# load_xtts_optimized). Presumably this pre-accepts the Coqui terms-of-service
# prompt so the model download does not block on input -- TODO confirm.
os.environ["COQUI_TOS_AGREED"] = "1"
# Startup banner, printed at import time.
print("πŸš€ Starting FINAL CORRECTED Voice Cloning Studio...")

@contextmanager
def patch_torch_load():
    """Temporarily swap ``torch.load`` for a wrapper that forces ``weights_only=False``.

    While the ``with`` body runs, every call to ``torch.load`` is forwarded to
    the real loader with ``weights_only`` overridden to ``False`` -- even when
    the caller passed a different value. The real loader is put back on exit,
    whether the body finished normally or raised.

    NOTE(review): ``weights_only=False`` turns off torch's restricted
    unpickling, so this should only wrap loads of trusted checkpoints.
    """
    saved_loader = torch.load

    def _load_full_pickle(f, *args, **kwargs):
        # Caller-supplied keyword arguments pass through untouched except for
        # weights_only, which is always replaced.
        return saved_loader(f, *args, **{**kwargs, 'weights_only': False})

    torch.load = _load_full_pickle
    try:
        yield
    finally:
        # Always restore, so the patch never outlives the with-block.
        torch.load = saved_loader

# Hardware setup: pick the GPU when CUDA is available, otherwise run on CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"πŸ”₯ Device: {DEVICE}")

# Global model variables. Both models start unloaded and are filled in on
# first use by the loader functions defined below.
TTS_MODEL = None  # assigned by load_xtts_optimized()
WHISPER_MODEL = None  # assigned by load_whisper_optimized()
MODEL_STATUS = "Not Loaded"  # XTTS load state text; included in user-facing status messages

def load_xtts_optimized():
    """Make sure the XTTS-v2 model is loaded into ``TTS_MODEL``.

    The model is created at most once; later calls return immediately.
    ``MODEL_STATUS`` is updated to describe the outcome.

    Returns:
        True when the model is available (already loaded or loaded now),
        False when loading raised; the error text is stored in ``MODEL_STATUS``.
    """
    global TTS_MODEL, MODEL_STATUS

    if TTS_MODEL is None:
        try:
            # Both the import and the model construction run under the
            # torch.load patch.
            with patch_torch_load():
                from TTS.api import TTS
                print("πŸ“¦ Loading XTTS...")
                use_gpu = DEVICE == "cuda"
                TTS_MODEL = TTS(
                    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                    progress_bar=False,
                    gpu=use_gpu,
                )
                MODEL_STATUS = "XTTS-v2 Ready"
                print("βœ… XTTS loaded successfully!")
        except Exception as e:
            print(f"❌ XTTS loading failed: {e}")
            MODEL_STATUS = f"XTTS Failed: {str(e)}"
            return False

    return True

def load_whisper_optimized():
    """Make sure the Whisper "base" model is loaded into ``WHISPER_MODEL``.

    The model is loaded on ``DEVICE`` the first time this is called; later
    calls are no-ops.

    Returns:
        True when the model is available, False when importing or loading
        Whisper raised (the error is printed, not propagated).
    """
    global WHISPER_MODEL

    if WHISPER_MODEL is None:
        try:
            # Imported lazily so the module can be imported without whisper.
            import whisper
            WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
            print("βœ… Whisper loaded!")
        except Exception as e:
            print(f"❌ Whisper failed: {e}")
            return False

    return True

def optimize_audio_input(audio_path, max_duration=25):
    """Load an audio file at 22050 Hz, cap its length, and save it as a new WAV.

    This is best effort: any failure is printed and the original path is
    handed back so the caller can continue with the unprocessed file.

    Args:
        audio_path: Path of the audio file to process.
        max_duration: Maximum length to keep, in seconds. Longer audio is
            truncated to its first ``max_duration`` seconds.

    Returns:
        Path of the written ``<stem>_opt.wav`` file, or ``audio_path``
        unchanged when the file does not exist or processing failed.
    """
    try:
        if not os.path.exists(audio_path):
            print(f"⚠️ Audio file not found: {audio_path}")
            return audio_path

        audio, sr = librosa.load(audio_path, sr=22050)
        max_samples = int(max_duration * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f"πŸ”„ Audio trimmed to {max_duration}s")

        # Derive the output name from the file stem rather than by replacing
        # ".wav"/".mp3" substrings: that left other extensions (".flac",
        # ".WAV", ...) unchanged, so the source file itself was overwritten,
        # and it also rewrote matching text inside directory names.
        stem, _ = os.path.splitext(audio_path)
        optimized_path = f"{stem}_opt.wav"
        sf.write(optimized_path, audio, sr)
        print(f"βœ… Audio optimized: {optimized_path}")
        return optimized_path

    except Exception as e:
        # Deliberate catch-all: optimization is optional, never fatal.
        print(f"⚠️ Audio optimization failed: {e}")
        return audio_path

def safe_file_path(file_input, input_name="audio"):
    """Extract an existing file path from the various shapes an upload can take.

    Supported inputs, checked in this order:
      * ``None`` -> ``None``
      * a ``str`` or ``os.PathLike`` path
      * an object with a ``name`` attribute holding a path
      * a dict-like object with a ``name`` and/or ``path`` entry

    Args:
        file_input: The value received for an uploaded file.
        input_name: Label used in diagnostic messages only.

    Returns:
        The path of a file that exists on disk, or ``None`` when no such
        path could be extracted. Never raises; errors are printed.
    """
    try:
        if file_input is None:
            return None

        # Plain path, given as a string or a path object. PathLike objects are
        # handled here because their `.name` attribute is only the final
        # component and must not be mistaken for a full path further down.
        if isinstance(file_input, (str, os.PathLike)):
            file_path = os.fspath(file_input)
            if os.path.exists(file_path):
                return file_path
            print(f"⚠️ File path doesn't exist: {file_input}")
            return None

        # File-like object exposing its location through `.name`
        if hasattr(file_input, 'name'):
            file_path = file_input.name
            if file_path and os.path.exists(file_path):
                return file_path

        # Dict-like object: try each candidate key in turn, so a stale 'name'
        # entry does not hide a valid 'path' entry.
        if hasattr(file_input, 'get'):
            for key in ('name', 'path'):
                file_path = file_input.get(key)
                if file_path and os.path.exists(file_path):
                    return file_path

        print(f"⚠️ Could not extract file path from {input_name}: {type(file_input)}")
        return None

    except Exception as e:
        # Boundary handler: a malformed upload must not crash the request.
        print(f"❌ Error processing {input_name}: {e}")
        return None

def _discard_file(path):
    """Delete *path*; a missing file or failed delete is deliberately ignored."""
    try:
        os.remove(path)
    except OSError:
        # Cleanup is best effort and must never mask the real result.
        pass


def voice_to_voice_clone_final(reference_audio, input_audio, language="en"):
    """Re-speak the content of one recording in the voice of another.

    Steps: resolve both uploads to file paths, load the models, trim and
    resample both files, transcribe the input audio with Whisper (a fixed
    demo sentence is used when Whisper is unavailable or yields too little
    text), then synthesize the text with XTTS using the reference audio as
    the speaker sample.

    Args:
        reference_audio: Upload providing the voice to imitate.
        input_audio: Upload whose spoken content is transcribed and re-spoken.
        language: Language code passed to both Whisper and XTTS.

    Returns:
        ``(output_path, message)``: ``output_path`` is the generated WAV file
        on success and ``None`` on failure; ``message`` is a status text for
        the user. The function never raises.
    """
    # Tracked outside the try so every exit path can clean up after itself.
    scratch_files = []
    output_path = None
    try:
        print(f"🎭 Voice cloning request: {language}")
        print(f"πŸ“ Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")
        
        # Extract file paths safely
        reference_path = safe_file_path(reference_audio, "reference")
        input_path = safe_file_path(input_audio, "input")
        
        if not reference_path:
            return None, "❌ Could not process reference audio file."
        
        if not input_path:
            return None, "❌ Could not process input audio file."
        
        print(f"πŸ“ Processing files - Ref: {reference_path}, Input: {input_path}")
        
        # Validate files: anything under 1000 bytes is treated as unusable
        if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
            return None, "❌ Reference audio file is invalid."
            
        if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
            return None, "❌ Input audio file is invalid."
        
        # Load models: XTTS is mandatory, Whisper is optional
        if not load_xtts_optimized():
            return None, f"❌ XTTS model failed: {MODEL_STATUS}"
        
        load_whisper_optimized()
        
        # Optimize audio files
        print("πŸ”„ Optimizing audio files...")
        ref_optimized = optimize_audio_input(reference_path, max_duration=20)
        input_optimized = optimize_audio_input(input_path, max_duration=25)
        # optimize_audio_input hands back the original path when it could not
        # write a copy; only paths that differ from the source are
        # intermediates that this function owns and may delete.
        scratch_files = [
            optimized
            for optimized, source in (
                (ref_optimized, reference_path),
                (input_optimized, input_path),
            )
            if optimized != source
        ]
        
        # Transcribe input audio; this sentence is the fallback text
        extracted_text = "This is a voice cloning demonstration."
        if WHISPER_MODEL:
            try:
                print("🎀 Transcribing audio...")
                with torch.no_grad():
                    result = WHISPER_MODEL.transcribe(
                        input_optimized,
                        fp16=(DEVICE == "cuda"),
                        language=language if language != 'auto' else None
                    )
                text = result.get("text", "").strip()
                if text and len(text) > 5:
                    # Cap the text handed to the synthesizer at 400 characters
                    extracted_text = text[:400]
                print(f"βœ… Transcribed: '{extracted_text[:50]}...'")
            except Exception as e:
                # Transcription problems are non-fatal: keep the fallback text
                print(f"⚠️ Transcription warning: {e}")
        
        # Generate cloned voice
        print("πŸš€ Generating cloned voice...")
        
        # delete=False: the file must outlive this function so the caller can
        # read it; it is removed below on every failure path instead.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        
        try:
            with patch_torch_load(), torch.no_grad():
                TTS_MODEL.tts_to_file(
                    text=extracted_text,
                    speaker_wav=ref_optimized,
                    language=language,
                    file_path=output_path,
                    temperature=0.7,
                    length_penalty=1.0,
                    repetition_penalty=5.0
                )
        except Exception as tts_error:
            print(f"❌ TTS generation error: {tts_error}")
            _discard_file(output_path)
            return None, f"❌ Voice generation failed: {str(tts_error)}"
        finally:
            # Memory cleanup runs whether generation succeeded or failed
            if DEVICE == "cuda":
                torch.cuda.empty_cache()
            gc.collect()
        
        # Validate and return output
        if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
            file_size_kb = os.path.getsize(output_path) / 1024
            
            success_message = f"""βœ… VOICE CLONING SUCCESS! πŸŽ‰

πŸ“ Text: "{extracted_text[:100]}{'...' if len(extracted_text) > 100 else ''}"
🎭 Device: {DEVICE} | Model: {MODEL_STATUS}
πŸ“Š Output: {file_size_kb:.1f} KB | Language: {language.upper()}
πŸ”§ Optimizations Applied Successfully"""
            
            print("βœ… Voice cloning completed successfully!")
            
            # The path itself is returned; the output file is kept on success
            return output_path, success_message
            
        else:
            _discard_file(output_path)
            return None, "❌ Voice cloning failed - output file is empty."
            
    except Exception as e:
        error_msg = f"❌ Voice cloning error: {str(e)}"
        print(error_msg)
        import traceback
        print("Full traceback:", traceback.format_exc())
        # Nothing is returned to the caller, so a half-written output is ours to remove
        if output_path is not None:
            _discard_file(output_path)
        return None, error_msg
    finally:
        # The trimmed/resampled copies are only needed while this request runs
        for scratch in scratch_files:
            _discard_file(scratch)

# The UI is a gr.Interface (not gr.Blocks); the original author chose this for
# API compatibility. Components are built first, in display order, then wired
# to the handler. All three audio components use type="filepath" so that file
# paths, not raw audio arrays, cross the handler boundary.
_reference_audio_input = gr.Audio(
    label="🎀 Reference Audio (Voice to Clone)",
    type="filepath",
)
_content_audio_input = gr.Audio(
    label="🎡 Input Audio (Content to Transform)",
    type="filepath",
)
_language_input = gr.Dropdown(
    choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
    value="en",
    label="🌍 Language",
)
_cloned_audio_output = gr.Audio(
    label="πŸŽ‰ Cloned Voice Result",
    type="filepath",
)
_status_output = gr.Textbox(
    label="πŸ“‹ Processing Status",
    lines=8,
)

interface = gr.Interface(
    fn=voice_to_voice_clone_final,
    inputs=[_reference_audio_input, _content_audio_input, _language_input],
    outputs=[_cloned_audio_output, _status_output],
    title="🎭 AI Voice Cloning Studio - FINAL",
    description="Transform voices using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each).",
    theme=gr.themes.Soft(),
    allow_flagging="never",
    # Name under which the handler is exposed as an API endpoint
    api_name="voice_to_voice_clone",
)

if __name__ == "__main__":
    print("🌐 Launching FINAL CORRECTED Voice Cloning Studio...")

    # Queue settings: at most 2 waiting requests, one job at a time, and the
    # queue endpoints open to API clients.
    queue_options = dict(
        max_size=2,
        api_open=True,
        default_concurrency_limit=1,
    )
    # Server settings: listen on every interface on port 7860, no public
    # share link, API page visible, debug mode off.
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True,
        debug=False,
    )
    interface.queue(**queue_options).launch(**launch_options)