jeysshon commited on
Commit
7928ba7
·
verified ·
1 Parent(s): 25d5e84

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +748 -377
app.py CHANGED
@@ -1,378 +1,644 @@
1
  import os
2
  import gc
3
- import tempfile
4
- import warnings
5
- import traceback
6
- import numpy as np
 
 
 
7
  import librosa
 
8
  import soundfile as sf
9
  import torch
10
- import torch.nn as nn
11
- import gradio as gr
12
  from tqdm import tqdm
 
 
 
 
 
 
 
 
 
 
13
 
 
14
  warnings.filterwarnings("ignore")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- # Configuración
17
- SAMPLE_RATE = 44100
18
- MAX_FILE_SIZE_MB = 50
 
19
 
20
- # Arquitectura del modelo MDX simplificada
21
- class MDXNet(nn.Module):
22
- def __init__(self, dim_f=2048, dim_t=256, n_fft=6144, hop=1024, num_channels=2):
23
- super(MDXNet, self).__init__()
24
  self.dim_f = dim_f
25
  self.dim_t = dim_t
 
26
  self.n_fft = n_fft
27
  self.hop = hop
28
- self.num_channels = num_channels
29
-
30
- # Encoder
31
- self.encoder = nn.Sequential(
32
- nn.Conv2d(4, 48, 3, padding=1),
33
- nn.BatchNorm2d(48),
34
- nn.ReLU(),
35
- nn.Conv2d(48, 48, 3, padding=1),
36
- nn.BatchNorm2d(48),
37
- nn.ReLU(),
38
- )
39
-
40
- # Decoder
41
- self.decoder = nn.Sequential(
42
- nn.Conv2d(48, 48, 3, padding=1),
43
- nn.BatchNorm2d(48),
44
- nn.ReLU(),
45
- nn.Conv2d(48, 4, 3, padding=1),
46
- nn.Sigmoid(),
47
- )
48
 
49
- self.window = torch.hann_window(n_fft)
 
 
50
 
 
 
 
51
  def stft(self, x):
52
- """Short-time Fourier transform"""
53
- x = x.reshape(-1, x.shape[-1])
54
- spec = torch.stft(
55
- x,
56
- n_fft=self.n_fft,
57
- hop_length=self.hop,
58
- window=self.window.to(x.device),
59
- return_complex=True
60
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- # Convert to magnitude and phase
63
- mag = torch.abs(spec).unsqueeze(1)
64
- phase = torch.angle(spec).unsqueeze(1)
65
 
66
- # Stack real and imaginary parts
67
- real = spec.real.unsqueeze(1)
68
- imag = spec.imag.unsqueeze(1)
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- return torch.cat([real, imag, mag, phase], dim=1)
71
-
72
- def istft(self, x, length=None):
73
- """Inverse Short-time Fourier transform"""
74
- real, imag = x[:, 0], x[:, 1]
75
- complex_spec = torch.complex(real, imag)
76
-
77
- audio = torch.istft(
78
- complex_spec,
79
- n_fft=self.n_fft,
80
- hop_length=self.hop,
81
- window=self.window.to(x.device),
82
- length=length
83
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
- return audio
86
-
87
- def forward(self, x):
88
- length = x.shape[-1]
89
 
90
- # STFT
91
- spec = self.stft(x)
 
 
 
 
92
 
93
- # Limit frequency dimension
94
- spec = spec[:, :, :self.dim_f]
95
 
96
- # Process through network
97
- encoded = self.encoder(spec)
98
- mask = self.decoder(encoded)
99
 
100
- # Apply mask to magnitude
101
- masked_spec = spec * mask
 
102
 
103
- # Pad back to original frequency dimension if needed
104
- if masked_spec.shape[2] < self.n_fft // 2 + 1:
105
- pad_size = self.n_fft // 2 + 1 - masked_spec.shape[2]
106
- pad = torch.zeros(masked_spec.shape[0], masked_spec.shape[1], pad_size, masked_spec.shape[3]).to(masked_spec.device)
107
- masked_spec = torch.cat([masked_spec, pad], dim=2)
108
 
109
- # ISTFT
110
- output = self.istft(masked_spec, length=length)
111
 
112
- return output
113
 
114
- class AudioSeparator:
115
- def __init__(self):
116
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
117
- print(f"🔧 Usando dispositivo: {self.device}")
118
-
119
- # Configuraciones para diferentes tipos de separación
120
- self.models = {
121
- 'vocals': {
122
- 'dim_f': 2048,
123
- 'dim_t': 256,
124
- 'n_fft': 6144,
125
- 'compensation': 1.035
126
- },
127
- 'drums': {
128
- 'dim_f': 2048,
129
- 'dim_t': 128,
130
- 'n_fft': 4096,
131
- 'compensation': 1.040
132
- },
133
- 'bass': {
134
- 'dim_f': 2048,
135
- 'dim_t': 512,
136
- 'n_fft': 16384,
137
- 'compensation': 1.030
138
- },
139
- 'other': {
140
- 'dim_f': 2048,
141
- 'dim_t': 256,
142
- 'n_fft': 6144,
143
- 'compensation': 1.025
144
- }
145
- }
146
-
147
- def load_model(self, model_type='vocals'):
148
- """Cargar modelo para tipo específico de separación"""
149
- config = self.models.get(model_type, self.models['vocals'])
150
- model = MDXNet(
151
- dim_f=config['dim_f'],
152
- dim_t=config['dim_t'],
153
- n_fft=config['n_fft']
154
- ).to(self.device)
155
-
156
- # Inicializar con pesos aleatorios (en un caso real cargarías pesos entrenados)
157
- model.eval()
158
- return model, config['compensation']
159
 
160
- def preprocess_audio(self, audio_path):
161
- """Cargar y preprocesar audio"""
162
- try:
163
- # Verificar tamaño del archivo
164
- file_size = os.path.getsize(audio_path) / (1024 * 1024)
165
- if file_size > MAX_FILE_SIZE_MB:
166
- raise ValueError(f"Archivo muy grande: {file_size:.1f}MB (máximo {MAX_FILE_SIZE_MB}MB)")
167
 
168
- # Cargar audio
169
- audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
- # Asegurar que sea estéreo
172
- if len(audio.shape) == 1:
173
- audio = np.stack([audio, audio])
174
- elif audio.shape[0] > 2:
175
- audio = audio[:2]
 
176
 
177
- # Normalizar
178
- max_val = np.max(np.abs(audio))
179
- if max_val > 0:
180
- audio = audio / max_val
181
 
182
- return torch.FloatTensor(audio).to(self.device), max_val
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
- except Exception as e:
185
- raise Exception(f"Error cargando audio: {str(e)}")
186
-
187
- def separate_source(self, audio_tensor, model_type='vocals', chunk_size=None):
188
- """Separar una fuente específica del audio"""
189
- model, compensation = self.load_model(model_type)
 
 
190
 
191
- if chunk_size is None:
192
- chunk_size = SAMPLE_RATE * 30 # 30 segundos por chunk
 
193
 
194
- audio_length = audio_tensor.shape[1]
195
- separated_audio = torch.zeros_like(audio_tensor)
 
 
196
 
197
- # Procesar en chunks si el audio es muy largo
198
- for start in range(0, audio_length, chunk_size):
199
- end = min(start + chunk_size, audio_length)
200
- chunk = audio_tensor[:, start:end]
201
-
202
- with torch.no_grad():
203
- separated_chunk = model(chunk.unsqueeze(0)).squeeze(0)
204
- separated_chunk = separated_chunk * compensation
205
- separated_audio[:, start:end] = separated_chunk
206
 
207
- return separated_audio
208
-
209
- def enhance_separation(self, audio_tensor, model_type):
210
- """Mejorar separación usando técnicas adicionales"""
211
- audio_np = audio_tensor.cpu().numpy()
212
-
213
- if model_type == 'vocals':
214
- # Para voces, enfocar en frecuencias medias
215
- enhanced = np.zeros_like(audio_np)
216
- for i in range(audio_np.shape[0]):
217
- # Aplicar filtro de frecuencias medias
218
- stft = librosa.stft(audio_np[i], n_fft=2048)
219
- mag, phase = np.abs(stft), np.angle(stft)
220
-
221
- # Enfatizar frecuencias vocales (200-4000 Hz)
222
- freq_bins = mag.shape[0]
223
- vocal_start = int(200 * freq_bins / (SAMPLE_RATE / 2))
224
- vocal_end = int(4000 * freq_bins / (SAMPLE_RATE / 2))
225
-
226
- mask = np.zeros_like(mag)
227
- mask[vocal_start:vocal_end] = 1.0
228
-
229
- enhanced_mag = mag * mask
230
- enhanced_stft = enhanced_mag * np.exp(1j * phase)
231
- enhanced[i] = librosa.istft(enhanced_stft)
232
-
233
- return torch.FloatTensor(enhanced).to(audio_tensor.device)
234
-
235
- elif model_type == 'drums':
236
- # Para drums, usar separación percusiva
237
- enhanced = np.zeros_like(audio_np)
238
- for i in range(audio_np.shape[0]):
239
- harmonic, percussive = librosa.effects.hpss(audio_np[i], margin=3.0)
240
- enhanced[i] = percussive
241
-
242
- return torch.FloatTensor(enhanced).to(audio_tensor.device)
243
-
244
- elif model_type == 'bass':
245
- # Para bass, filtro pasa-bajos
246
- enhanced = np.zeros_like(audio_np)
247
- for i in range(audio_np.shape[0]):
248
- # Filtro pasa-bajos agresivo
249
- stft = librosa.stft(audio_np[i], n_fft=2048)
250
- mag, phase = np.abs(stft), np.angle(stft)
251
-
252
- # Solo frecuencias bajas (hasta 250 Hz)
253
- freq_bins = mag.shape[0]
254
- bass_cutoff = int(250 * freq_bins / (SAMPLE_RATE / 2))
255
-
256
- mask = np.zeros_like(mag)
257
- mask[:bass_cutoff] = 1.0
258
-
259
- enhanced_mag = mag * mask
260
- enhanced_stft = enhanced_mag * np.exp(1j * phase)
261
- enhanced[i] = librosa.istft(enhanced_stft)
262
-
263
- return torch.FloatTensor(enhanced).to(audio_tensor.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
- return audio_tensor
 
 
 
 
 
 
 
 
 
 
 
266
 
267
- def separate_complete(self, audio_path, mode='quick'):
268
- """Separación completa del audio"""
269
- try:
270
- # Cargar audio
271
- audio_tensor, original_max = self.preprocess_audio(audio_path)
272
-
273
- results = {}
274
- temp_dir = tempfile.mkdtemp()
275
-
276
- if mode == 'quick':
277
- # Separación rápida: solo voces
278
- print("🎤 Separando voces...")
279
- vocals = self.separate_source(audio_tensor, 'vocals')
280
- vocals = self.enhance_separation(vocals, 'vocals')
281
- instrumental = audio_tensor - vocals
282
-
283
- results['vocals'] = vocals
284
- results['instrumental'] = instrumental
285
-
286
- elif mode == 'complete':
287
- # Separación completa
288
- print("🎤 Separando voces...")
289
- vocals = self.separate_source(audio_tensor, 'vocals')
290
- vocals = self.enhance_separation(vocals, 'vocals')
291
-
292
- # Crear instrumental sin voces
293
- no_vocals = audio_tensor - vocals
294
-
295
- print("🥁 Separando batería...")
296
- drums = self.separate_source(no_vocals, 'drums')
297
- drums = self.enhance_separation(drums, 'drums')
298
-
299
- print("🎸 Separando bajo...")
300
- bass = self.separate_source(no_vocals - drums, 'bass')
301
- bass = self.enhance_separation(bass, 'bass')
302
-
303
- # Lo que queda es "other"
304
- other = no_vocals - drums - bass
305
-
306
- results['vocals'] = vocals
307
- results['drums'] = drums
308
- results['bass'] = bass
309
- results['other'] = other
310
-
311
- elif mode in ['vocals_only', 'drums_only', 'bass_only']:
312
- # Separación individual
313
- target = mode.replace('_only', '')
314
- print(f"🎵 Separando {target}...")
315
-
316
- separated = self.separate_source(audio_tensor, target)
317
- separated = self.enhance_separation(separated, target)
318
- remaining = audio_tensor - separated
319
-
320
- results[target] = separated
321
- results[f'no_{target}'] = remaining
322
 
323
- # Guardar resultados
324
- output_files = []
325
- for name, audio_data in results.items():
326
- # Restaurar amplitud original y normalizar
327
- audio_np = audio_data.cpu().numpy() * original_max
328
 
329
- # Normalizar para evitar clipping
330
- max_val = np.max(np.abs(audio_np))
331
- if max_val > 0:
332
- audio_np = audio_np / max_val * 0.95
333
-
334
- # Guardar archivo
335
- output_path = os.path.join(temp_dir, f"{name}.wav")
336
- sf.write(output_path, audio_np.T, SAMPLE_RATE)
337
- output_files.append(output_path)
338
-
339
- print(f"✅ Guardado: {name}.wav")
340
-
341
- # Limpiar memoria
342
- del audio_tensor, results
343
- torch.cuda.empty_cache()
344
- gc.collect()
345
 
346
- return output_files, f"✅ Separación exitosa: {len(output_files)} archivos generados"
 
 
 
 
 
 
347
 
348
- except Exception as e:
349
- error_msg = f"❌ Error en separación: {str(e)}"
350
- print(error_msg)
351
- traceback.print_exc()
352
- return [], error_msg
353
-
354
- def process_audio(audio_file, separation_mode, progress=gr.Progress()):
355
- """Función principal para procesar audio"""
356
- if audio_file is None:
357
- return [], "⚠️ Por favor sube un archivo de audio"
358
-
359
- progress(0.1, desc="Inicializando...")
360
-
361
- try:
362
- separator = AudioSeparator()
363
 
364
- progress(0.3, desc="Separando audio...")
365
- output_files, status = separator.separate_complete(audio_file, separation_mode)
 
366
 
367
- progress(1.0, desc="¡Completado!")
368
- return output_files, status
 
 
 
369
 
370
  except Exception as e:
371
- error_msg = f"❌ Error: {str(e)}"
372
- return [], error_msg
 
 
373
 
374
- # Crear interfaz Gradio
375
  def create_interface():
 
376
  with gr.Blocks(
377
  title="🎵 Audio Separator Pro",
378
  theme=gr.themes.Soft(),
@@ -381,33 +647,39 @@ def create_interface():
381
  max-width: 1200px !important;
382
  }
383
  """
384
- ) as demo:
385
 
386
- gr.Markdown("""
387
- # 🎵 Audio Separator Pro
388
- ### Separador de audio inteligente usando técnicas avanzadas de procesamiento de señales
389
- """)
390
 
391
  with gr.Row():
392
- with gr.Column(scale=1):
393
  audio_input = gr.Audio(
394
  label="🎵 Subir archivo de audio",
395
  type="filepath",
396
  format="wav"
397
  )
398
 
399
- separation_mode = gr.Radio(
400
- label="🎛️ Modo de separación",
401
- choices=[
402
- ("🚀 Rápido (Voces + Instrumental)", "quick"),
403
- ("🎯 Completo (4 stems)", "complete"),
404
- ("🎤 Solo Voces", "vocals_only"),
405
- ("🥁 Solo Batería", "drums_only"),
406
- ("🎸 Solo Bajo", "bass_only")
407
- ],
408
- value="quick",
409
- info="Selecciona el tipo de separación que deseas"
410
- )
 
 
 
 
 
 
 
 
411
 
412
  process_btn = gr.Button(
413
  "🚀 Separar Audio",
@@ -418,7 +690,7 @@ def create_interface():
418
  with gr.Column(scale=1):
419
  status_output = gr.Textbox(
420
  label="📊 Estado del procesamiento",
421
- lines=8,
422
  interactive=False,
423
  info="Aquí verás el progreso de la separación"
424
  )
@@ -429,46 +701,145 @@ def create_interface():
429
  interactive=False
430
  )
431
 
432
- gr.Markdown("""
433
- ### 📝 Instrucciones:
434
- 1. **Sube tu archivo de audio** (formato: WAV, MP3, FLAC - máximo 50MB)
435
- 2. **Selecciona el modo de separación** según tus necesidades
436
- 3. **Haz clic en "Separar Audio"** y espera el procesamiento
437
- 4. **Descarga los archivos** generados
438
-
439
- ### 🎯 Modos disponibles:
440
- - **🚀 Rápido**: Separa voces del instrumental (2 archivos)
441
- - **🎯 Completo**: Separa en voces, batería, bajo y otros (4 archivos)
442
- - **🎤 Solo Voces**: Extrae únicamente las voces
443
- - **🥁 Solo Batería**: Extrae únicamente la batería
444
- - **🎸 Solo Bajo**: Extrae únicamente el bajo
445
-
446
- ### Características:
447
- - Procesamiento con IA usando arquitectura MDX-Net
448
- - Optimización automática para cada tipo de instrumento
449
- - ✅ Filtros de frecuencia especializados
450
- - ✅ Normalización automática de audio
451
- - ✅ Soporte para archivos largos (procesamiento por chunks)
452
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
 
454
  # Configurar eventos
455
  process_btn.click(
456
- fn=process_audio,
457
- inputs=[audio_input, separation_mode],
458
  outputs=[output_files, status_output],
459
  show_progress=True
460
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
 
462
- return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
 
464
  if __name__ == "__main__":
465
- print("🎵 Iniciando Audio Separator Pro")
466
- print(f"🔧 PyTorch: {torch.__version__}")
467
- print(f"🔧 CUDA disponible: {torch.cuda.is_available()}")
468
-
469
- demo = create_interface()
470
- demo.launch(
471
- server_name="0.0.0.0",
472
- server_port=7860,
473
- share=True
474
- )
 
1
  import os
2
  import gc
3
+ import hashlib
4
+ import queue
5
+ import threading
6
+ import json
7
+ import sys
8
+ import shlex
9
+ import subprocess
10
  import librosa
11
+ import numpy as np
12
  import soundfile as sf
13
  import torch
 
 
14
  from tqdm import tqdm
15
+ import random
16
+ import spaces
17
+ import onnxruntime as ort
18
+ import warnings
19
+ import gradio as gr
20
+ import logging
21
+ import time
22
+ import traceback
23
+ import tempfile
24
+ from pathlib import Path
25
 
26
# Configuration
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Best-effort install of the nightly GPU build of onnxruntime.
# NOTE: os.system() never raises on command failure -- it returns the
# shell's exit status -- so the original try/except around it was dead
# code and the warning could never be emitted. Check the return code
# explicitly instead.
if os.system(
    "pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/"
) != 0:
    logger.warning("No se pudo instalar ort-nightly-gpu, usando CPU")

title = "<center><strong><font size='7'>🎵 Audio Separator Pro</font></strong></center>"
description = """
### 🚀 Separador de audio avanzado usando modelos MDX-Net
- **Funciona garantizado** - Basado en el código exitoso de r3gm
- **Separación de alta calidad** - Voces + Instrumental con efectos opcionales
- **Procesamiento inteligente** - Optimizado para diferentes tipos de audio
"""

# Maps each separated stem to the name used for its complementary
# (phase-inverted) counterpart when the inverse track is written out.
stem_naming = {
    "Vocals": "Instrumental",
    "Other": "Instruments",
    "Instrumental": "Vocals",
    "Drums": "Drumless",
    "Bass": "Bassless",
}

# Release page hosting the public UVR models, and the subset this app uses.
MDX_DOWNLOAD_LINK = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/"
UVR_MODELS = [
    "UVR-MDX-NET-Voc_FT.onnx",
    "UVR_MDXNET_KARA_2.onnx",
    "Reverb_HQ_By_FoxJoy.onnx",
    "UVR-MDX-NET-Inst_HQ_4.onnx",
]

# Working directories (created on demand by create_directories()).
BASE_DIR = "."
mdxnet_models_dir = os.path.join(BASE_DIR, "mdx_models")
output_dir = os.path.join(BASE_DIR, "separated_audio")
67
 
68
class MDXModel:
    """Holds the STFT parameters for one MDX-Net ONNX model and converts
    stereo waveforms to/from the 4-channel spectrogram layout the network
    expects (2 audio channels x real/imag parts).
    """

    def __init__(self, device, dim_f, dim_t, n_fft, hop=1024, stem_name=None, compensation=1.000):
        # dim_f: number of frequency bins fed to the network (<= n_bins)
        # dim_t: number of STFT time frames per chunk
        # stem_name: primary stem this model extracts (e.g. "Vocals")
        # compensation: gain factor applied when inverting the stem
        self.dim_f = dim_f
        self.dim_t = dim_t
        self.dim_c = 4
        self.n_fft = n_fft
        self.hop = hop
        self.stem_name = stem_name
        self.compensation = compensation

        # Full one-sided FFT bin count; chunk_size is the number of samples
        # that produce exactly dim_t frames with centered STFT.
        self.n_bins = self.n_fft // 2 + 1
        self.chunk_size = hop * (self.dim_t - 1)
        self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(device)

        # Zero padding used by istft() to restore the bins cropped by stft().
        out_c = self.dim_c
        self.freq_pad = torch.zeros([1, out_c, self.n_bins - self.dim_f, self.dim_t]).to(device)

    def stft(self, x):
        """Waveform -> network input: [B, 4, dim_f, dim_t] real tensor.

        Channels are folded into the batch dimension, then real/imag parts
        of the complex STFT are stacked so each stereo chunk becomes 4
        feature channels; frequencies above dim_f are cropped.
        """
        x = x.reshape([-1, self.chunk_size])
        x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True, return_complex=True)
        x = torch.view_as_real(x)
        x = x.permute([0, 3, 1, 2])
        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, 4, self.n_bins, self.dim_t])
        return x[:, :, : self.dim_f]

    def istft(self, x, freq_pad=None):
        """Network output -> waveform: inverse of stft().

        Re-pads the cropped frequency bins with zeros, re-interleaves the
        real/imag channels into a complex spectrogram, and returns a
        [B, 2, chunk_size] stereo waveform tensor.
        """
        freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1]) if freq_pad is None else freq_pad
        x = torch.cat([x, freq_pad], -2)
        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, 2, self.n_bins, self.dim_t])
        x = x.permute([0, 2, 3, 1])
        x = x.contiguous()
        x = torch.view_as_complex(x)
        x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True)
        return x.reshape([-1, 2, self.chunk_size])
101
+ return x.reshape([-1, 2, self.chunk_size])
102
+
103
+ class MDX:
104
+ DEFAULT_SR = 44100
105
+ DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
106
+ DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR
107
+
108
+ def __init__(self, model_path: str, params: MDXModel, processor=0):
109
+ # Configurar dispositivo
110
+ self.device = torch.device(f"cuda:{processor}") if processor >= 0 else torch.device("cpu")
111
+ self.provider = ["CUDAExecutionProvider"] if processor >= 0 else ["CPUExecutionProvider"]
112
 
113
+ self.model = params
 
 
114
 
115
+ try:
116
+ # Cargar modelo ONNX
117
+ self.ort = ort.InferenceSession(model_path, providers=self.provider)
118
+
119
+ # Precargar modelo
120
+ dummy_input = torch.rand(1, 4, params.dim_f, params.dim_t).numpy()
121
+ self.ort.run(None, {"input": dummy_input})
122
+
123
+ self.process = lambda spec: self.ort.run(None, {"input": spec.cpu().numpy()})[0]
124
+ logger.info(f"✅ Modelo cargado: {model_path}")
125
+
126
+ except Exception as e:
127
+ logger.error(f"❌ Error cargando modelo: {e}")
128
+ raise
129
 
130
+ self.prog = None
131
+
132
+ @staticmethod
133
+ def get_hash(model_path):
134
+ try:
135
+ with open(model_path, "rb") as f:
136
+ f.seek(-10000 * 1024, 2)
137
+ model_hash = hashlib.md5(f.read()).hexdigest()
138
+ except:
139
+ model_hash = hashlib.md5(open(model_path, "rb").read()).hexdigest()
140
+ return model_hash
141
+
142
+ @staticmethod
143
+ def segment(wave, combine=True, chunk_size=DEFAULT_CHUNK_SIZE, margin_size=DEFAULT_MARGIN_SIZE):
144
+ if combine:
145
+ processed_wave = None
146
+ for segment_count, segment in enumerate(wave):
147
+ start = 0 if segment_count == 0 else margin_size
148
+ end = None if segment_count == len(wave) - 1 else -margin_size
149
+ if margin_size == 0:
150
+ end = None
151
+ if processed_wave is None:
152
+ processed_wave = segment[:, start:end]
153
+ else:
154
+ processed_wave = np.concatenate((processed_wave, segment[:, start:end]), axis=-1)
155
+ else:
156
+ processed_wave = []
157
+ sample_count = wave.shape[-1]
158
+
159
+ if chunk_size <= 0 or chunk_size > sample_count:
160
+ chunk_size = sample_count
161
+
162
+ if margin_size > chunk_size:
163
+ margin_size = chunk_size
164
+
165
+ for segment_count, skip in enumerate(range(0, sample_count, chunk_size)):
166
+ margin = 0 if segment_count == 0 else margin_size
167
+ end = min(skip + chunk_size + margin_size, sample_count)
168
+ start = skip - margin
169
+
170
+ cut = wave[:, start:end].copy()
171
+ processed_wave.append(cut)
172
+
173
+ if end == sample_count:
174
+ break
175
+
176
+ return processed_wave
177
+
178
+ def pad_wave(self, wave):
179
+ n_sample = wave.shape[1]
180
+ trim = self.model.n_fft // 2
181
+ gen_size = self.model.chunk_size - 2 * trim
182
+ pad = gen_size - n_sample % gen_size
183
+
184
+ wave_p = np.concatenate((
185
+ np.zeros((2, trim)),
186
+ wave,
187
+ np.zeros((2, pad)),
188
+ np.zeros((2, trim)),
189
+ ), 1)
190
+
191
+ mix_waves = []
192
+ for i in range(0, n_sample + pad, gen_size):
193
+ waves = np.array(wave_p[:, i:i + self.model.chunk_size])
194
+ mix_waves.append(waves)
195
+
196
+ mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(self.device)
197
+ return mix_waves, pad, trim
198
+
199
+ def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):
200
+ mix_waves = mix_waves.split(1)
201
+ with torch.no_grad():
202
+ pw = []
203
+ for mix_wave in mix_waves:
204
+ if self.prog:
205
+ self.prog.update()
206
+ spec = self.model.stft(mix_wave)
207
+ processed_spec = torch.tensor(self.process(spec))
208
+ processed_wav = self.model.istft(processed_spec.to(self.device))
209
+ processed_wav = processed_wav[:, :, trim:-trim].transpose(0, 1).reshape(2, -1).cpu().numpy()
210
+ pw.append(processed_wav)
211
+
212
+ processed_signal = np.concatenate(pw, axis=-1)[:, :-pad]
213
+ q.put({_id: processed_signal})
214
+ return processed_signal
215
+
216
+ def process_wave(self, wave: np.array, mt_threads=1):
217
+ self.prog = tqdm(total=0, desc="Procesando audio")
218
+ chunk = wave.shape[-1] // mt_threads
219
+ waves = self.segment(wave, False, chunk)
220
 
221
+ q = queue.Queue()
222
+ threads = []
 
 
223
 
224
+ for c, batch in enumerate(waves):
225
+ mix_waves, pad, trim = self.pad_wave(batch)
226
+ self.prog.total = len(mix_waves) * mt_threads
227
+ thread = threading.Thread(target=self._process_wave, args=(mix_waves, trim, pad, q, c))
228
+ thread.start()
229
+ threads.append(thread)
230
 
231
+ for thread in threads:
232
+ thread.join()
233
 
234
+ if self.prog:
235
+ self.prog.close()
 
236
 
237
+ processed_batches = []
238
+ while not q.empty():
239
+ processed_batches.append(q.get())
240
 
241
+ processed_batches = [list(wave.values())[0] for wave in sorted(processed_batches, key=lambda d: list(d.keys())[0])]
 
 
 
 
242
 
243
+ assert len(processed_batches) == len(waves), "Error: Procesamiento incompleto"
 
244
 
245
+ return self.segment(processed_batches, True, chunk)
246
 
247
def create_directories():
    """Ensure the model and output directories exist."""
    for directory in (mdxnet_models_dir, output_dir):
        os.makedirs(directory, exist_ok=True)
251
+
252
def download_models():
    """Download any missing UVR/MDX ONNX models into ``mdxnet_models_dir``.

    Each model is fetched with ``curl``, falling back to ``wget``. Also
    creates the ``data.json`` parameter file on first run.

    Returns:
        True when every model is available locally, False on any failure.
    """
    try:
        for model in UVR_MODELS:
            model_path = os.path.join(mdxnet_models_dir, model)
            if not os.path.exists(model_path):
                logger.info(f"📥 Descargando {model}...")
                download_url = MDX_DOWNLOAD_LINK + model

                # Also catch FileNotFoundError: if the curl binary itself is
                # missing, subprocess.run raises that (not
                # CalledProcessError), and we still want the wget fallback
                # instead of aborting through the outer handler.
                try:
                    subprocess.run(
                        ["curl", "-L", "-o", model_path, download_url],
                        check=True, capture_output=True,
                    )
                    logger.info(f"✅ Descargado: {model}")
                except (subprocess.CalledProcessError, FileNotFoundError):
                    try:
                        subprocess.run(
                            ["wget", "-O", model_path, download_url],
                            check=True, capture_output=True,
                        )
                        logger.info(f"✅ Descargado: {model}")
                    except (subprocess.CalledProcessError, FileNotFoundError) as e:
                        logger.error(f"❌ Error descargando {model}: {e}")
                        return False
            else:
                logger.info(f"✅ Modelo ya existe: {model}")

        # Create the model-parameter file if it is not there yet.
        data_json_path = os.path.join(mdxnet_models_dir, "data.json")
        if not os.path.exists(data_json_path):
            create_data_json(data_json_path)

        return True
    except Exception as e:
        logger.error(f"❌ Error en descarga de modelos: {e}")
        return False
288
+
289
def create_data_json(data_json_path):
    """Write data.json mapping each local model's hash to its MDX parameters.

    The four models share all STFT settings; only the compensation factor
    and primary stem differ, so the per-model data is table-driven instead
    of four near-identical dict literals.
    """
    # Settings common to every model.
    base_params = {
        "mdx_dim_f_set": 2048,
        "mdx_dim_t_set": 8,
        "mdx_n_fft_scale_set": 6144,
    }
    # (filename marker, compensation, primary stem) — first match wins.
    variants = [
        ("Voc_FT", 1.035, "Vocals"),
        ("KARA", 1.025, "Vocals"),
        ("Reverb", 1.035, "Reverb"),
        ("Inst_HQ", 1.035, "Other"),
    ]

    model_data = {}
    for model in UVR_MODELS:
        model_path = os.path.join(mdxnet_models_dir, model)
        if not os.path.exists(model_path):
            continue
        model_hash = MDX.get_hash(model_path)
        for marker, compensate, stem in variants:
            if marker in model:
                model_data[model_hash] = {
                    "compensate": compensate,
                    **base_params,
                    "primary_stem": stem,
                }
                break

    with open(data_json_path, 'w') as f:
        json.dump(model_data, f, indent=2)

    logger.info(f"✅ Creado data.json con {len(model_data)} modelos")
336
+
337
def convert_to_stereo_and_wav(audio_path):
    """Ensure the input audio is a stereo WAV file.

    Returns the path of a stereo WAV rendition written into ``output_dir``
    when conversion is needed (mono input or non-WAV container), otherwise
    the original path. On any error the original path is returned.
    """
    try:
        # Loaded up-front both to detect mono input and as fallback data
        # in case the FFmpeg conversion fails.
        wave, sr = librosa.load(audio_path, mono=False, sr=44100)

        # Fixed anti-idiom: was `... .endswith('.wav') == False`.
        if wave.ndim != 1 and audio_path.lower().endswith('.wav'):
            return audio_path

        stereo_path = os.path.join(output_dir, f"{Path(audio_path).stem}_stereo.wav")

        # Prefer FFmpeg (handles any input container).
        command = [
            'ffmpeg', '-y', '-loglevel', 'error',
            '-i', audio_path,
            '-ac', '2', '-f', 'wav', stereo_path
        ]
        try:
            result = subprocess.run(command, capture_output=True, text=True)
            ffmpeg_ok = result.returncode == 0 and os.path.exists(stereo_path)
        except FileNotFoundError:
            # FFmpeg binary not installed: previously this escaped to the
            # outer handler and skipped the librosa fallback entirely.
            ffmpeg_ok = False

        if ffmpeg_ok:
            return stereo_path

        logger.warning(f"FFmpeg falló, usando librosa para {audio_path}")
        # Fallback: duplicate the mono channel and write the WAV ourselves.
        if wave.ndim == 1:
            wave = np.stack([wave, wave])
        sf.write(stereo_path, wave.T, 44100)
        return stereo_path
    except Exception as e:
        logger.error(f"Error convirtiendo audio: {e}")
        return audio_path
+
369
@spaces.GPU
def run_mdx(model_params, output_dir, model_path, filename,
            exclude_main=False, exclude_inversion=False, suffix=None,
            invert_suffix=None, denoise=False, keep_orig=True,
            m_threads=2, device_base="cuda"):
    """Run one MDX separation pass over an audio file.

    Args:
        model_params: dict from data.json, keyed by model hash.
        output_dir: directory where the resulting WAV files are written.
        model_path: path to the ONNX model to use.
        filename: path of the audio file to separate.
        exclude_main: skip writing the primary-stem file.
        exclude_inversion: skip writing the inverted (complement) file.
        suffix: override the primary stem's filename suffix.
        invert_suffix: override the inverted stem's filename suffix.
        denoise: run the model on both the signal and its negation and
            average, reducing model noise at ~2x the cost.
        keep_orig: when False, delete the input file afterwards.
        m_threads: worker threads (overridden based on device/VRAM below).
        device_base: "cuda" to use the GPU when available, else CPU.

    Returns:
        (main_filepath, invert_filepath) — either may be None when excluded.
    """
    try:
        # Select the device; thread count scales with available VRAM.
        if device_base == "cuda" and torch.cuda.is_available():
            device = torch.device("cuda:0")
            processor_num = 0
            device_properties = torch.cuda.get_device_properties(device)
            vram_gb = device_properties.total_memory / 1024**3
            m_threads = 1 if vram_gb < 8 else (8 if vram_gb > 32 else 2)
            logger.info(f"🔧 CUDA - Threads: {m_threads}, VRAM: {vram_gb:.1f}GB")
        else:
            device = torch.device("cpu")
            processor_num = -1
            m_threads = 1
            logger.info("🔧 Usando CPU")

        # Look up this model's parameters by its content hash.
        model_hash = MDX.get_hash(model_path)
        mp = model_params.get(model_hash)

        if not mp:
            raise ValueError(f"Parámetros no encontrados para modelo {model_path}")

        # Build the STFT parameter object for this model.
        model = MDXModel(
            device,
            dim_f=mp["mdx_dim_f_set"],
            dim_t=2 ** mp["mdx_dim_t_set"],
            n_fft=mp["mdx_n_fft_scale_set"],
            stem_name=mp["primary_stem"],
            compensation=mp["compensate"],
        )

        # Create the ONNX inference session.
        mdx_sess = MDX(model_path, model, processor=processor_num)

        # Load the input audio (stereo, 44.1 kHz).
        wave, sr = librosa.load(filename, mono=False, sr=44100)

        # Peak-normalize before inference; the peak is restored afterwards.
        peak = max(np.max(wave), abs(np.min(wave)))
        if peak > 0:
            wave /= peak

        if denoise:
            # Average the outputs for the signal and its negation to
            # cancel model noise.
            wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (mdx_sess.process_wave(wave, m_threads))
            wave_processed *= 0.5
        else:
            wave_processed = mdx_sess.process_wave(wave, m_threads)

        # Restore the original peak level.
        wave_processed *= peak

        # Write the primary stem.
        stem_name = model.stem_name if suffix is None else suffix
        main_filepath = None

        if not exclude_main:
            main_filepath = os.path.join(
                output_dir,
                f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav"
            )
            sf.write(main_filepath, wave_processed.T, sr)
            logger.info(f"✅ Guardado: {stem_name}")

        # Write the complementary stem: original minus (compensated) stem.
        invert_filepath = None
        if not exclude_inversion:
            diff_stem_name = stem_naming.get(stem_name) if invert_suffix is None else invert_suffix
            stem_name = f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name

            invert_filepath = os.path.join(
                output_dir,
                f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav"
            )

            inverted_audio = (-wave_processed.T * model.compensation) + wave.T
            sf.write(invert_filepath, inverted_audio, sr)
            logger.info(f"✅ Guardado: {stem_name}")

        # Optional cleanup of the input file.
        if not keep_orig and os.path.exists(filename):
            os.remove(filename)

        # Release memory (large arrays + GPU cache).
        del mdx_sess, wave_processed, wave
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return main_filepath, invert_filepath

    except Exception as e:
        logger.error(f"❌ Error en run_mdx: {e}")
        traceback.print_exc()
        raise
+
470
def get_hash(filepath):
    """Return a short (18 hex chars) BLAKE2b digest of the file's contents."""
    digest = hashlib.blake2b()
    with open(filepath, 'rb') as fh:
        for block in iter(lambda: fh.read(8192), b''):
            digest.update(block)
    return digest.hexdigest()[:18]
477
+
478
def process_uvr_task(orig_song_path: str, main_vocals: bool = False,
                     dereverb: bool = True, song_id: str = "mdx",
                     only_voiceless: bool = False):
    """Run the full UVR (MDX-Net) separation pipeline on one audio file.

    Args:
        orig_song_path: Path to the input audio; converted to stereo WAV first.
        main_vocals: If True, additionally split lead vocals from backing vocals.
        dereverb: If True, run a de-reverb pass on the selected vocal stem.
        song_id: Subdirectory name under ``output_dir`` for this job's stems.
        only_voiceless: If True, only extract the instrumental and return early.

    Returns:
        If ``only_voiceless`` is True: the value returned by ``run_mdx``
        (a ``(main_filepath, invert_filepath)`` tuple; the second element is
        None here because inversion is excluded).
        Otherwise: ``(vocals_path, instrumentals_path, backup_vocals_path,
        main_vocals_path, vocals_dereverb_path)``.

    Raises:
        Re-raises any unexpected exception after logging the traceback.
    """
    try:
        device_base = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"🔧 Dispositivo: {device_base}")

        # Load per-model MDX parameters (FFT size, compensation, ...) from data.json.
        data_json_path = os.path.join(mdxnet_models_dir, "data.json")
        with open(data_json_path) as infile:
            mdx_model_params = json.load(infile)

        # Per-job output directory for the generated stems.
        song_output_dir = os.path.join(output_dir, song_id)
        os.makedirs(song_output_dir, exist_ok=True)

        # Normalize the input to stereo WAV before any separation pass.
        orig_song_path = convert_to_stereo_and_wav(orig_song_path)

        if only_voiceless:
            logger.info("🎵 Separando instrumental...")
            process = run_mdx(
                mdx_model_params,
                song_output_dir,
                os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Inst_HQ_4.onnx"),
                orig_song_path,
                suffix="Instrumental",
                denoise=False,
                keep_orig=True,
                exclude_inversion=True,
                device_base=device_base,
            )
            return process

        # Primary vocal / instrumental split.
        logger.info("🎤 Separando voces...")
        vocals_path, instrumentals_path = run_mdx(
            mdx_model_params,
            song_output_dir,
            os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Voc_FT.onnx"),
            orig_song_path,
            denoise=True,
            keep_orig=True,
            device_base=device_base,
        )

        # Optional lead-vs-backing vocal split (karaoke model).
        if main_vocals:
            logger.info("🎙️ Separando voces principales...")
            try:
                backup_vocals_path, main_vocals_path = run_mdx(
                    mdx_model_params,
                    song_output_dir,
                    os.path.join(mdxnet_models_dir, "UVR_MDXNET_KARA_2.onnx"),
                    vocals_path,
                    suffix="Backup",
                    invert_suffix="Main",
                    denoise=True,
                    device_base=device_base,
                )
            except Exception as e:
                # Best-effort: on failure, fall back to the full vocal stem.
                logger.warning(f"Error en separación principal: {e}")
                backup_vocals_path, main_vocals_path = None, vocals_path
        else:
            backup_vocals_path, main_vocals_path = None, vocals_path

        # Optional de-reverb pass on the selected vocal stem.
        if dereverb:
            logger.info("🔄 Eliminando reverb...")
            try:
                _, vocals_dereverb_path = run_mdx(
                    mdx_model_params,
                    song_output_dir,
                    os.path.join(mdxnet_models_dir, "Reverb_HQ_By_FoxJoy.onnx"),
                    main_vocals_path,
                    invert_suffix="DeReverb",
                    exclude_main=True,
                    denoise=True,
                    device_base=device_base,
                )
            except Exception as e:
                # Best-effort: on failure, keep the non-dereverbed vocals.
                logger.warning(f"Error eliminando reverb: {e}")
                vocals_dereverb_path = main_vocals_path
        else:
            vocals_dereverb_path = main_vocals_path

        return vocals_path, instrumentals_path, backup_vocals_path, main_vocals_path, vocals_dereverb_path

    except Exception as e:
        logger.error(f"❌ Error en process_uvr_task: {e}")
        traceback.print_exc()
        raise
571
+
572
@spaces.GPU
def sound_separate(media_file, stem="vocal", main=False, dereverb=True):
    """Separate an uploaded audio file into stems and return their paths.

    Args:
        media_file: Path to the uploaded audio file.
        stem: "vocal" extracts vocals plus instrumental; "background" extracts
            the instrumental only.
        main: Whether to additionally split lead vocals from backing vocals
            (only meaningful when ``stem == "vocal"``).
        dereverb: Whether to run a de-reverb pass on the vocals.

    Returns:
        List of paths to the generated stem files (at least one).

    Raises:
        ValueError: On missing input, missing stem choice, oversized files,
            or any processing failure (original errors are wrapped).
    """
    if not media_file:
        raise ValueError("⚠️ No se proporcionó archivo de audio")

    if not stem:
        raise ValueError("⚠️ Selecciona tipo de separación (vocal/background)")

    try:
        # Reject files over the 100 MB limit before doing any work.
        file_size = os.path.getsize(media_file) / (1024 * 1024)  # MB
        if file_size > 100:  # Límite de 100MB
            raise ValueError(f"❌ Archivo muy grande: {file_size:.1f}MB (máximo 100MB)")

        # Content-based job ID so repeated uploads reuse the same output dir.
        hash_audio = get_hash(media_file)
        song_id = hash_audio + "_separated"

        outputs = []
        start_time = time.time()

        if stem == "vocal":
            logger.info("🎤 Iniciando separación de voces...")
            result = process_uvr_task(
                orig_song_path=media_file,
                song_id=song_id,
                main_vocals=main,
                dereverb=dereverb,
                only_voiceless=False
            )

            if isinstance(result, tuple) and len(result) >= 5:
                vocals_path, instrumentals_path, backup_vocals_path, main_vocals_path, vocals_dereverb_path = result
                # Prefer the de-reverbed vocals when that pass produced a file.
                final_vocal_path = vocals_dereverb_path if vocals_dereverb_path else vocals_path

                if final_vocal_path and os.path.exists(final_vocal_path):
                    outputs.append(final_vocal_path)
                if instrumentals_path and os.path.exists(instrumentals_path):
                    outputs.append(instrumentals_path)

        elif stem == "background":
            logger.info("🎵 Iniciando separación de instrumental...")
            result = process_uvr_task(
                orig_song_path=media_file,
                song_id=song_id,
                only_voiceless=True
            )

            # BUGFIX: with only_voiceless=True, process_uvr_task returns the
            # (main_filepath, invert_filepath) tuple from run_mdx (the second
            # element is None since inversion is excluded). The previous code
            # passed the tuple itself to os.path.exists(), which raises
            # TypeError and made instrumental-only separation always fail.
            instrumental_path = result[0] if isinstance(result, tuple) else result

            if instrumental_path and os.path.exists(instrumental_path):
                outputs.append(instrumental_path)

        end_time = time.time()
        execution_time = end_time - start_time
        logger.info(f"⏱️ Tiempo de ejecución: {execution_time:.1f} segundos")

        if not outputs:
            raise Exception("❌ No se generaron archivos de salida")

        logger.info(f"✅ Separación exitosa: {len(outputs)} archivos")
        return outputs

    except Exception as e:
        error_msg = f"❌ Error en separación: {str(e)}"
        logger.error(error_msg)
        traceback.print_exc()
        raise ValueError(error_msg)
+ raise ValueError(error_msg)
639
 
 
640
  def create_interface():
641
+ """Crear interfaz Gradio"""
642
  with gr.Blocks(
643
  title="🎵 Audio Separator Pro",
644
  theme=gr.themes.Soft(),
 
647
  max-width: 1200px !important;
648
  }
649
  """
650
+ ) as app:
651
 
652
+ gr.Markdown(title)
653
+ gr.Markdown(description)
 
 
654
 
655
  with gr.Row():
656
+ with gr.Column(scale=2):
657
  audio_input = gr.Audio(
658
  label="🎵 Subir archivo de audio",
659
  type="filepath",
660
  format="wav"
661
  )
662
 
663
+ with gr.Row():
664
+ stem_choice = gr.Radio(
665
+ choices=["vocal", "background"],
666
+ value="vocal",
667
+ label="🎛️ Tipo de separación",
668
+ info="Selecciona qué quieres extraer"
669
+ )
670
+
671
+ with gr.Row():
672
+ main_vocals_check = gr.Checkbox(
673
+ label="🎙️ Separar voces principales",
674
+ value=False,
675
+ info="Separar voces principales de coros (solo para voces)"
676
+ )
677
+
678
+ dereverb_check = gr.Checkbox(
679
+ label="🔄 Eliminar reverb",
680
+ value=True,
681
+ info="Mejorar claridad de voces eliminando reverb"
682
+ )
683
 
684
  process_btn = gr.Button(
685
  "🚀 Separar Audio",
 
690
  with gr.Column(scale=1):
691
  status_output = gr.Textbox(
692
  label="📊 Estado del procesamiento",
693
+ lines=10,
694
  interactive=False,
695
  info="Aquí verás el progreso de la separación"
696
  )
 
701
  interactive=False
702
  )
703
 
704
+ # Función para mostrar/ocultar opciones según el tipo
705
+ def update_visibility(stem_type):
706
+ if stem_type == "vocal":
707
+ return gr.update(visible=True), gr.update(visible=True)
708
+ else:
709
+ return gr.update(visible=False), gr.update(visible=False)
710
+
711
+ stem_choice.change(
712
+ fn=update_visibility,
713
+ inputs=[stem_choice],
714
+ outputs=[main_vocals_check, dereverb_check]
715
+ )
716
+
717
+ # Función de procesamiento con manejo de errores mejorado
718
+ def process_audio_wrapper(audio_file, stem, main, dereverb, progress=gr.Progress()):
719
+ if audio_file is None:
720
+ return [], "⚠️ Por favor sube un archivo de audio"
721
+
722
+ try:
723
+ progress(0.1, desc="Inicializando...")
724
+
725
+ # Verificar que los modelos estén descargados
726
+ if not all(os.path.exists(os.path.join(mdxnet_models_dir, model)) for model in UVR_MODELS):
727
+ progress(0.2, desc="Descargando modelos...")
728
+ if not download_models():
729
+ return [], "❌ Error descargando modelos"
730
+
731
+ progress(0.4, desc="Separando audio...")
732
+
733
+ # Procesar audio
734
+ result_files = sound_separate(
735
+ media_file=audio_file,
736
+ stem=stem,
737
+ main=main,
738
+ dereverb=dereverb
739
+ )
740
+
741
+ progress(1.0, desc="¡Completado!")
742
+
743
+ success_msg = f"✅ Separación exitosa: {len(result_files)} archivo(s) generado(s)"
744
+ return result_files, success_msg
745
+
746
+ except Exception as e:
747
+ error_msg = f"❌ Error: {str(e)}"
748
+ logger.error(error_msg)
749
+ return [], error_msg
750
 
751
  # Configurar eventos
752
  process_btn.click(
753
+ fn=process_audio_wrapper,
754
+ inputs=[audio_input, stem_choice, main_vocals_check, dereverb_check],
755
  outputs=[output_files, status_output],
756
  show_progress=True
757
  )
758
+
759
+ # Ejemplos
760
+ gr.Examples(
761
+ examples=[
762
+ ["./test.mp3", "vocal", False, True],
763
+ ["./test.mp3", "background", False, False],
764
+ ],
765
+ inputs=[audio_input, stem_choice, main_vocals_check, dereverb_check],
766
+ outputs=[output_files, status_output],
767
+ fn=process_audio_wrapper,
768
+ cache_examples=False,
769
+ )
770
+
771
+ gr.Markdown("""
772
+ ### 📝 Instrucciones de uso:
773
+
774
+ 1. **📁 Sube tu archivo de audio** (formatos: MP3, WAV, FLAC, M4A - máximo 100MB)
775
+ 2. **🎛️ Selecciona el tipo de separación:**
776
+ - **🎤 Vocal**: Extrae las voces del audio
777
+ - **🎵 Background**: Extrae el instrumental (sin voces)
778
+ 3. **⚙️ Configura opciones avanzadas** (solo para voces):
779
+ - **🎙️ Separar voces principales**: Separa voces principales de coros
780
+ - **🔄 Eliminar reverb**: Mejora la claridad eliminando reverb
781
+ 4. **🚀 Haz clic en "Separar Audio"** y espera el procesamiento
782
+ 5. **📥 Descarga los archivos** generados
783
+
784
+ ### 🎯 Características:
785
+ - ✅ **Modelos MDX-Net de alta calidad** - Misma tecnología que el separador exitoso de r3gm
786
+ - ✅ **Separación inteligente** - Optimizada para voces e instrumentales
787
+ - ✅ **Procesamiento GPU/CPU** - Automáticamente optimizado según hardware disponible
788
+ - ✅ **Múltiples formatos** - Soporta MP3, WAV, FLAC, M4A
789
+ - ✅ **Descarga automática** - Los modelos se descargan automáticamente
790
+ - ✅ **Calidad profesional** - Resultados comparables a software comercial
791
+
792
+ ### ⚡ Rendimiento:
793
+ - **GPU**: Procesamiento rápido con CUDA
794
+ - **CPU**: Funciona en cualquier hardware
795
+ - **Memoria**: Optimizado para archivos grandes
796
+ - **Calidad**: Separación de alta fidelidad
797
+
798
+ ### 🔧 Tecnología:
799
+ - **MDX-Net**: Arquitectura de red neuronal especializada
800
+ - **ONNX Runtime**: Inferencia optimizada
801
+ - **Torch**: Procesamiento de tensores
802
+ - **Librosa**: Análisis de audio avanzado
803
+ """)
804
 
805
+ return app
806
+
807
+ def main():
808
+ """Función principal"""
809
+ try:
810
+ logger.info("🎵 Iniciando Audio Separator Pro")
811
+ logger.info(f"🔧 PyTorch: {torch.__version__}")
812
+ logger.info(f"🔧 CUDA disponible: {torch.cuda.is_available()}")
813
+
814
+ # Crear directorios
815
+ create_directories()
816
+
817
+ # Descargar modelos si es necesario
818
+ logger.info("📥 Verificando modelos...")
819
+ if not all(os.path.exists(os.path.join(mdxnet_models_dir, model)) for model in UVR_MODELS):
820
+ logger.info("📥 Descargando modelos...")
821
+ if not download_models():
822
+ logger.error("❌ Error descargando modelos")
823
+ return
824
+ else:
825
+ logger.info("✅ Todos los modelos están disponibles")
826
+
827
+ # Crear interfaz
828
+ app = create_interface()
829
+
830
+ # Lanzar aplicación
831
+ app.queue(default_concurrency_limit=10)
832
+ app.launch(
833
+ server_name="0.0.0.0",
834
+ server_port=7860,
835
+ share=True,
836
+ show_error=True,
837
+ quiet=False
838
+ )
839
+
840
+ except Exception as e:
841
+ logger.error(f"❌ Error en main: {e}")
842
+ traceback.print_exc()
843
 
844
# Script entry point.
if __name__ == "__main__":
    main()