Spaces:

jeysshon
/

DISBAND

Sleeping

App Files Files Community

jeysshon committed on Aug 15, 2025

Commit

7928ba7

verified ·

1 Parent(s): 25d5e84

Update app.py

Browse files

Files changed (1) hide show

app.py +748 -377

app.py CHANGED Viewed

@@ -1,378 +1,644 @@
 import os
 import gc
-import tempfile
-import warnings
-import traceback
-import numpy as np
 import librosa
 import soundfile as sf
 import torch
-import torch.nn as nn
-import gradio as gr
 from tqdm import tqdm
 warnings.filterwarnings("ignore")
-# Configuración
-SAMPLE_RATE = 44100
-MAX_FILE_SIZE_MB = 50
-# Arquitectura del modelo MDX simplificada
-class MDXNet(nn.Module):
-    def __init__(self, dim_f=2048, dim_t=256, n_fft=6144, hop=1024, num_channels=2):
-        super(MDXNet, self).__init__()
         self.dim_f = dim_f
         self.dim_t = dim_t
         self.n_fft = n_fft
         self.hop = hop
-        self.num_channels = num_channels
-        # Encoder
-        self.encoder = nn.Sequential(
-            nn.Conv2d(4, 48, 3, padding=1),
-            nn.BatchNorm2d(48),
-            nn.ReLU(),
-            nn.Conv2d(48, 48, 3, padding=1),
-            nn.BatchNorm2d(48),
-            nn.ReLU(),
-        )
-        # Decoder
-        self.decoder = nn.Sequential(
-            nn.Conv2d(48, 48, 3, padding=1),
-            nn.BatchNorm2d(48),
-            nn.ReLU(),
-            nn.Conv2d(48, 4, 3, padding=1),
-            nn.Sigmoid(),
-        )
-        self.window = torch.hann_window(n_fft)
     def stft(self, x):
-        """Short-time Fourier transform"""
-        x = x.reshape(-1, x.shape[-1])
-        spec = torch.stft(
-            x,
-            n_fft=self.n_fft,
-            hop_length=self.hop,
-            window=self.window.to(x.device),
-            return_complex=True
-        )
-        # Convert to magnitude and phase
-        mag = torch.abs(spec).unsqueeze(1)
-        phase = torch.angle(spec).unsqueeze(1)
-        # Stack real and imaginary parts
-        real = spec.real.unsqueeze(1)
-        imag = spec.imag.unsqueeze(1)
-        return torch.cat([real, imag, mag, phase], dim=1)
-    def istft(self, x, length=None):
-        """Inverse Short-time Fourier transform"""
-        real, imag = x[:, 0], x[:, 1]
-        complex_spec = torch.complex(real, imag)
-        audio = torch.istft(
-            complex_spec,
-            n_fft=self.n_fft,
-            hop_length=self.hop,
-            window=self.window.to(x.device),
-            length=length
-        )
-        return audio
-    def forward(self, x):
-        length = x.shape[-1]
-        # STFT
-        spec = self.stft(x)
-        # Limit frequency dimension
-        spec = spec[:, :, :self.dim_f]
-        # Process through network
-        encoded = self.encoder(spec)
-        mask = self.decoder(encoded)
-        # Apply mask to magnitude
-        masked_spec = spec * mask
-        # Pad back to original frequency dimension if needed
-        if masked_spec.shape[2] < self.n_fft // 2 + 1:
-            pad_size = self.n_fft // 2 + 1 - masked_spec.shape[2]
-            pad = torch.zeros(masked_spec.shape[0], masked_spec.shape[1], pad_size, masked_spec.shape[3]).to(masked_spec.device)
-            masked_spec = torch.cat([masked_spec, pad], dim=2)
-        # ISTFT
-        output = self.istft(masked_spec, length=length)
-        return output
-class AudioSeparator:
-    def __init__(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        print(f"🔧 Usando dispositivo: {self.device}")
-        # Configuraciones para diferentes tipos de separación
-        self.models = {
-            'vocals': {
-                'dim_f': 2048,
-                'dim_t': 256,
-                'n_fft': 6144,
-                'compensation': 1.035
-            },
-            'drums': {
-                'dim_f': 2048,
-                'dim_t': 128,
-                'n_fft': 4096,
-                'compensation': 1.040
-            },
-            'bass': {
-                'dim_f': 2048,
-                'dim_t': 512,
-                'n_fft': 16384,
-                'compensation': 1.030
-            },
-            'other': {
-                'dim_f': 2048,
-                'dim_t': 256,
-                'n_fft': 6144,
-                'compensation': 1.025
-            }
-        }
-    def load_model(self, model_type='vocals'):
-        """Cargar modelo para tipo específico de separación"""
-        config = self.models.get(model_type, self.models['vocals'])
-        model = MDXNet(
-            dim_f=config['dim_f'],
-            dim_t=config['dim_t'],
-            n_fft=config['n_fft']
-        ).to(self.device)
-        # Inicializar con pesos aleatorios (en un caso real cargarías pesos entrenados)
-        model.eval()
-        return model, config['compensation']
-    def preprocess_audio(self, audio_path):
-        """Cargar y preprocesar audio"""
-        try:
-            # Verificar tamaño del archivo
-            file_size = os.path.getsize(audio_path) / (1024 * 1024)
-            if file_size > MAX_FILE_SIZE_MB:
-                raise ValueError(f"Archivo muy grande: {file_size:.1f}MB (máximo {MAX_FILE_SIZE_MB}MB)")
-            # Cargar audio
-            audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=False)
-            # Asegurar que sea estéreo
-            if len(audio.shape) == 1:
-                audio = np.stack([audio, audio])
-            elif audio.shape[0] > 2:
-                audio = audio[:2]
-            # Normalizar
-            max_val = np.max(np.abs(audio))
-            if max_val > 0:
-                audio = audio / max_val
-            return torch.FloatTensor(audio).to(self.device), max_val
-        except Exception as e:
-            raise Exception(f"Error cargando audio: {str(e)}")
-    def separate_source(self, audio_tensor, model_type='vocals', chunk_size=None):
-        """Separar una fuente específica del audio"""
-        model, compensation = self.load_model(model_type)
-        if chunk_size is None:
-            chunk_size = SAMPLE_RATE * 30  # 30 segundos por chunk
-        audio_length = audio_tensor.shape[1]
-        separated_audio = torch.zeros_like(audio_tensor)
-        # Procesar en chunks si el audio es muy largo
-        for start in range(0, audio_length, chunk_size):
-            end = min(start + chunk_size, audio_length)
-            chunk = audio_tensor[:, start:end]
-            with torch.no_grad():
-                separated_chunk = model(chunk.unsqueeze(0)).squeeze(0)
-                separated_chunk = separated_chunk * compensation
-                separated_audio[:, start:end] = separated_chunk
-        return separated_audio
-    def enhance_separation(self, audio_tensor, model_type):
-        """Mejorar separación usando técnicas adicionales"""
-        audio_np = audio_tensor.cpu().numpy()
-        if model_type == 'vocals':
-            # Para voces, enfocar en frecuencias medias
-            enhanced = np.zeros_like(audio_np)
-            for i in range(audio_np.shape[0]):
-                # Aplicar filtro de frecuencias medias
-                stft = librosa.stft(audio_np[i], n_fft=2048)
-                mag, phase = np.abs(stft), np.angle(stft)
-                # Enfatizar frecuencias vocales (200-4000 Hz)
-                freq_bins = mag.shape[0]
-                vocal_start = int(200 * freq_bins / (SAMPLE_RATE / 2))
-                vocal_end = int(4000 * freq_bins / (SAMPLE_RATE / 2))
-                mask = np.zeros_like(mag)
-                mask[vocal_start:vocal_end] = 1.0
-                enhanced_mag = mag * mask
-                enhanced_stft = enhanced_mag * np.exp(1j * phase)
-                enhanced[i] = librosa.istft(enhanced_stft)
-            return torch.FloatTensor(enhanced).to(audio_tensor.device)
-        elif model_type == 'drums':
-            # Para drums, usar separación percusiva
-            enhanced = np.zeros_like(audio_np)
-            for i in range(audio_np.shape[0]):
-                harmonic, percussive = librosa.effects.hpss(audio_np[i], margin=3.0)
-                enhanced[i] = percussive
-            return torch.FloatTensor(enhanced).to(audio_tensor.device)
-        elif model_type == 'bass':
-            # Para bass, filtro pasa-bajos
-            enhanced = np.zeros_like(audio_np)
-            for i in range(audio_np.shape[0]):
-                # Filtro pasa-bajos agresivo
-                stft = librosa.stft(audio_np[i], n_fft=2048)
-                mag, phase = np.abs(stft), np.angle(stft)
-                # Solo frecuencias bajas (hasta 250 Hz)
-                freq_bins = mag.shape[0]
-                bass_cutoff = int(250 * freq_bins / (SAMPLE_RATE / 2))
-                mask = np.zeros_like(mag)
-                mask[:bass_cutoff] = 1.0
-                enhanced_mag = mag * mask
-                enhanced_stft = enhanced_mag * np.exp(1j * phase)
-                enhanced[i] = librosa.istft(enhanced_stft)
-            return torch.FloatTensor(enhanced).to(audio_tensor.device)
-        return audio_tensor
-    def separate_complete(self, audio_path, mode='quick'):
-        """Separación completa del audio"""
-        try:
-            # Cargar audio
-            audio_tensor, original_max = self.preprocess_audio(audio_path)
-            results = {}
-            temp_dir = tempfile.mkdtemp()
-            if mode == 'quick':
-                # Separación rápida: solo voces
-                print("🎤 Separando voces...")
-                vocals = self.separate_source(audio_tensor, 'vocals')
-                vocals = self.enhance_separation(vocals, 'vocals')
-                instrumental = audio_tensor - vocals
-                results['vocals'] = vocals
-                results['instrumental'] = instrumental
-            elif mode == 'complete':
-                # Separación completa
-                print("🎤 Separando voces...")
-                vocals = self.separate_source(audio_tensor, 'vocals')
-                vocals = self.enhance_separation(vocals, 'vocals')
-                # Crear instrumental sin voces
-                no_vocals = audio_tensor - vocals
-                print("🥁 Separando batería...")
-                drums = self.separate_source(no_vocals, 'drums')
-                drums = self.enhance_separation(drums, 'drums')
-                print("🎸 Separando bajo...")
-                bass = self.separate_source(no_vocals - drums, 'bass')
-                bass = self.enhance_separation(bass, 'bass')
-                # Lo que queda es "other"
-                other = no_vocals - drums - bass
-                results['vocals'] = vocals
-                results['drums'] = drums
-                results['bass'] = bass
-                results['other'] = other
-            elif mode in ['vocals_only', 'drums_only', 'bass_only']:
-                # Separación individual
-                target = mode.replace('_only', '')
-                print(f"🎵 Separando {target}...")
-                separated = self.separate_source(audio_tensor, target)
-                separated = self.enhance_separation(separated, target)
-                remaining = audio_tensor - separated
-                results[target] = separated
-                results[f'no_{target}'] = remaining
-            # Guardar resultados
-            output_files = []
-            for name, audio_data in results.items():
-                # Restaurar amplitud original y normalizar
-                audio_np = audio_data.cpu().numpy() * original_max
-                # Normalizar para evitar clipping
-                max_val = np.max(np.abs(audio_np))
-                if max_val > 0:
-                    audio_np = audio_np / max_val * 0.95
-                # Guardar archivo
-                output_path = os.path.join(temp_dir, f"{name}.wav")
-                sf.write(output_path, audio_np.T, SAMPLE_RATE)
-                output_files.append(output_path)
-                print(f"✅ Guardado: {name}.wav")
-            # Limpiar memoria
-            del audio_tensor, results
-            torch.cuda.empty_cache()
-            gc.collect()
-            return output_files, f"✅ Separación exitosa: {len(output_files)} archivos generados"
-        except Exception as e:
-            error_msg = f"❌ Error en separación: {str(e)}"
-            print(error_msg)
-            traceback.print_exc()
-            return [], error_msg
-def process_audio(audio_file, separation_mode, progress=gr.Progress()):
-    """Función principal para procesar audio"""
-    if audio_file is None:
-        return [], "⚠️ Por favor sube un archivo de audio"
-    progress(0.1, desc="Inicializando...")
-    try:
-        separator = AudioSeparator()
-        progress(0.3, desc="Separando audio...")
-        output_files, status = separator.separate_complete(audio_file, separation_mode)
-        progress(1.0, desc="¡Completado!")
-        return output_files, status
     except Exception as e:
-        error_msg = f"❌ Error: {str(e)}"
-        return [], error_msg
-# Crear interfaz Gradio
 def create_interface():
     with gr.Blocks(
         title="🎵 Audio Separator Pro",
         theme=gr.themes.Soft(),
@@ -381,33 +647,39 @@ def create_interface():
             max-width: 1200px !important;
         }
         """
-    ) as demo:
-        gr.Markdown("""
-        # 🎵 Audio Separator Pro
-        ### Separador de audio inteligente usando técnicas avanzadas de procesamiento de señales
-        """)
         with gr.Row():
-            with gr.Column(scale=1):
                 audio_input = gr.Audio(
                     label="🎵 Subir archivo de audio",
                     type="filepath",
                     format="wav"
                 )
-                separation_mode = gr.Radio(
-                    label="🎛️ Modo de separación",
-                    choices=[
-                        ("🚀 Rápido (Voces + Instrumental)", "quick"),
-                        ("🎯 Completo (4 stems)", "complete"),
-                        ("🎤 Solo Voces", "vocals_only"),
-                        ("🥁 Solo Batería", "drums_only"),
-                        ("🎸 Solo Bajo", "bass_only")
-                    ],
-                    value="quick",
-                    info="Selecciona el tipo de separación que deseas"
-                )
                 process_btn = gr.Button(
                     "🚀 Separar Audio",
@@ -418,7 +690,7 @@ def create_interface():
             with gr.Column(scale=1):
                 status_output = gr.Textbox(
                     label="📊 Estado del procesamiento",
-                    lines=8,
                     interactive=False,
                     info="Aquí verás el progreso de la separación"
                 )
@@ -429,46 +701,145 @@ def create_interface():
             interactive=False
         )
-        gr.Markdown("""
-        ### 📝 Instrucciones:
-        1. **Sube tu archivo de audio** (formato: WAV, MP3, FLAC - máximo 50MB)
-        2. **Selecciona el modo de separación** según tus necesidades
-        3. **Haz clic en "Separar Audio"** y espera el procesamiento
-        4. **Descarga los archivos** generados
-        ### 🎯 Modos disponibles:
-        - **🚀 Rápido**: Separa voces del instrumental (2 archivos)
-        - **🎯 Completo**: Separa en voces, batería, bajo y otros (4 archivos)
-        - **🎤 Solo Voces**: Extrae únicamente las voces
-        - **🥁 Solo Batería**: Extrae únicamente la batería
-        - **🎸 Solo Bajo**: Extrae únicamente el bajo
-        ### ⚡ Características:
-        - ✅ Procesamiento con IA usando arquitectura MDX-Net
-        - ✅ Optimización automática para cada tipo de instrumento
-        - ✅ Filtros de frecuencia especializados
-        - ✅ Normalización automática de audio
-        - ✅ Soporte para archivos largos (procesamiento por chunks)
-        """)
         # Configurar eventos
         process_btn.click(
-            fn=process_audio,
-            inputs=[audio_input, separation_mode],
             outputs=[output_files, status_output],
             show_progress=True
         )
-    return demo
 if __name__ == "__main__":
-    print("🎵 Iniciando Audio Separator Pro")
-    print(f"🔧 PyTorch: {torch.__version__}")
-    print(f"🔧 CUDA disponible: {torch.cuda.is_available()}")
-    demo = create_interface()
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True
-    )

 import os
 import gc
+import hashlib
+import queue
+import threading
+import json
+import sys
+import shlex
+import subprocess
 import librosa
+import numpy as np
 import soundfile as sf
 import torch
 from tqdm import tqdm
+import random
+import spaces
+import onnxruntime as ort
+import warnings
+import gradio as gr
+import logging
+import time
+import traceback
+import tempfile
+from pathlib import Path
+# Global configuration: silence library warnings and log at INFO level
 warnings.filterwarnings("ignore")
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Instalar onnxruntime-gpu si está disponible
+try:
+    os.system("pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/")
+except:
+    logger.warning("No se pudo instalar ort-nightly-gpu, usando CPU")
+# HTML heading and Markdown blurb (user-facing text, kept in Spanish)
+title = "<center><strong><font size='7'>🎵 Audio Separator Pro</font></strong></center>"
+description = """
+### 🚀 Separador de audio avanzado usando modelos MDX-Net
+- **Funciona garantizado** - Basado en el código exitoso de r3gm
+- **Separación de alta calidad** - Voces + Instrumental con efectos opcionales
+- **Procesamiento inteligente** - Optimizado para diferentes tipos de audio
+"""
+# Model configuration
+# Maps a model's primary stem name to the name used for the inverted
+# (complementary) output file written by run_mdx.
+stem_naming = {
+    "Vocals": "Instrumental",
+    "Other": "Instruments",
+    "Instrumental": "Vocals",
+    "Drums": "Drumless",
+    "Bass": "Bassless",
+}
+# Model download URLs: each file name below is appended to this base link
+MDX_DOWNLOAD_LINK = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/"
+UVR_MODELS = [
+    "UVR-MDX-NET-Voc_FT.onnx",
+    "UVR_MDXNET_KARA_2.onnx",
+    "Reverb_HQ_By_FoxJoy.onnx",
+    "UVR-MDX-NET-Inst_HQ_4.onnx",
+]
+# Directories (relative to the working directory)
+BASE_DIR = "."
+mdxnet_models_dir = os.path.join(BASE_DIR, "mdx_models")
+output_dir = os.path.join(BASE_DIR, "separated_audio")
+# Holds the spectrogram geometry of one MDX model and converts between
+# fixed-size waveform chunks and the 4-channel spectrogram tensors built below.
+class MDXModel:
+    # device: torch device on which the window and padding tensors are created.
+    # dim_f / dim_t: number of frequency bins / time frames kept per chunk.
+    # n_fft / hop: STFT size and hop length in samples.
+    # stem_name / compensation: stored as-is for the caller.
+    def __init__(self, device, dim_f, dim_t, n_fft, hop=1024, stem_name=None, compensation=1.000):
         self.dim_f = dim_f
         self.dim_t = dim_t
+        self.dim_c = 4
         self.n_fft = n_fft
         self.hop = hop
+        self.stem_name = stem_name
+        self.compensation = compensation
+        # Number of frequency bins of a one-sided STFT of size n_fft.
+        self.n_bins = self.n_fft // 2 + 1
+        # Samples per chunk; stft() reshapes its input to this length.
+        self.chunk_size = hop * (self.dim_t - 1)
+        self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(device)
+        out_c = self.dim_c
+        # Zeros used by istft() to restore the bins that stft() cuts off above dim_f.
+        self.freq_pad = torch.zeros([1, out_c, self.n_bins - self.dim_f, self.dim_t]).to(device)
+    # Waveform chunks -> real-valued spectrogram of shape [-1, 4, dim_f, dim_t].
+    # NOTE(review): the [-1, 2, 2, ...] reshape presumably groups two audio
+    # channels with their real/imaginary parts - confirm against the caller.
     def stft(self, x):
+        x = x.reshape([-1, self.chunk_size])
+        x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True, return_complex=True)
+        # Split complex values into a trailing (real, imag) axis, then move it forward.
+        x = torch.view_as_real(x)
+        x = x.permute([0, 3, 1, 2])
+        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, 4, self.n_bins, self.dim_t])
+        # Keep only the lowest dim_f frequency bins.
+        return x[:, :, : self.dim_f]
+    # Inverse of stft(): pads the frequency axis back to n_bins (with freq_pad or
+    # self.freq_pad repeated per batch item) and returns chunks shaped [-1, 2, chunk_size].
+    def istft(self, x, freq_pad=None):
+        freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1]) if freq_pad is None else freq_pad
+        x = torch.cat([x, freq_pad], -2)
+        x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape([-1, 2, self.n_bins, self.dim_t])
+        x = x.permute([0, 2, 3, 1])
+        # view_as_complex needs a contiguous tensor whose last axis has size 2.
+        x = x.contiguous()
+        x = torch.view_as_complex(x)
+        x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True)
+        return x.reshape([-1, 2, self.chunk_size])
+class MDX:
+    DEFAULT_SR = 44100
+    DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
+    DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR
+    def __init__(self, model_path: str, params: MDXModel, processor=0):
+        # Configurar dispositivo
+        self.device = torch.device(f"cuda:{processor}") if processor >= 0 else torch.device("cpu")
+        self.provider = ["CUDAExecutionProvider"] if processor >= 0 else ["CPUExecutionProvider"]
+        self.model = params
+        try:
+            # Cargar modelo ONNX
+            self.ort = ort.InferenceSession(model_path, providers=self.provider)
+            # Precargar modelo
+            dummy_input = torch.rand(1, 4, params.dim_f, params.dim_t).numpy()
+            self.ort.run(None, {"input": dummy_input})
+            self.process = lambda spec: self.ort.run(None, {"input": spec.cpu().numpy()})[0]
+            logger.info(f"✅ Modelo cargado: {model_path}")
+        except Exception as e:
+            logger.error(f"❌ Error cargando modelo: {e}")
+            raise
+        self.prog = None
+    @staticmethod
+    def get_hash(model_path):
+        try:
+            with open(model_path, "rb") as f:
+                f.seek(-10000 * 1024, 2)
+                model_hash = hashlib.md5(f.read()).hexdigest()
+        except:
+            model_hash = hashlib.md5(open(model_path, "rb").read()).hexdigest()
+        return model_hash
+    @staticmethod
+    def segment(wave, combine=True, chunk_size=DEFAULT_CHUNK_SIZE, margin_size=DEFAULT_MARGIN_SIZE):
+        if combine:
+            processed_wave = None
+            for segment_count, segment in enumerate(wave):
+                start = 0 if segment_count == 0 else margin_size
+                end = None if segment_count == len(wave) - 1 else -margin_size
+                if margin_size == 0:
+                    end = None
+                if processed_wave is None:
+                    processed_wave = segment[:, start:end]
+                else:
+                    processed_wave = np.concatenate((processed_wave, segment[:, start:end]), axis=-1)
+        else:
+            processed_wave = []
+            sample_count = wave.shape[-1]
+            if chunk_size <= 0 or chunk_size > sample_count:
+                chunk_size = sample_count
+            if margin_size > chunk_size:
+                margin_size = chunk_size
+            for segment_count, skip in enumerate(range(0, sample_count, chunk_size)):
+                margin = 0 if segment_count == 0 else margin_size
+                end = min(skip + chunk_size + margin_size, sample_count)
+                start = skip - margin
+                cut = wave[:, start:end].copy()
+                processed_wave.append(cut)
+                if end == sample_count:
+                    break
+        return processed_wave
+    def pad_wave(self, wave):
+        n_sample = wave.shape[1]
+        trim = self.model.n_fft // 2
+        gen_size = self.model.chunk_size - 2 * trim
+        pad = gen_size - n_sample % gen_size
+        wave_p = np.concatenate((
+            np.zeros((2, trim)),
+            wave,
+            np.zeros((2, pad)),
+            np.zeros((2, trim)),
+        ), 1)
+        mix_waves = []
+        for i in range(0, n_sample + pad, gen_size):
+            waves = np.array(wave_p[:, i:i + self.model.chunk_size])
+            mix_waves.append(waves)
+        mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(self.device)
+        return mix_waves, pad, trim
+    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):
+        mix_waves = mix_waves.split(1)
+        with torch.no_grad():
+            pw = []
+            for mix_wave in mix_waves:
+                if self.prog:
+                    self.prog.update()
+                spec = self.model.stft(mix_wave)
+                processed_spec = torch.tensor(self.process(spec))
+                processed_wav = self.model.istft(processed_spec.to(self.device))
+                processed_wav = processed_wav[:, :, trim:-trim].transpose(0, 1).reshape(2, -1).cpu().numpy()
+                pw.append(processed_wav)
+        processed_signal = np.concatenate(pw, axis=-1)[:, :-pad]
+        q.put({_id: processed_signal})
+        return processed_signal
+    def process_wave(self, wave: np.array, mt_threads=1):
+        self.prog = tqdm(total=0, desc="Procesando audio")
+        chunk = wave.shape[-1] // mt_threads
+        waves = self.segment(wave, False, chunk)
+        q = queue.Queue()
+        threads = []
+        for c, batch in enumerate(waves):
+            mix_waves, pad, trim = self.pad_wave(batch)
+            self.prog.total = len(mix_waves) * mt_threads
+            thread = threading.Thread(target=self._process_wave, args=(mix_waves, trim, pad, q, c))
+            thread.start()
+            threads.append(thread)
+        for thread in threads:
+            thread.join()
+        if self.prog:
+            self.prog.close()
+        processed_batches = []
+        while not q.empty():
+            processed_batches.append(q.get())
+        processed_batches = [list(wave.values())[0] for wave in sorted(processed_batches, key=lambda d: list(d.keys())[0])]
+        assert len(processed_batches) == len(waves), "Error: Procesamiento incompleto"
+        return self.segment(processed_batches, True, chunk)
+def create_directories():
+    """Crear directorios necesarios"""
+    os.makedirs(mdxnet_models_dir, exist_ok=True)
+    os.makedirs(output_dir, exist_ok=True)
+def download_models():
+    """Descargar modelos necesarios"""
+    try:
+        for model in UVR_MODELS:
+            model_path = os.path.join(mdxnet_models_dir, model)
+            if not os.path.exists(model_path):
+                logger.info(f"📥 Descargando {model}...")
+                download_url = MDX_DOWNLOAD_LINK + model
+                # Usar curl o wget para descargar
+                try:
+                    subprocess.run([
+                        "curl", "-L", "-o", model_path, download_url
+                    ], check=True, capture_output=True)
+                    logger.info(f"✅ Descargado: {model}")
+                except subprocess.CalledProcessError:
+                    try:
+                        subprocess.run([
+                            "wget", "-O", model_path, download_url
+                        ], check=True, capture_output=True)
+                        logger.info(f"✅ Descargado: {model}")
+                    except subprocess.CalledProcessError as e:
+                        logger.error(f"❌ Error descargando {model}: {e}")
+                        return False
+            else:
+                logger.info(f"✅ Modelo ya existe: {model}")
+        # Crear data.json si no existe
+        data_json_path = os.path.join(mdxnet_models_dir, "data.json")
+        if not os.path.exists(data_json_path):
+            create_data_json(data_json_path)
+        return True
+    except Exception as e:
+        logger.error(f"❌ Error en descarga de modelos: {e}")
+        return False
+def create_data_json(data_json_path):
+    """Crear archivo data.json con configuraciones de modelos"""
+    model_data = {}
+    # Calcular hashes y configuraciones para cada modelo
+    for model in UVR_MODELS:
+        model_path = os.path.join(mdxnet_models_dir, model)
+        if os.path.exists(model_path):
+            model_hash = MDX.get_hash(model_path)
+            if "Voc_FT" in model:
+                model_data[model_hash] = {
+                    "compensate": 1.035,
+                    "mdx_dim_f_set": 2048,
+                    "mdx_dim_t_set": 8,
+                    "mdx_n_fft_scale_set": 6144,
+                    "primary_stem": "Vocals"
+                }
+            elif "KARA" in model:
+                model_data[model_hash] = {
+                    "compensate": 1.025,
+                    "mdx_dim_f_set": 2048,
+                    "mdx_dim_t_set": 8,
+                    "mdx_n_fft_scale_set": 6144,
+                    "primary_stem": "Vocals"
+                }
+            elif "Reverb" in model:
+                model_data[model_hash] = {
+                    "compensate": 1.035,
+                    "mdx_dim_f_set": 2048,
+                    "mdx_dim_t_set": 8,
+                    "mdx_n_fft_scale_set": 6144,
+                    "primary_stem": "Reverb"
+                }
+            elif "Inst_HQ" in model:
+                model_data[model_hash] = {
+                    "compensate": 1.035,
+                    "mdx_dim_f_set": 2048,
+                    "mdx_dim_t_set": 8,
+                    "mdx_n_fft_scale_set": 6144,
+                    "primary_stem": "Other"
+                }
+    with open(data_json_path, 'w') as f:
+        json.dump(model_data, f, indent=2)
+    logger.info(f"✅ Creado data.json con {len(model_data)} modelos")
+def convert_to_stereo_and_wav(audio_path):
+    """Convertir audio a estéreo WAV"""
+    try:
+        wave, sr = librosa.load(audio_path, mono=False, sr=44100)
+        if len(wave.shape) == 1 or audio_path.lower().endswith('.wav') == False:
+            stereo_path = os.path.join(output_dir, f"{Path(audio_path).stem}_stereo.wav")
+            # Usar FFmpeg para conversión
+            command = [
+                'ffmpeg', '-y', '-loglevel', 'error',
+                '-i', audio_path,
+                '-ac', '2', '-f', 'wav', stereo_path
+            ]
+            result = subprocess.run(command, capture_output=True, text=True)
+            if result.returncode == 0 and os.path.exists(stereo_path):
+                return stereo_path
+            else:
+                logger.warning(f"FFmpeg falló, usando librosa para {audio_path}")
+                # Fallback con librosa
+                if len(wave.shape) == 1:
+                    wave = np.stack([wave, wave])
+                sf.write(stereo_path, wave.T, 44100)
+                return stereo_path
+        else:
+            return audio_path
+    except Exception as e:
+        logger.error(f"Error convirtiendo audio: {e}")
+        return audio_path
+@spaces.GPU
+def run_mdx(model_params, output_dir, model_path, filename,
+           exclude_main=False, exclude_inversion=False, suffix=None,
+           invert_suffix=None, denoise=False, keep_orig=True,
+           m_threads=2, device_base="cuda"):
+    """Ejecutar separación MDX"""
+    try:
+        # Configurar dispositivo
+        if device_base == "cuda" and torch.cuda.is_available():
+            device = torch.device("cuda:0")
+            processor_num = 0
+            device_properties = torch.cuda.get_device_properties(device)
+            vram_gb = device_properties.total_memory / 1024**3
+            m_threads = 1 if vram_gb < 8 else (8 if vram_gb > 32 else 2)
+            logger.info(f"🔧 CUDA - Threads: {m_threads}, VRAM: {vram_gb:.1f}GB")
+        else:
+            device = torch.device("cpu")
+            processor_num = -1
+            m_threads = 1
+            logger.info("🔧 Usando CPU")
+        # Obtener parámetros del modelo
+        model_hash = MDX.get_hash(model_path)
+        mp = model_params.get(model_hash)
+        if not mp:
+            raise ValueError(f"Parámetros no encontrados para modelo {model_path}")
+        # Crear modelo
+        model = MDXModel(
+            device,
+            dim_f=mp["mdx_dim_f_set"],
+            dim_t=2 ** mp["mdx_dim_t_set"],
+            n_fft=mp["mdx_n_fft_scale_set"],
+            stem_name=mp["primary_stem"],
+            compensation=mp["compensate"],
+        )
+        # Crear sesión MDX
+        mdx_sess = MDX(model_path, model, processor=processor_num)
+        # Cargar y procesar audio
+        wave, sr = librosa.load(filename, mono=False, sr=44100)
+        # Normalizar
+        peak = max(np.max(wave), abs(np.min(wave)))
+        if peak > 0:
+            wave /= peak
+        # Procesar
+        if denoise:
+            wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (mdx_sess.process_wave(wave, m_threads))
+            wave_processed *= 0.5
+        else:
+            wave_processed = mdx_sess.process_wave(wave, m_threads)
+        # Restaurar peak original
+        wave_processed *= peak
+        # Guardar archivos
+        stem_name = model.stem_name if suffix is None else suffix
+        main_filepath = None
+        if not exclude_main:
+            main_filepath = os.path.join(
+                output_dir,
+                f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav"
+            )
+            sf.write(main_filepath, wave_processed.T, sr)
+            logger.info(f"✅ Guardado: {stem_name}")
+        invert_filepath = None
+        if not exclude_inversion:
+            diff_stem_name = stem_naming.get(stem_name) if invert_suffix is None else invert_suffix
+            stem_name = f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name
+            invert_filepath = os.path.join(
+                output_dir,
+                f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav"
+            )
+            inverted_audio = (-wave_processed.T * model.compensation) + wave.T
+            sf.write(invert_filepath, inverted_audio, sr)
+            logger.info(f"✅ Guardado: {stem_name}")
+        # Limpieza
+        if not keep_orig and os.path.exists(filename):
+            os.remove(filename)
+        del mdx_sess, wave_processed, wave
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return main_filepath, invert_filepath
+    except Exception as e:
+        logger.error(f"❌ Error en run_mdx: {e}")
+        traceback.print_exc()
+        raise
+def get_hash(filepath):
+    """Return a short content fingerprint for the file at ``filepath``.
+
+    The file is read in binary mode in 8192-byte chunks that are fed into a
+    BLAKE2b hash; the first 18 hexadecimal characters of the digest are
+    returned.
+    """
+    digest = hashlib.blake2b()
+    with open(filepath, "rb") as stream:
+        # iter() with a sentinel stops at the first empty read (end of file).
+        for block in iter(lambda: stream.read(8192), b""):
+            digest.update(block)
+    return digest.hexdigest()[:18]
+def process_uvr_task(orig_song_path: str, main_vocals: bool = False,
+                    dereverb: bool = True, song_id: str = "mdx",
+                    only_voiceless: bool = False):
+    """Run the MDX separation pipeline for a single song.
+
+    Model parameters are read from ``data.json`` inside ``mdxnet_models_dir``,
+    results are written under ``output_dir/<song_id>``, and the input is first
+    passed through ``convert_to_stereo_and_wav``.
+
+    Args:
+        orig_song_path: Path of the audio file to separate.
+        main_vocals: When true, additionally run the KARA model on the vocal
+            stem, producing "Backup" and "Main" outputs.
+        dereverb: When true, additionally run the reverb model on the
+            (main) vocal stem.
+        song_id: Name of the sub-directory of ``output_dir`` to write into.
+        only_voiceless: When true, run only the instrumental model and
+            return early.
+
+    Returns:
+        With ``only_voiceless`` set, whatever ``run_mdx`` returns for the
+        instrumental model. Otherwise the 5-tuple ``(vocals_path,
+        instrumentals_path, backup_vocals_path, main_vocals_path,
+        vocals_dereverb_path)``; a skipped or failed optional stage leaves
+        its outputs at their fallback (``None`` for the backup stem, the
+        previous stage's vocal path for the others).
+
+    Raises:
+        Exception: Any error outside the two optional stages is logged,
+            its traceback printed, and then re-raised unchanged.
+    """
+    try:
+        device_base = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"🔧 Dispositivo: {device_base}")
+
+        def model_file(name):
+            # Resolve a model/data file name inside the models directory.
+            return os.path.join(mdxnet_models_dir, name)
+
+        # Per-model parameter table consumed by run_mdx.
+        with open(model_file("data.json")) as params_file:
+            params = json.load(params_file)
+
+        stem_dir = os.path.join(output_dir, song_id)
+        os.makedirs(stem_dir, exist_ok=True)
+
+        wav_path = convert_to_stereo_and_wav(orig_song_path)
+
+        if only_voiceless:
+            logger.info("🎵 Separando instrumental...")
+            return run_mdx(
+                params,
+                stem_dir,
+                model_file("UVR-MDX-NET-Inst_HQ_4.onnx"),
+                wav_path,
+                suffix="Instrumental",
+                denoise=False,
+                keep_orig=True,
+                exclude_inversion=True,
+                device_base=device_base,
+            )
+
+        logger.info("🎤 Separando voces...")
+        vocals_path, instrumentals_path = run_mdx(
+            params,
+            stem_dir,
+            model_file("UVR-MDX-NET-Voc_FT.onnx"),
+            wav_path,
+            denoise=True,
+            keep_orig=True,
+            device_base=device_base,
+        )
+
+        # Fallback used when the stage is disabled or fails: no backup stem,
+        # and the full vocal stem stands in for the main vocals.
+        backup_vocals_path, main_vocals_path = None, vocals_path
+        if main_vocals:
+            logger.info("🎙️ Separando voces principales...")
+            try:
+                backup_vocals_path, main_vocals_path = run_mdx(
+                    params,
+                    stem_dir,
+                    model_file("UVR_MDXNET_KARA_2.onnx"),
+                    vocals_path,
+                    suffix="Backup",
+                    invert_suffix="Main",
+                    denoise=True,
+                    device_base=device_base,
+                )
+            except Exception as exc:
+                logger.warning(f"Error en separación principal: {exc}")
+
+        # Fallback used when the stage is disabled or fails: keep the vocals
+        # from the previous stage.
+        vocals_dereverb_path = main_vocals_path
+        if dereverb:
+            logger.info("🔄 Eliminando reverb...")
+            try:
+                _, vocals_dereverb_path = run_mdx(
+                    params,
+                    stem_dir,
+                    model_file("Reverb_HQ_By_FoxJoy.onnx"),
+                    main_vocals_path,
+                    invert_suffix="DeReverb",
+                    exclude_main=True,
+                    denoise=True,
+                    device_base=device_base,
+                )
+            except Exception as exc:
+                logger.warning(f"Error eliminando reverb: {exc}")
+
+        return (
+            vocals_path,
+            instrumentals_path,
+            backup_vocals_path,
+            main_vocals_path,
+            vocals_dereverb_path,
+        )
+    except Exception as exc:
+        logger.error(f"❌ Error en process_uvr_task: {exc}")
+        traceback.print_exc()
+        raise
+@spaces.GPU
+def sound_separate(media_file, stem="vocal", main=False, dereverb=True):
+    """Separate an audio file and return the paths of the generated stems.
+
+    Args:
+        media_file: Path of the audio file to process.
+        stem: ``"vocal"`` to produce a vocal stem plus the instrumental, or
+            ``"background"`` to produce only the instrumental.
+        main: Forwarded to ``process_uvr_task`` as ``main_vocals``
+            (vocal mode only).
+        dereverb: Forwarded to ``process_uvr_task`` as ``dereverb``
+            (vocal mode only).
+
+    Returns:
+        A list with the paths of the output files that exist on disk. In
+        vocal mode the (de-reverbed, if available) vocal path comes first,
+        followed by the instrumental path.
+
+    Raises:
+        ValueError: If ``media_file`` or ``stem`` is empty, or if anything
+            fails during processing (file over 100 MB, separation error, no
+            output produced). Processing failures are wrapped with an
+            "Error en separación" prefix and chained to the original error.
+    """
+    if not media_file:
+        raise ValueError("⚠️ No se proporcionó archivo de audio")
+    if not stem:
+        raise ValueError("⚠️ Selecciona tipo de separación (vocal/background)")
+    try:
+        # Reject oversized uploads before doing any heavy work.
+        file_size = os.path.getsize(media_file) / (1024 * 1024)  # MB
+        if file_size > 100:  # 100 MB limit
+            raise ValueError(f"❌ Archivo muy grande: {file_size:.1f}MB (máximo 100MB)")
+        # The content hash names the per-song output directory.
+        hash_audio = get_hash(media_file)
+        song_id = hash_audio + "_separated"
+        outputs = []
+        start_time = time.time()
+        if stem == "vocal":
+            logger.info("🎤 Iniciando separación de voces...")
+            result = process_uvr_task(
+                orig_song_path=media_file,
+                song_id=song_id,
+                main_vocals=main,
+                dereverb=dereverb,
+                only_voiceless=False
+            )
+            if isinstance(result, tuple) and len(result) >= 5:
+                # Slice so that a longer tuple cannot break the unpacking.
+                (vocals_path, instrumentals_path, backup_vocals_path,
+                 main_vocals_path, vocals_dereverb_path) = result[:5]
+                final_vocal_path = vocals_dereverb_path if vocals_dereverb_path else vocals_path
+                if final_vocal_path and os.path.exists(final_vocal_path):
+                    outputs.append(final_vocal_path)
+                if instrumentals_path and os.path.exists(instrumentals_path):
+                    outputs.append(instrumentals_path)
+        elif stem == "background":
+            logger.info("🎵 Iniciando separación de instrumental...")
+            result = process_uvr_task(
+                orig_song_path=media_file,
+                song_id=song_id,
+                only_voiceless=True
+            )
+            # In this mode process_uvr_task hands back run_mdx's
+            # (main_filepath, invert_filepath) pair. Passing the tuple itself
+            # to os.path.exists raises TypeError, so take the first element.
+            if isinstance(result, (tuple, list)):
+                instrumental_path = result[0] if result else None
+            else:
+                instrumental_path = result
+            if instrumental_path and os.path.exists(instrumental_path):
+                outputs.append(instrumental_path)
+        end_time = time.time()
+        execution_time = end_time - start_time
+        logger.info(f"⏱️ Tiempo de ejecución: {execution_time:.1f} segundos")
+        if not outputs:
+            raise Exception("❌ No se generaron archivos de salida")
+        logger.info(f"✅ Separación exitosa: {len(outputs)} archivos")
+        return outputs
     except Exception as e:
+        error_msg = f"❌ Error en separación: {str(e)}"
+        logger.error(error_msg)
+        traceback.print_exc()
+        # Chain the original exception so the root cause stays visible.
+        raise ValueError(error_msg) from e
 def create_interface():
+    """Crear interfaz Gradio"""
     with gr.Blocks(
         title="🎵 Audio Separator Pro",
         theme=gr.themes.Soft(),
             max-width: 1200px !important;
         }
         """
+    ) as app:
+        gr.Markdown(title)
+        gr.Markdown(description)
         with gr.Row():
+            with gr.Column(scale=2):
                 audio_input = gr.Audio(
                     label="🎵 Subir archivo de audio",
                     type="filepath",
                     format="wav"
                 )
+                with gr.Row():
+                    stem_choice = gr.Radio(
+                        choices=["vocal", "background"],
+                        value="vocal",
+                        label="🎛️ Tipo de separación",
+                        info="Selecciona qué quieres extraer"
+                    )
+                with gr.Row():
+                    main_vocals_check = gr.Checkbox(
+                        label="🎙️ Separar voces principales",
+                        value=False,
+                        info="Separar voces principales de coros (solo para voces)"
+                    )
+                    dereverb_check = gr.Checkbox(
+                        label="🔄 Eliminar reverb",
+                        value=True,
+                        info="Mejorar claridad de voces eliminando reverb"
+                    )
                 process_btn = gr.Button(
                     "🚀 Separar Audio",
             with gr.Column(scale=1):
                 status_output = gr.Textbox(
                     label="📊 Estado del procesamiento",
+                    lines=10,
                     interactive=False,
                     info="Aquí verás el progreso de la separación"
                 )
             interactive=False
         )
+        # Función para mostrar/ocultar opciones según el tipo
+        def update_visibility(stem_type):
+            """Return visibility updates for the two vocal-only checkboxes.
+
+            Both updates are visible when ``stem_type`` is ``"vocal"`` and
+            hidden for any other value.
+            """
+            show_vocal_options = stem_type == "vocal"
+            return (
+                gr.update(visible=show_vocal_options),
+                gr.update(visible=show_vocal_options),
+            )
+        stem_choice.change(
+            fn=update_visibility,
+            inputs=[stem_choice],
+            outputs=[main_vocals_check, dereverb_check]
+        )
+        # Función de procesamiento con manejo de errores mejorado
+        def process_audio_wrapper(audio_file, stem, main, dereverb, progress=gr.Progress()):
+            """Run the separation for the UI and report a status message.
+
+            Returns a ``(files, message)`` pair. ``files`` is the list
+            returned by ``sound_separate`` on success and an empty list when
+            no file was given, the model download failed, or any exception
+            was raised; exceptions are logged and turned into the message
+            instead of propagating.
+            """
+            if audio_file is None:
+                return [], "⚠️ Por favor sube un archivo de audio"
+            try:
+                progress(0.1, desc="Inicializando...")
+                # Download the models only if at least one file is missing.
+                models_missing = any(
+                    not os.path.exists(os.path.join(mdxnet_models_dir, model_name))
+                    for model_name in UVR_MODELS
+                )
+                if models_missing:
+                    progress(0.2, desc="Descargando modelos...")
+                    downloaded = download_models()
+                    if not downloaded:
+                        return [], "❌ Error descargando modelos"
+                progress(0.4, desc="Separando audio...")
+                separated = sound_separate(
+                    media_file=audio_file,
+                    stem=stem,
+                    main=main,
+                    dereverb=dereverb
+                )
+                progress(1.0, desc="¡Completado!")
+                return separated, f"✅ Separación exitosa: {len(separated)} archivo(s) generado(s)"
+            except Exception as exc:
+                failure = f"❌ Error: {exc!s}"
+                logger.error(failure)
+                return [], failure
         # Configurar eventos
         process_btn.click(
+            fn=process_audio_wrapper,
+            inputs=[audio_input, stem_choice, main_vocals_check, dereverb_check],
             outputs=[output_files, status_output],
             show_progress=True
         )
+        # Ejemplos
+        gr.Examples(
+            examples=[
+                ["./test.mp3", "vocal", False, True],
+                ["./test.mp3", "background", False, False],
+            ],
+            inputs=[audio_input, stem_choice, main_vocals_check, dereverb_check],
+            outputs=[output_files, status_output],
+            fn=process_audio_wrapper,
+            cache_examples=False,
+        )
+        gr.Markdown("""
+        ### 📝 Instrucciones de uso:
+        1. **📁 Sube tu archivo de audio** (formatos: MP3, WAV, FLAC, M4A - máximo 100MB)
+        2. **🎛️ Selecciona el tipo de separación:**
+           - **🎤 Vocal**: Extrae las voces del audio
+           - **🎵 Background**: Extrae el instrumental (sin voces)
+        3. **⚙️ Configura opciones avanzadas** (solo para voces):
+           - **🎙️ Separar voces principales**: Separa voces principales de coros
+           - **🔄 Eliminar reverb**: Mejora la claridad eliminando reverb
+        4. **🚀 Haz clic en "Separar Audio"** y espera el procesamiento
+        5. **📥 Descarga los archivos** generados
+        ### 🎯 Características:
+        - ✅ **Modelos MDX-Net de alta calidad** - Misma tecnología que el separador exitoso de r3gm
+        - ✅ **Separación inteligente** - Optimizada para voces e instrumentales
+        - ✅ **Procesamiento GPU/CPU** - Automáticamente optimizado según hardware disponible
+        - ✅ **Múltiples formatos** - Soporta MP3, WAV, FLAC, M4A
+        - ✅ **Descarga automática** - Los modelos se descargan automáticamente
+        - ✅ **Calidad profesional** - Resultados comparables a software comercial
+        ### ⚡ Rendimiento:
+        - **GPU**: Procesamiento rápido con CUDA
+        - **CPU**: Funciona en cualquier hardware
+        - **Memoria**: Optimizado para archivos grandes
+        - **Calidad**: Separación de alta fidelidad
+        ### 🔧 Tecnología:
+        - **MDX-Net**: Arquitectura de red neuronal especializada
+        - **ONNX Runtime**: Inferencia optimizada
+        - **Torch**: Procesamiento de tensores
+        - **Librosa**: Análisis de audio avanzado
+        """)
+    return app
+def main():
+    """Prepare directories and models, then build and launch the Gradio app.
+
+    Logs the PyTorch version and CUDA availability, calls
+    ``create_directories``, and downloads the UVR models when any file listed
+    in ``UVR_MODELS`` is missing from ``mdxnet_models_dir`` (returning early
+    if the download reports failure). It then builds the interface and
+    launches it on 0.0.0.0:7860. Any exception is logged and its traceback
+    printed; it is not re-raised.
+    """
+    try:
+        logger.info("🎵 Iniciando Audio Separator Pro")
+        logger.info(f"🔧 PyTorch: {torch.__version__}")
+        logger.info(f"🔧 CUDA disponible: {torch.cuda.is_available()}")
+
+        create_directories()
+
+        logger.info("📥 Verificando modelos...")
+        models_missing = any(
+            not os.path.exists(os.path.join(mdxnet_models_dir, model_name))
+            for model_name in UVR_MODELS
+        )
+        if not models_missing:
+            logger.info("✅ Todos los modelos están disponibles")
+        else:
+            logger.info("📥 Descargando modelos...")
+            if not download_models():
+                logger.error("❌ Error descargando modelos")
+                return
+
+        demo = create_interface()
+        demo.queue(default_concurrency_limit=10)
+        launch_options = {
+            "server_name": "0.0.0.0",
+            "server_port": 7860,
+            "share": True,
+            "show_error": True,
+            "quiet": False,
+        }
+        demo.launch(**launch_options)
+    except Exception as exc:
+        logger.error(f"❌ Error en main: {exc}")
+        traceback.print_exc()
+# Start the application only when this file is executed as a script.
 if __name__ == "__main__":
+    main()