# Spaces: anggars/tunebase (status: Sleeping)
# File size: 10,769 Bytes

import torch
import torchaudio

# Force the soundfile backend to avoid the TorchCodec requirement on Windows.
# This is a best-effort call: on newer torchaudio versions the function may be
# missing or behave differently, in which case we simply carry on. Only
# ordinary errors are suppressed, so Ctrl+C / SystemExit still propagate.
try:
    torchaudio.set_audio_backend("soundfile")
except Exception:
    pass

from demucs.apply import apply_model
from demucs.pretrained import get_model
import os
import pathlib

# Model configuration
# Maps each supported separation mode to the name of the pretrained Demucs
# model that serves it (several modes may share one model).
MODELS = {
    "2stem": "htdemucs",    # Will merge to 2 stems after
    "4stem": "htdemucs",    # Default 4 stem (drums, bass, vocals, other)
    "6stem": "htdemucs_6s", # Full 6 stem with guitar and piano
}

class AudioSeparator:
    def __init__(self):
        """Load every distinct model named in ``MODELS`` onto the best device.

        Sets ``self.device`` to ``"cuda"`` when CUDA is available, otherwise
        ``"cpu"``, and fills ``self.models`` with one loaded model per unique
        model name.
        """
        self.models = {}
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"

        # Several modes point at the same model name; load each name only once.
        for checkpoint in set(MODELS.values()):
            print(f"Loading Demucs Model: {checkpoint}...")
            loaded = get_model(checkpoint)
            loaded.to(self.device)
            self.models[checkpoint] = loaded

        print(f"All models loaded on {self.device}")

    def separate(self, audio_path: str, output_dir: str, callback=None, mode="4stem"):
        """
        Separate an audio file into stems and write each stem as an MP3 file.

        Args:
            audio_path: Path of the input audio file (read with soundfile).
            output_dir: Directory that receives the stem files; created if missing.
            callback: Optional ``callback(message, percent)`` progress hook.
            mode: ``"2stem"`` (vocals + summed instruments), ``"4stem"`` (one
                file per model source) or ``"6stem"`` (one file per source, with
                the guitar source routed through ``_process_guitar``). Any other
                value uses the default model and is handled like ``"4stem"``.

        Returns:
            ``(results, input_duration)`` where ``results`` maps stem names to
            the written file paths and ``input_duration`` is the length of the
            input file in seconds.

        Raises:
            ValueError: If the input file contains no audio frames.
        """
        def report(message, percent):
            if callback:
                callback(message, percent)

        target_sr = 44100  # rate the audio is resampled to before inference

        # Select model based on mode
        model_name = MODELS.get(mode, "htdemucs")
        model = self.models[model_name]

        # Load audio using soundfile directly (bypass torchaudio backend issues)
        import soundfile as sf
        wav_np, input_sr = sf.read(audio_path, dtype="float32", always_2d=True)
        if wav_np.shape[0] == 0:
            raise ValueError(f"No audio frames found in {audio_path!r}")

        # Duration must use the file's own rate, not the post-resampling rate.
        input_duration = len(wav_np) / input_sr

        # (frames, channels) -> (channels, frames)
        wav = torch.from_numpy(wav_np).t().contiguous()

        # Match the channel count the model was trained with (stereo unless the
        # model says otherwise): duplicate mono, drop surplus channels.
        channels = getattr(model, "audio_channels", 2)
        if wav.shape[0] != channels:
            if channels == 1:
                wav = wav.mean(dim=0, keepdim=True)
            elif wav.shape[0] == 1:
                wav = wav.repeat(channels, 1)
            else:
                wav = wav[:channels]

        # Resample if needed
        if input_sr != target_sr:
            report("Resampling audio...", 15)
            wav = torchaudio.transforms.Resample(input_sr, target_sr)(wav)
        sr = target_sr

        # Standardise the input using the statistics of the mono mix. The same
        # statistics are used below to bring the stems back to the input scale.
        ref = wav.mean(0)
        mean = ref.mean().item()
        std = max(ref.std().item(), 1e-8)  # guard against silent input
        mix = ((wav - mean) / std).unsqueeze(0).to(self.device)

        report("Running Demucs Inference...", 20)
        print(f"Starting separation with {model_name} (mode: {mode})...")

        sources = apply_model(model, mix, shifts=1, split=True, overlap=0.25, progress=True)[0]
        # Undo the standardisation so stems keep their original relative levels.
        sources = sources * std + mean

        source_names = model.sources
        total_sources = len(source_names)
        source_tensors = dict(zip(source_names, sources))

        results = {}
        os.makedirs(output_dir, exist_ok=True)

        def save_stem(name, tensor):
            stem_path = os.path.join(output_dir, f"{name}.mp3")
            self._save_audio(tensor, sr, stem_path)
            results[name] = stem_path

        if mode == "2stem":
            # Merge to Vocals + Instruments
            report("Merging to 2 stems...", 45)

            vocals = source_tensors.get('vocals')
            backing = [src for name, src in source_tensors.items() if name != 'vocals']

            if vocals is not None:
                save_stem("vocals", vocals)
            if backing:
                save_stem("instruments", torch.stack(backing).sum(dim=0))
        else:
            # 4stem / 6stem: one file per source; only 6stem post-processes guitar.
            for i, (name, source) in enumerate(source_tensors.items()):
                report(f"Saving stem: {name}", 30 + int((i / total_sources) * 20))

                if mode == "6stem" and name == 'guitar':
                    results.update(self._process_guitar(source, sr, output_dir))
                else:
                    save_stem(name, source)

        return results, input_duration

    def _process_guitar(self, source, sr, output_dir):
        """
        Memisahkan stem gitar menjadi Lead dan Rhythm menggunakan Mid-Side processing.
        - Mid (center) = Rhythm (biasanya power chords, strumming di center)
        - Side (stereo difference) = Lead (biasanya di-pan atau dengan stereo effects)
        """
        # source shape: (2, samples)
        
        # Check integrity
        if source.shape[0] < 2:
             print("Warning: Guitar stem is Mono. Cannot split Rhythm/Lead.")
             path = os.path.join(output_dir, "guitar.mp3")
             self._save_audio(source, sr, path)
             return {"guitar_rhythm": path, "guitar_lead": path}

        # Define Left and Right channels for processing
        left = source[0:1, :]
        right = source[1:2, :]

        # 1. Smart Spatial Split Check
        # Calculate correlation between L and R to detect Hard Panning (Math Rock Style)
        # If correlation is low, it means L and R are playing different things.
        # We assume Left = Rhythm (often dropped D/lower), Right = Lead (often ornate/higher) - OR provide both as is.
        
        # Calculate cross-correlation at lag 0
        mean_l = left.mean()
        mean_r = right.mean()
        var_l = ((left - mean_l)**2).mean()
        var_r = ((right - mean_r)**2).mean()
        cov = ((left - mean_l) * (right - mean_r)).mean()
        
        correlation = 0.0
        if var_l > 0 and var_r > 0:
            correlation = cov / torch.sqrt(var_l * var_r)
            
        print(f"Guitar Stereo Correlation: {correlation:.4f}")
        
        # Threshold for "Wide Stereo"
        if abs(correlation) < 0.6:
            print("Detected Wide Stereo Guitar (Math Rock Style). Using Spatial Split (L=Rhythm, R=Lead).")
            # Force Hard Split
            # Rhythm = Left Channel (Duplicated to Stereo)
            # Lead = Right Channel (Duplicated to Stereo)
            rhythm_stereo = torch.cat([left, left], dim=0)
            lead_stereo = torch.cat([right, right], dim=0)
        else:
            print("Detected Narrow/Mono Guitar. Using Mid-Side Frequency Split.")
            # Standard Mid-Side with Tighter Filters
            
            mid = (left + right) / 2.0
            side = (left - right) / 2.0
            
            try:
                import scipy.signal as signal
                nyquist = sr / 2
                
                # Rhythm: Low-Mid focus (80-1500Hz) - tighter top end
                # To distinguish from lead which often occupies 800+
                rhythm_low = 80 / nyquist
                rhythm_high = 1200 / nyquist
                b_r, a_r = signal.butter(4, [rhythm_low, rhythm_high], btype='band')
                
                # Lead: High-Mid focus (1000-8000Hz)
                lead_low = 1000 / nyquist
                lead_high = 8000 / nyquist
                b_l, a_l = signal.butter(4, [lead_low, lead_high], btype='band')
                
                # Apply to Mid (Center info usually has both, but we try to separate by freq)
                # We interpret 'Mid' as the main source.
                rhythm_from_mid = signal.filtfilt(b_r, a_r, mid.numpy())
                lead_from_mid = signal.filtfilt(b_l, a_l, mid.numpy())
                
                # Reconstruct
                # If Side exists (some stereo), add it to Lead (often spatial effects are on lead)
                side_np = side.numpy()
                
                rhythm_final = rhythm_from_mid 
                lead_final = lead_from_mid + (side_np * 1.5) # Boost side for lead
                
                rhythm_stereo = torch.from_numpy(rhythm_final).float()
                rhythm_stereo = torch.cat([rhythm_stereo, rhythm_stereo], dim=0)
                
                lead_stereo = torch.from_numpy(lead_final).float()
                lead_stereo = torch.cat([lead_stereo, lead_stereo], dim=0)
                
            except Exception as e:
                print(f"Filter failed: {e}. Fallback to raw.")
                rhythm_stereo = torch.cat([left, left], dim=0)
                lead_stereo = torch.cat([right, right], dim=0)
        
        # Normalize
        def normalize(tensor):
            peak = tensor.abs().max()
            if peak > 0:
                target_peak = 0.89  # -1dB
                return tensor * (target_peak / peak)
            return tensor

        rhythm_stereo = normalize(rhythm_stereo)
        lead_stereo = normalize(lead_stereo)
        
        # MERGE TO SINGLE STEREO FILE (L=Rhythm, R=Lead)
        # We take the Left channel of the Rhythm stereo mix (which is mono-ish)
        # And the Right channel of the Lead stereo mix (which is mono-ish)
        # Or better: Just use the Mono mix of each.
        
        rhythm_mono = rhythm_stereo.mean(dim=0, keepdim=True)
        lead_mono = lead_stereo.mean(dim=0, keepdim=True)
        
        # Combine: Channel 0 = Rhythm, Channel 1 = Lead
        guitar_split = torch.cat([rhythm_mono, lead_mono], dim=0)
        
        guitar_split = normalize(guitar_split)
        
        # Save as single file named "guitar.mp3" (Special split)
        path = os.path.join(output_dir, "guitar.mp3")
        self._save_audio(guitar_split, sr, path)
        
        return {
            "guitar": path 
        }

    def _save_audio(self, source, sr, path):
        """Write a (channels, samples) tensor to ``path`` with soundfile.

        The tensor is moved to the CPU first. If its absolute peak exceeds
        0.89 (roughly -1 dB) it is scaled down so the peak lands on 0.89;
        quieter audio is written untouched.
        """
        import soundfile as sf

        ceiling = 0.89  # approx -1dB
        audio = source.cpu()

        loudest = audio.abs().max()
        if loudest > ceiling:
            # Only attenuate; never boost quiet material.
            audio = audio / loudest * ceiling

        # soundfile expects (samples, channels), so transpose before writing.
        frames = audio.t().numpy()
        sf.write(path, frames, sr)