stem-separator

Sleeping

File size: 9,427 Bytes

7dfae77

import logging
import os
from typing import Callable

import numpy as np
import soundfile as sf
import torch
import torchaudio
import yaml
from pydub import AudioSegment
from huggingface_hub import hf_hub_download
from audio_separator.separator.uvr_lib_v5.roformer.bs_roformer import BSRoformer

# Module-level logger; handlers and levels are left to the hosting application.
logger = logging.getLogger(__name__)

# Select the compute device once, at import time: CUDA when torch reports it
# as available, otherwise CPU. Everything below in this module reads DEVICE.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {DEVICE}")

# CPU-only tuning: let torch use every core the OS reports.
if DEVICE.type == "cpu":
    # os.cpu_count() may return None, hence the fallback to a single thread.
    cpu_count = os.cpu_count() or 1
    torch.set_num_threads(cpu_count)
    # NOTE(review): PyTorch documents that set_num_interop_threads must be
    # called before any inter-op parallel work starts and may raise
    # RuntimeError otherwise — confirm this module is imported early enough.
    torch.set_num_interop_threads(max(1, cpu_count // 2))
    logger.info(f"CPU mode: set torch threads={cpu_count}, interop={max(1, cpu_count // 2)}")

# Hugging Face Hub coordinates of the checkpoint and its YAML config, plus the
# local directory both files are downloaded into.
MODEL_REPO = "jarredou/BS-ROFO-SW-Fixed"
MODEL_FILENAME = "BS-Rofo-SW-Fixed.ckpt"
MODEL_CONFIG = "BS-Rofo-SW-Fixed.yaml"
MODEL_DIR = "/tmp/models"

# Internal stem key -> user-facing stem label. The insertion order is
# significant: it matches the stem order of the model's training config.
STEM_NAME_MAP = dict(
    bass="Bass",
    drums="Drums",
    other="Other",
    vocals="Vocals",
    guitar="Guitar",
    piano="Piano",
)

# Stem keys in model-output order, taken from the mapping above so the two
# constants cannot drift apart.
STEM_ORDER = list(STEM_NAME_MAP)


class StemSeparatorService:
    """Process-wide singleton that separates audio into stems with BS-RoFormer.

    The model is downloaded and built lazily on the first call to
    ``load_model`` (``separate`` triggers it automatically) and then reused
    for every subsequent request.
    """

    _instance = None
    _model_loaded = False

    def __new__(cls):
        """Return the single shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def load_model(self):
        """Download, build and load the BS-RoFormer model (no-op once loaded).

        Sets ``self.sample_rate``, ``self.chunk_size``, ``self.num_overlap``
        and ``self.model`` from the downloaded YAML config and checkpoint.
        """
        if self._model_loaded:
            return

        os.makedirs(MODEL_DIR, exist_ok=True)
        logger.info(f"Downloading model from HF Hub: {MODEL_REPO}")

        hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME, local_dir=MODEL_DIR)
        hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_CONFIG, local_dir=MODEL_DIR)

        # Parse config.
        # SECURITY NOTE: FullLoader is applied to a file downloaded from the
        # Hub. It is kept because the config may rely on Python-specific YAML
        # tags that safe_load rejects — only point MODEL_REPO at trusted repos.
        with open(os.path.join(MODEL_DIR, MODEL_CONFIG), encoding="utf-8") as f:
            config = yaml.load(f, Loader=yaml.FullLoader)

        model_cfg = config["model"]
        audio_cfg = config.get("audio", {})
        inference_cfg = config.get("inference", {})

        self.sample_rate = audio_cfg.get("sample_rate", 44100)
        self.chunk_size = audio_cfg.get("chunk_size", 588800)
        self.num_overlap = inference_cfg.get("num_overlap", 2)

        # Use flash_attn only on CUDA where it's supported
        use_flash = DEVICE.type == "cuda"

        # Create model directly — bypass audio-separator's Separator wrapper entirely
        self.model = BSRoformer(
            dim=model_cfg["dim"],
            depth=model_cfg["depth"],
            stereo=model_cfg.get("stereo", True),
            num_stems=model_cfg.get("num_stems", 6),
            time_transformer_depth=model_cfg.get("time_transformer_depth", 1),
            freq_transformer_depth=model_cfg.get("freq_transformer_depth", 1),
            linear_transformer_depth=model_cfg.get("linear_transformer_depth", 0),
            freqs_per_bands=tuple(model_cfg["freqs_per_bands"]),
            dim_head=model_cfg.get("dim_head", 64),
            heads=model_cfg.get("heads", 8),
            attn_dropout=model_cfg.get("attn_dropout", 0.1),
            ff_dropout=model_cfg.get("ff_dropout", 0.1),
            flash_attn=use_flash,
            dim_freqs_in=model_cfg.get("dim_freqs_in", 1025),
            stft_n_fft=model_cfg.get("stft_n_fft", 2048),
            stft_hop_length=model_cfg.get("stft_hop_length", 512),
            stft_win_length=model_cfg.get("stft_win_length", 2048),
            stft_normalized=model_cfg.get("stft_normalized", False),
            mask_estimator_depth=model_cfg.get("mask_estimator_depth", 2),
            multi_stft_resolution_loss_weight=model_cfg.get("multi_stft_resolution_loss_weight", 1.0),
            multi_stft_resolutions_window_sizes=tuple(
                model_cfg.get("multi_stft_resolutions_window_sizes", (4096, 2048, 1024, 512, 256))
            ),
            multi_stft_hop_size=model_cfg.get("multi_stft_hop_size", 147),
            multi_stft_normalized=model_cfg.get("multi_stft_normalized", False),
            mlp_expansion_factor=model_cfg.get("mlp_expansion_factor", 4),
            use_torch_checkpoint=model_cfg.get("use_torch_checkpoint", False),
            skip_connection=model_cfg.get("skip_connection", False),
        )

        # Load checkpoint weights. Older torch versions do not accept the
        # weights_only keyword and raise TypeError, hence the fallback.
        ckpt_path = os.path.join(MODEL_DIR, MODEL_FILENAME)
        try:
            state_dict = torch.load(ckpt_path, map_location=DEVICE, weights_only=True)
        except TypeError:
            state_dict = torch.load(ckpt_path, map_location=DEVICE)

        # Some checkpoints wrap the weights under a "state_dict" or "model" key.
        if isinstance(state_dict, dict) and "state_dict" in state_dict:
            state_dict = state_dict["state_dict"]
        elif isinstance(state_dict, dict) and "model" in state_dict:
            state_dict = state_dict["model"]

        self.model.load_state_dict(state_dict)
        self.model.to(DEVICE)
        self.model.eval()

        logger.info(f"BS-RoFormer model loaded successfully on {DEVICE}")
        self._model_loaded = True

    def _process_audio(self, audio_tensor: torch.Tensor, progress_callback) -> torch.Tensor:
        """Run inference with chunking and overlap-add.

        Args:
            audio_tensor: Input audio shaped ``(channels, samples)``.
            progress_callback: Called as ``progress_callback("separating", p)``
                after every chunk, with ``p`` rising from 0.2 to 0.9.

        Returns:
            A CPU tensor shaped ``(stems, channels, samples)``; the stem count
            is whatever the model emits.
        """
        chunk_size = self.chunk_size
        # Clamp so a zero/negative overlap or an overlap larger than the chunk
        # cannot produce a division by zero or a zero-length step.
        step = max(1, chunk_size // max(1, self.num_overlap))

        channels, total_samples = audio_tensor.shape

        # Pad so the chunk grid covers the full audio: short inputs are padded
        # up to one chunk, longer ones up to the next multiple of `step`.
        pad_needed = max(0, chunk_size - total_samples)
        if total_samples > chunk_size:
            remainder = (total_samples - chunk_size) % step
            if remainder != 0:
                pad_needed = step - remainder

        if pad_needed > 0:
            audio_tensor = torch.nn.functional.pad(audio_tensor, (0, pad_needed))

        padded_len = audio_tensor.shape[1]

        # Move input to device
        audio_tensor = audio_tensor.to(DEVICE)

        # Hann window for smooth crossfading. Accumulation happens on CPU, so
        # the window is built there once instead of being copied per chunk.
        window = torch.hann_window(chunk_size)

        # Output accumulators (kept on CPU to save GPU memory). `result` is
        # allocated from the first model output so the stem count always
        # matches the loaded model rather than a hard-coded value.
        result = None
        weight = torch.zeros(padded_len)

        # Build chunk positions
        starts = range(0, padded_len - chunk_size + 1, step)
        total_chunks = len(starts)

        for i, start in enumerate(starts):
            end = start + chunk_size
            chunk = audio_tensor[:, start:end]

            with torch.no_grad():
                # BSRoformer: (batch, channels, time) -> (batch, stems, channels, time)
                output = self.model(chunk.unsqueeze(0))

            output_cpu = output.squeeze(0).cpu()  # (stems, channels, time)

            if result is None:
                result = torch.zeros(output_cpu.shape[0], channels, padded_len)

            result[:, :, start:end] += output_cpu * window
            weight[start:end] += window

            frac = (i + 1) / total_chunks
            progress_callback("separating", 0.2 + frac * 0.7)

        # Normalize by overlap weight; the clamp avoids dividing by the zero
        # found at the very start of the Hann window.
        result = result / weight.clamp(min=1e-8).unsqueeze(0).unsqueeze(0)

        # Remove padding
        return result[:, :, :total_samples]

    def separate(
        self,
        input_path: str,
        output_dir: str,
        stems: list[str],
        output_format: str,
        progress_callback: Callable[[str, float], None],
    ) -> dict[str, str]:
        """Separate an audio file and write the requested stems to disk.

        Args:
            input_path: Audio file readable by ``soundfile``.
            output_dir: Directory for the stem files; created if missing.
            stems: Canonical stem labels to export (values of STEM_NAME_MAP).
            output_format: File extension/format passed to ``_write_output``.
            progress_callback: Receives ``(stage, fraction)`` updates.

        Returns:
            Mapping of each exported stem label to its file name (relative to
            ``output_dir``).
        """
        progress_callback("loading_model", 0.05)
        self.load_model()

        progress_callback("separating", 0.15)

        # Load audio
        audio, sr = sf.read(input_path)
        if audio.ndim == 1:
            audio = np.stack([audio, audio], axis=1)  # Mono to stereo

        audio_tensor = torch.tensor(audio.T, dtype=torch.float32)  # (channels, samples)

        # Resample if needed
        if sr != self.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
            audio_tensor = resampler(audio_tensor)

        # Run inference
        separated = self._process_audio(audio_tensor, progress_callback)

        progress_callback("finalizing", 0.92)

        # Make sure the destination exists before any stem is written.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Save requested stems
        result: dict[str, str] = {}
        for i, stem_key in enumerate(STEM_ORDER):
            canonical = STEM_NAME_MAP[stem_key]
            if canonical not in stems:
                continue
            stem_audio = separated[i].numpy().T  # (samples, channels)
            # Limit samples to [-1, 1] so the encoders do not overflow
            stem_audio = np.clip(stem_audio, -1.0, 1.0)
            clean_name = f"{canonical}.{output_format}"
            out_path = os.path.join(output_dir, clean_name)
            self._write_output(out_path, stem_audio, output_format)
            result[canonical] = clean_name

        progress_callback("done", 1.0)
        return result

    def _write_output(self, output_path: str, stem_audio: np.ndarray, output_format: str):
        """Encode one stem to ``output_path``.

        ``"wav"`` is written as 32-bit float via ``soundfile``. Every other
        format is converted to 16-bit PCM and exported through pydub:
        ``"mp3"`` as MP3, anything else as ADTS (AAC). A 192k bitrate is
        requested for ``"mp3"`` and ``"aac"``.

        Args:
            output_path: Destination file path.
            stem_audio: Float audio, ``(samples, channels)`` or 1-D mono.
            output_format: Requested output format name.
        """
        if output_format == "wav":
            sf.write(output_path, stem_audio, self.sample_rate, subtype="FLOAT")
            return

        # Clip here as well: out-of-range floats would wrap around when cast
        # to int16 if a caller forgot to limit them.
        pcm = (np.clip(stem_audio, -1.0, 1.0) * 32767.0).astype(np.int16)
        segment = AudioSegment(
            data=pcm.tobytes(),
            sample_width=2,
            frame_rate=self.sample_rate,
            channels=pcm.shape[1] if pcm.ndim > 1 else 1,
        )

        if output_format not in {"mp3", "aac"}:
            # The ADTS fallback is kept for backward compatibility, but is no
            # longer silent.
            logger.warning(
                "Unrecognised output format %r; falling back to ADTS export for %s",
                output_format,
                output_path,
            )

        export_format = "mp3" if output_format == "mp3" else "adts"
        export_kwargs = {"format": export_format}
        if output_format in {"mp3", "aac"}:
            export_kwargs["bitrate"] = "192k"
        segment.export(output_path, **export_kwargs)