stem-separator / backend /separator.py
sourav-das's picture
Upload folder using huggingface_hub
7dfae77 verified
import logging
import os
from typing import Callable
import numpy as np
import soundfile as sf
import torch
import torchaudio
import yaml
from pydub import AudioSegment
from huggingface_hub import hf_hub_download
from audio_separator.separator.uvr_lib_v5.roformer.bs_roformer import BSRoformer
logger = logging.getLogger(__name__)

# Prefer CUDA when available; the model and all tensors follow DEVICE.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {DEVICE}")

if DEVICE.type == "cpu":
    # CPU-only host: use every core for intra-op math and half for
    # inter-op parallelism. Must run before torch does any parallel work.
    cpu_count = os.cpu_count() or 1
    torch.set_num_threads(cpu_count)
    torch.set_num_interop_threads(max(1, cpu_count // 2))
    logger.info(f"CPU mode: set torch threads={cpu_count}, interop={max(1, cpu_count // 2)}")
# Hugging Face Hub location of the 6-stem BS-RoFormer checkpoint and its
# training config (both files are downloaded on first model load).
MODEL_REPO = "jarredou/BS-ROFO-SW-Fixed"
MODEL_FILENAME = "BS-Rofo-SW-Fixed.ckpt"
MODEL_CONFIG = "BS-Rofo-SW-Fixed.yaml"
# Local cache directory for the downloaded files (ephemeral on most hosts).
MODEL_DIR = "/tmp/models"

# Stem order matches the model's training config
STEM_ORDER = ["bass", "drums", "other", "vocals", "guitar", "piano"]
# Maps the internal stem keys above to the canonical display names that
# callers pass in `stems` and that are used for output filenames.
STEM_NAME_MAP = {
    "bass": "Bass",
    "drums": "Drums",
    "other": "Other",
    "vocals": "Vocals",
    "guitar": "Guitar",
    "piano": "Piano",
}
class StemSeparatorService:
    """Singleton service that separates a mix into stems with BS-RoFormer.

    The checkpoint and YAML config are fetched from the Hugging Face Hub on
    first use and cached under MODEL_DIR. Inference is chunked with a
    Hann-windowed overlap-add so arbitrarily long inputs fit in memory.
    """

    _instance = None
    _model_loaded = False

    def __new__(cls):
        # Classic singleton: every instantiation returns the same object,
        # so the (large) model is only ever loaded once per process.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def load_model(self):
        """Download (if needed), build, and load the BS-RoFormer model.

        Idempotent: returns immediately once the model is resident.
        Populates ``self.model``, ``self.sample_rate``, ``self.chunk_size``,
        ``self.num_overlap`` and ``self.num_stems``.
        """
        if self._model_loaded:
            return
        os.makedirs(MODEL_DIR, exist_ok=True)
        logger.info(f"Downloading model from HF Hub: {MODEL_REPO}")
        hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME, local_dir=MODEL_DIR)
        hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_CONFIG, local_dir=MODEL_DIR)

        # Parse the training config that ships with the checkpoint.
        # NOTE(review): FullLoader is kept (not safe_load) because BS-RoFormer
        # configs may carry non-plain YAML tags — confirm before tightening.
        with open(os.path.join(MODEL_DIR, MODEL_CONFIG)) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        model_cfg = config["model"]
        audio_cfg = config.get("audio", {})
        inference_cfg = config.get("inference", {})
        self.sample_rate = audio_cfg.get("sample_rate", 44100)
        self.chunk_size = audio_cfg.get("chunk_size", 588800)
        self.num_overlap = inference_cfg.get("num_overlap", 2)
        # Keep the stem count from the config so inference code does not
        # hard-code it (previously fixed at 6 inside _process_audio).
        self.num_stems = model_cfg.get("num_stems", 6)

        # Flash-attention kernels are only supported on CUDA.
        use_flash = DEVICE.type == "cuda"

        # Create model directly — bypass audio-separator's Separator wrapper entirely
        self.model = BSRoformer(
            dim=model_cfg["dim"],
            depth=model_cfg["depth"],
            stereo=model_cfg.get("stereo", True),
            num_stems=self.num_stems,
            time_transformer_depth=model_cfg.get("time_transformer_depth", 1),
            freq_transformer_depth=model_cfg.get("freq_transformer_depth", 1),
            linear_transformer_depth=model_cfg.get("linear_transformer_depth", 0),
            freqs_per_bands=tuple(model_cfg["freqs_per_bands"]),
            dim_head=model_cfg.get("dim_head", 64),
            heads=model_cfg.get("heads", 8),
            attn_dropout=model_cfg.get("attn_dropout", 0.1),
            ff_dropout=model_cfg.get("ff_dropout", 0.1),
            flash_attn=use_flash,
            dim_freqs_in=model_cfg.get("dim_freqs_in", 1025),
            stft_n_fft=model_cfg.get("stft_n_fft", 2048),
            stft_hop_length=model_cfg.get("stft_hop_length", 512),
            stft_win_length=model_cfg.get("stft_win_length", 2048),
            stft_normalized=model_cfg.get("stft_normalized", False),
            mask_estimator_depth=model_cfg.get("mask_estimator_depth", 2),
            multi_stft_resolution_loss_weight=model_cfg.get("multi_stft_resolution_loss_weight", 1.0),
            multi_stft_resolutions_window_sizes=tuple(
                model_cfg.get("multi_stft_resolutions_window_sizes", (4096, 2048, 1024, 512, 256))
            ),
            multi_stft_hop_size=model_cfg.get("multi_stft_hop_size", 147),
            multi_stft_normalized=model_cfg.get("multi_stft_normalized", False),
            mlp_expansion_factor=model_cfg.get("mlp_expansion_factor", 4),
            use_torch_checkpoint=model_cfg.get("use_torch_checkpoint", False),
            skip_connection=model_cfg.get("skip_connection", False),
        )

        # Load checkpoint weights. weights_only=True avoids arbitrary-object
        # unpickling; fall back for torch versions predating the kwarg.
        ckpt_path = os.path.join(MODEL_DIR, MODEL_FILENAME)
        try:
            state_dict = torch.load(ckpt_path, map_location=DEVICE, weights_only=True)
        except TypeError:
            state_dict = torch.load(ckpt_path, map_location=DEVICE)
        # Unwrap common checkpoint containers.
        if isinstance(state_dict, dict) and "state_dict" in state_dict:
            state_dict = state_dict["state_dict"]
        elif isinstance(state_dict, dict) and "model" in state_dict:
            state_dict = state_dict["model"]
        self.model.load_state_dict(state_dict)
        self.model.to(DEVICE)
        self.model.eval()
        logger.info(f"BS-RoFormer model loaded successfully on {DEVICE}")
        self._model_loaded = True

    def _process_audio(self, audio_tensor: torch.Tensor, progress_callback) -> torch.Tensor:
        """Run inference with chunking and overlap-add.

        Args:
            audio_tensor: (channels, samples) float tensor at model rate.
            progress_callback: called as ``(stage, fraction)`` per chunk.

        Returns:
            (num_stems, channels, samples) tensor on CPU, trimmed to the
            original length.
        """
        chunk_size = self.chunk_size
        step = chunk_size // self.num_overlap
        channels, total_samples = audio_tensor.shape
        # Stem count comes from the model config (set in load_model);
        # default kept for safety if load_model predates the attribute.
        num_stems = getattr(self, "num_stems", 6)

        # Pad so the chunk grid covers the full audio exactly.
        pad_needed = max(0, chunk_size - total_samples)
        if total_samples > chunk_size:
            remainder = (total_samples - chunk_size) % step
            if remainder != 0:
                pad_needed = step - remainder
        if pad_needed > 0:
            audio_tensor = torch.nn.functional.pad(audio_tensor, (0, pad_needed))
        padded_len = audio_tensor.shape[1]

        # Move input to device
        audio_tensor = audio_tensor.to(DEVICE)

        # Output accumulators (keep on CPU to save GPU memory)
        result = torch.zeros(num_stems, channels, padded_len)
        weight = torch.zeros(padded_len)

        # Hann window for smooth crossfading between overlapping chunks.
        window = torch.hann_window(chunk_size, device=DEVICE)
        # Hoisted out of the loop: the CPU copy of the window is invariant.
        window_cpu = window.cpu()

        # Build chunk positions
        starts = list(range(0, padded_len - chunk_size + 1, step))
        total_chunks = len(starts)

        for i, start in enumerate(starts):
            chunk = audio_tensor[:, start : start + chunk_size]
            with torch.no_grad():
                # BSRoformer: (batch, channels, time) -> (batch, stems, channels, time)
                output = self.model(chunk.unsqueeze(0))
            output = output.squeeze(0)  # (stems, channels, time)
            # Accumulate on CPU (windowed overlap-add).
            output_cpu = output.cpu()
            result[:, :, start : start + chunk_size] += output_cpu * window_cpu
            weight[start : start + chunk_size] += window_cpu
            frac = (i + 1) / total_chunks
            progress_callback("separating", 0.2 + frac * 0.7)

        # Normalize by overlap weight (clamp guards the zero-weight edges
        # where the Hann window vanishes).
        result = result / weight.clamp(min=1e-8).unsqueeze(0).unsqueeze(0)
        # Remove padding
        return result[:, :, :total_samples]

    def separate(
        self,
        input_path: str,
        output_dir: str,
        stems: list[str],
        output_format: str,
        progress_callback: Callable[[str, float], None],
    ) -> dict[str, str]:
        """Separate `input_path` and write the requested stems.

        Args:
            input_path: path to the input audio file.
            output_dir: directory the stem files are written into.
            stems: canonical stem names (values of STEM_NAME_MAP) to keep.
            output_format: "wav", "mp3", or an AAC-style extension.
            progress_callback: called as ``(stage, fraction in [0, 1])``.

        Returns:
            Mapping of canonical stem name -> output filename (relative
            to `output_dir`).
        """
        progress_callback("loading_model", 0.05)
        self.load_model()
        progress_callback("separating", 0.15)

        # Load audio
        audio, sr = sf.read(input_path)
        if audio.ndim == 1:
            audio = np.stack([audio, audio], axis=1)  # Mono to stereo
        audio_tensor = torch.tensor(audio.T, dtype=torch.float32)  # (channels, samples)

        # Resample if needed (output is written at the model's sample rate)
        if sr != self.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
            audio_tensor = resampler(audio_tensor)

        # Run inference
        separated = self._process_audio(audio_tensor, progress_callback)
        progress_callback("finalizing", 0.92)

        # Save requested stems
        result: dict[str, str] = {}
        for i, stem_key in enumerate(STEM_ORDER):
            canonical = STEM_NAME_MAP[stem_key]
            if canonical in stems:
                stem_audio = separated[i].numpy().T  # (samples, channels)
                # Clip to [-1, 1] so downstream integer quantization can't wrap.
                stem_audio = np.clip(stem_audio, -1.0, 1.0)
                clean_name = f"{canonical}.{output_format}"
                out_path = os.path.join(output_dir, clean_name)
                self._write_output(out_path, stem_audio, output_format)
                result[canonical] = clean_name

        progress_callback("done", 1.0)
        return result

    def _write_output(self, output_path: str, stem_audio: np.ndarray, output_format: str):
        """Write `stem_audio` (float samples in [-1, 1]) to `output_path`.

        WAV is written as 32-bit float; anything else is quantized to 16-bit
        PCM and encoded via pydub/ffmpeg ("mp3" or raw AAC/ADTS).
        """
        if output_format == "wav":
            sf.write(output_path, stem_audio, self.sample_rate, subtype="FLOAT")
            return
        # Defensive clip before int16 quantization: without it, out-of-range
        # floats would wrap around (astype truncates, it does not saturate).
        pcm = (np.clip(stem_audio, -1.0, 1.0) * 32767.0).astype(np.int16)
        segment = AudioSegment(
            data=pcm.tobytes(),
            sample_width=2,
            frame_rate=self.sample_rate,
            channels=pcm.shape[1] if pcm.ndim > 1 else 1,
        )
        # ffmpeg has no "aac" muxer name; raw AAC streams use the ADTS container.
        export_format = "mp3" if output_format == "mp3" else "adts"
        export_kwargs = {"format": export_format}
        if output_format in {"mp3", "aac"}:
            export_kwargs["bitrate"] = "192k"
        segment.export(output_path, **export_kwargs)