Spaces:

vincewin
/

stem_worker

Running on Zero

App Files Files Community

stem_worker / separate.py

vincewin

Deploy AI stem splitter

6af6d8d verified 6 days ago

Raw

History Blame Contribute Delete

14.3 kB

	"""
	Stem separation core (Demucs) with optional cascades.

	- base 4-stem / 6-stem separation (official Meta models, trusted)
	- drum split: the 'drums' stem -> kick / snare / toms / cymbals
	(community 'drumsep' Hybrid-Demucs model; loaded with the full unpickler,
	which the user explicitly authorized)
	- vocal split: the 'vocals' stem -> lead / backing (added on top of drums)

	Files are copied, never modified; analysis runs only on copies.
	"""
	from __future__ import annotations
	import os
	import re
	from typing import Iterable

	import numpy as np

	HERE = os.path.dirname(os.path.abspath(__file__))

	# ZeroGPU: @spaces.GPU marks the GPU entry point so HF attaches a GPU for that
	# call only. Off Spaces (local / CPU), `spaces` isn't installed -> no-op decorator.
	try:
	import spaces
	_gpu = spaces.GPU(duration=300)
	except Exception:
	def _gpu(fn):
	return fn

	# ---- base models (official Demucs) ----
	MODEL_STEMS = {
	"htdemucs": ["drums", "bass", "other", "vocals"],
	"htdemucs_ft": ["drums", "bass", "other", "vocals"],
	"htdemucs_6s": ["drums", "bass", "other", "vocals", "guitar", "piano"],
	"mdx_extra": ["drums", "bass", "other", "vocals"],
	}
	DEFAULT_MODEL = "htdemucs"

	# ---- drum-split (community 'drumsep' model) ----
	DRUM_REPO = os.path.join(HERE, "models")
	DRUM_MODEL = "49469ca8"
	DRUM_HF = "vincewin/drumsep" # mirror the Space downloads from
	DRUM_MAP = {"bombo": "kick", "redoblante": "snare", "platillos": "cymbals", "toms": "toms"}
	DRUM_ORDER = ["kick", "snare", "toms", "cymbals"]

	# ---- RoFormer engine (newer architecture, via the 'audio-separator' package) ----
	# BS-RoFormer vocals model: cleaner vocals/instrumental split than Demucs, 2 stems.
	ROFORMER_MODEL = "model_bs_roformer_ep_317_sdr_12.9755.ckpt"

	# Engines selectable in the app: label-friendly quality tiers.
	ENGINES = ("demucs", "demucs_ft", "roformer")

	# ---- modes offered in the app: how many stems and how to reach them ----
	MODES = {
	"4": {"base": "htdemucs", "drums": False, "vocals": False},
	"6": {"base": "htdemucs_6s", "drums": False, "vocals": False},
	"9": {"base": "htdemucs_6s", "drums": True, "vocals": False},
	}
	DEFAULT_MODE = "4"


	def available_models() -> list[str]:
	return list(MODEL_STEMS.keys())


	def list_stems(model_name: str = DEFAULT_MODEL) -> list[str]:
	return MODEL_STEMS.get(model_name, MODEL_STEMS[DEFAULT_MODEL])


	def mode_stems(mode: str) -> list[str]:
	"""Final stem names a mode produces (for the checkbox UI), in display order."""
	cfg = MODES[mode]
	out: list[str] = []
	for s in MODEL_STEMS[cfg["base"]]:
	if s == "drums" and cfg["drums"]:
	out += DRUM_ORDER
	elif s == "vocals" and cfg["vocals"]:
	out += ["lead vocal", "backing vocal"]
	else:
	out.append(s)
	return out


	def _safe(name: str) -> str:
	name = re.sub(r'[<>:"/\\\|?*\x00-\x1f]', "_", name).strip(" .")
	return name or "track"


	def _pick_device(device: str \| None) -> str:
	if device:
	return device
	try:
	import torch
	return "cuda" if torch.cuda.is_available() else "cpu"
	except Exception:
	return "cpu"


	# ---------------- model loading (cached) ----------------
	_CACHE: dict = {}
	_PATCHED = False


	def _full_unpickle():
	"""Allow torch.load to deserialize the community drumsep checkpoint.

	PyTorch >=2.6 defaults to weights_only=True, which refuses pickled model
	objects. The user explicitly authorized loading this third-party model.
	"""
	global _PATCHED
	if _PATCHED:
	return
	import torch
	_orig = torch.load
	torch.load = lambda a, k: _orig(a, {k, "weights_only": False})
	_PATCHED = True


	def _ensure_drum_repo() -> str:
	path = os.path.join(DRUM_REPO, DRUM_MODEL + ".th")
	if not os.path.exists(path):
	from huggingface_hub import hf_hub_download
	os.makedirs(DRUM_REPO, exist_ok=True)
	hf_hub_download(repo_id=DRUM_HF, filename=DRUM_MODEL + ".th", local_dir=DRUM_REPO)
	return DRUM_REPO


	def _load(name: str, repo: str \| None = None):
	key = (name, repo)
	if key not in _CACHE:
	_full_unpickle()
	from pathlib import Path
	from demucs.pretrained import get_model
	m = get_model(name=name, repo=Path(repo) if repo else None)
	m.eval()
	_CACHE[key] = m
	return _CACHE[key]


	def _read_audio(path: str, sr: int, ch: int):
	from demucs.audio import AudioFile
	return AudioFile(path).read(streams=0, samplerate=sr, channels=ch)


	def _run(model, wav, device, shifts=0, overlap=0.25):
	"""Normalize, run apply_model, de-normalize. wav: tensor [channels, samples].

	shifts > 0 enables Demucs's "shift trick" (test-time augmentation): the input
	is shifted a few times and the results averaged, which reduces artifacts at a
	roughly linear cost in time. overlap controls window blending at chunk edges.
	"""
	import torch
	from demucs.apply import apply_model
	ref = wav.mean(0)
	x = (wav - ref.mean()) / (ref.std() + 1e-8)
	with torch.no_grad():
	out = apply_model(model, x[None].to(device), shifts=int(shifts), split=True,
	overlap=overlap, progress=False, device=device)[0]
	return out * (ref.std() + 1e-8) + ref.mean()


	def _write(tensor, out_path: str, sr: int):
	import soundfile as sf
	sf.write(out_path, np.clip(tensor.cpu().numpy().T, -1.0, 1.0), sr, subtype="PCM_16")


	# ---------------- public API ----------------
	def separate(input_path, out_dir, model_name=DEFAULT_MODEL, stems=None,
	device=None, progress=None, shifts=0, overlap=0.25) -> list[str]:
	"""Base (single-model) separation. Used by the CLI and batch tool."""
	device = _pick_device(device)
	if progress:
	progress(0.05, f"Loading model '{model_name}' on {device}...")
	model = _load(model_name)
	sources = list(model.sources)
	wanted = [s for s in (list(stems) if stems else sources) if s in sources]
	if not wanted:
	raise ValueError(f"No valid stems for {model_name}. Choose from: {sources}")
	if progress:
	progress(0.1, "Reading audio...")
	wav = _read_audio(input_path, model.samplerate, model.audio_channels)
	if progress:
	progress(0.2, "Separating audio (slow on CPU)...")
	out = _run(model, wav, device, shifts=shifts, overlap=overlap)
	os.makedirs(out_dir, exist_ok=True)
	song = _safe(os.path.splitext(os.path.basename(input_path))[0])
	written = []
	for i, name in enumerate(sources):
	if name not in wanted:
	continue
	p = os.path.join(out_dir, f"{song} - {name}.wav")
	_write(out[i], p, model.samplerate)
	written.append(p)
	if progress:
	progress(1.0, "Done")
	return written


	def separate_mode(input_path, out_dir, mode=DEFAULT_MODE, stems=None,
	device=None, progress=None, base_override=None,
	shifts=0, overlap=0.25) -> list[str]:
	"""Cascaded separation by mode ('4', '6', '9'). Writes '<song> - <stem>.wav'.

	base_override swaps the mode's base model (e.g. the fine-tuned 'htdemucs_ft'
	for cleaner 4-stem output). shifts/overlap tune separation quality vs. speed.
	"""
	cfg = MODES[mode]
	device = _pick_device(device)
	if progress:
	progress(0.05, "Loading model...")
	base = _load(base_override or cfg["base"])
	sr = base.samplerate
	if progress:
	progress(0.1, "Reading audio...")
	wav = _read_audio(input_path, sr, base.audio_channels)
	if progress:
	progress(0.15, "Separating main stems (slow on CPU)...")
	out = _run(base, wav, device, shifts=shifts, overlap=overlap)
	result = {name: out[i] for i, name in enumerate(base.sources)}

	if cfg["drums"] and "drums" in result:
	if progress:
	progress(0.6, "Splitting drums into kick / snare / toms / cymbals...")
	dm = _load(DRUM_MODEL, repo=_ensure_drum_repo())
	parts = _run(dm, result.pop("drums"), device, shifts=shifts, overlap=overlap)
	for i, src in enumerate(dm.sources):
	result[DRUM_MAP.get(src, src)] = parts[i]

	os.makedirs(out_dir, exist_ok=True)
	song = _safe(os.path.splitext(os.path.basename(input_path))[0])
	order = mode_stems(mode)
	wanted = [s for s in (list(stems) if stems else order) if s in result]
	written = []
	for name in order:
	if name not in wanted:
	continue
	p = os.path.join(out_dir, f"{song} - {name}.wav")
	_write(result[name], p, sr)
	written.append(p)
	if progress:
	progress(1.0, "Done")
	return written


	def separate_roformer(input_path, out_dir, progress=None) -> list[str]:
	"""High-quality vocals/instrumental split using a BS-RoFormer model.

	Uses the 'audio-separator' package, which downloads the model on first use.
	Produces two stems; files are renamed to the '<song> - <stem>.wav' convention.
	"""
	from audio_separator.separator import Separator
	os.makedirs(out_dir, exist_ok=True)
	if progress:
	progress(0.1, "Loading RoFormer model (first run downloads it ~1 min)...")
	sep = Separator(output_dir=out_dir, output_format="WAV")
	sep.load_model(model_filename=ROFORMER_MODEL)
	if progress:
	progress(0.3, "Separating with RoFormer (slow on CPU)...")
	produced = sep.separate(input_path) # list of output file names (in out_dir)
	song = _safe(os.path.splitext(os.path.basename(input_path))[0])
	written = []
	for fn in produced:
	src = fn if os.path.isabs(fn) else os.path.join(out_dir, fn)
	low = os.path.basename(fn).lower()
	stem = "vocals" if "vocal" in low else ("instrumental" if "instrument" in low else _safe(os.path.splitext(os.path.basename(fn))[0]))
	dst = os.path.join(out_dir, f"{song} - {stem}.wav")
	if os.path.abspath(src) != os.path.abspath(dst):
	os.replace(src, dst)
	written.append(dst)
	if progress:
	progress(1.0, "Done")
	return written


	def run_separation(input_path, out_dir, engine="demucs", mode=DEFAULT_MODE,
	stems=None, device=None, progress=None, shifts=0, overlap=0.25) -> list[str]:
	"""Single entry point the app uses; dispatches on the chosen engine.

	- 'demucs' : the mode's standard model (4/6/9-stem), with optional shifts.
	- 'demucs_ft' : fine-tuned model for 4-stem (falls back to standard for 6/9),
	cleaner but ~4x slower.
	- 'roformer' : BS-RoFormer (newest architecture) — best vocals/instrumental,
	always 2 stems; mode/stem selection is ignored.

	shifts/overlap are quality knobs (higher = cleaner, slower).
	"""
	if engine == "roformer":
	return separate_roformer(input_path, out_dir, progress=progress)
	base_override = None
	if engine == "demucs_ft" and MODES[mode]["base"] == "htdemucs":
	base_override = "htdemucs_ft"
	return separate_mode(input_path, out_dir, mode, stems, device=device,
	progress=progress, base_override=base_override,
	shifts=shifts, overlap=overlap)


	@_gpu
	def gpu_separate(input_path, out_dir, engine="demucs", mode=DEFAULT_MODE,
	stems=None, shifts=0, overlap=0.25) -> list[str]:
	"""GPU entry point for ZeroGPU. Takes only picklable args (no progress callback,
	which can't cross the GPU process boundary). Device auto-selects cuda when present."""
	return run_separation(input_path, out_dir, engine=engine, mode=mode,
	stems=stems, device=None, shifts=shifts, overlap=overlap,
	progress=None)


	def merge_stems(paths, out_path) -> str:
	"""Mix several stem .wav files back into one track by summing their samples.

	Demucs stems are additive (they sum to the original), so combining a subset
	is just a sample-wise sum. Inputs are aligned on length/channels defensively
	in case the caller mixes files from different sources.
	"""
	import soundfile as sf
	paths = list(paths)
	if len(paths) < 2:
	raise ValueError("Select at least two stems to merge.")
	mix = None
	sr = None
	for p in paths:
	data, file_sr = sf.read(p, always_2d=True) # shape [samples, channels]
	data = data.astype(np.float64)
	if mix is None:
	mix, sr = data, file_sr
	continue
	if file_sr != sr:
	raise ValueError("Stems have different sample rates; cannot merge.")
	if data.shape[1] != mix.shape[1]: # mono vs stereo -> upmix mono
	wide = max(data.shape[1], mix.shape[1])
	if data.shape[1] == 1:
	data = np.repeat(data, wide, axis=1)
	if mix.shape[1] == 1:
	mix = np.repeat(mix, wide, axis=1)
	if data.shape[0] != mix.shape[0]: # trim to the shorter one
	n = min(data.shape[0], mix.shape[0])
	mix, data = mix[:n], data[:n]
	mix = mix + data
	os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
	sf.write(out_path, np.clip(mix, -1.0, 1.0), sr, subtype="PCM_16")
	return out_path


	def stem_of(path: str) -> str:
	"""Recover the stem label from a '<song> - <stem>.wav' filename."""
	base = os.path.splitext(os.path.basename(path))[0]
	return base.split(" - ")[-1] if " - " in base else base


	if __name__ == "__main__":
	import argparse
	ap = argparse.ArgumentParser(description="Separate an audio file into stems.")
	ap.add_argument("input")
	ap.add_argument("-o", "--out", default="stems_out")
	ap.add_argument("--mode", default=None, choices=list(MODES.keys()),
	help="4 / 6 / 9 stems (cascaded). Overrides --model.")
	ap.add_argument("-m", "--model", default=DEFAULT_MODEL, choices=available_models())
	ap.add_argument("-s", "--stems", nargs="*", default=None)
	ap.add_argument("--device", default=None)
	a = ap.parse_args()
	pr = lambda f, m: print(f"[{f*100:5.1f}%] {m}")
	if a.mode:
	paths = separate_mode(a.input, a.out, a.mode, a.stems, a.device, progress=pr)
	else:
	paths = separate(a.input, a.out, a.model, a.stems, a.device, progress=pr)
	print("\nWrote:")
	for p in paths:
	print(" ", p)