# Snapshot of upstream revision 64ec292.
from typing import Dict, List, Optional
import pytorch_lightning as pl
import torch
import torchaudio as ta
from torch import nn
from .bandsplit import BandSplitModule
from .maskestim import OverlappingMaskEstimationModule
from .tfmodel import SeqBandModellingModule
from .utils import MusicalBandsplitSpecification
class BaseEndToEndModule(pl.LightningModule):
    """Minimal LightningModule root for the end-to-end separation models below.

    Exists purely as a common ancestor; it adds no state or behavior beyond
    ``pl.LightningModule`` itself.
    """

    def __init__(self) -> None:
        super().__init__()
class BaseBandit(BaseEndToEndModule):
    """Band-split source-separation backbone ("Bandit").

    Pipeline: STFT -> per-band embedding (``BandSplitModule``) ->
    time-frequency sequence modelling (``SeqBandModellingModule``).
    Subclasses must set ``self.stems`` and implement :meth:`separate`
    (which is expected to fill ``batch["estimates"]``).
    """

    def __init__(
        self,
        in_channels: int,
        fs: int,
        band_type: str = "musical",
        n_bands: int = 64,
        require_no_overlap: bool = False,
        require_no_gap: bool = True,
        normalize_channel_independently: bool = False,
        treat_channel_as_feature: bool = True,
        n_sqm_modules: int = 12,
        emb_dim: int = 128,
        rnn_dim: int = 256,
        bidirectional: bool = True,
        rnn_type: str = "LSTM",
        n_fft: int = 2048,
        win_length: Optional[int] = 2048,
        hop_length: int = 512,
        window_fn: str = "hann_window",
        wkwargs: Optional[Dict] = None,
        power: Optional[int] = None,
        center: bool = True,
        normalized: bool = True,
        pad_mode: str = "constant",
        onesided: bool = True,
    ):
        """Build the STFT pair, band-split module and TF model.

        Args:
            in_channels: number of audio channels fed to the band-split stage.
            fs: sample rate used to lay out the musical bands.
            band_type: only ``"musical"`` is supported.
            n_fft/win_length/hop_length/...: forwarded to torchaudio's
                ``Spectrogram`` / ``InverseSpectrogram``.
        """
        super().__init__()
        self.in_channels = in_channels
        # NOTE(review): method name keeps its historical misspelling
        # ("instantitate") so subclasses that override it keep working.
        self.instantitate_spectral(
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            window_fn=window_fn,
            wkwargs=wkwargs,
            power=power,
            normalized=normalized,
            center=center,
            pad_mode=pad_mode,
            onesided=onesided,
        )
        self.instantiate_bandsplit(
            in_channels=in_channels,
            band_type=band_type,
            n_bands=n_bands,
            require_no_overlap=require_no_overlap,
            require_no_gap=require_no_gap,
            normalize_channel_independently=normalize_channel_independently,
            treat_channel_as_feature=treat_channel_as_feature,
            emb_dim=emb_dim,
            n_fft=n_fft,
            fs=fs,
        )
        self.instantiate_tf_modelling(
            n_sqm_modules=n_sqm_modules,
            emb_dim=emb_dim,
            rnn_dim=rnn_dim,
            bidirectional=bidirectional,
            rnn_type=rnn_type,
        )

    def instantitate_spectral(
        self,
        n_fft: int = 2048,
        win_length: Optional[int] = 2048,
        hop_length: int = 512,
        window_fn: str = "hann_window",
        wkwargs: Optional[Dict] = None,
        power: Optional[int] = None,
        normalized: bool = True,
        center: bool = True,
        pad_mode: str = "constant",
        onesided: bool = True,
    ):
        """Create matching ``self.stft`` / ``self.istft`` transforms.

        ``power`` must be ``None`` so the STFT stays complex (the complex
        spectrogram is required for masking and for the inverse transform).
        """
        assert power is None
        # Resolve the window factory by name, e.g. "hann_window" -> torch.hann_window.
        window_fn = torch.__dict__[window_fn]
        self.stft = ta.transforms.Spectrogram(
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            pad_mode=pad_mode,
            pad=0,
            window_fn=window_fn,
            wkwargs=wkwargs,
            power=power,
            normalized=normalized,
            center=center,
            onesided=onesided,
        )
        self.istft = ta.transforms.InverseSpectrogram(
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            pad_mode=pad_mode,
            pad=0,
            window_fn=window_fn,
            wkwargs=wkwargs,
            normalized=normalized,
            center=center,
            onesided=onesided,
        )

    def instantiate_bandsplit(
        self,
        in_channels: int,
        band_type: str = "musical",
        n_bands: int = 64,
        require_no_overlap: bool = False,
        require_no_gap: bool = True,
        normalize_channel_independently: bool = False,
        treat_channel_as_feature: bool = True,
        emb_dim: int = 128,
        n_fft: int = 2048,
        fs: int = 44100,
    ):
        """Create ``self.band_specs`` and ``self.band_split``.

        Only the "musical" band layout is implemented.
        """
        assert band_type == "musical"
        self.band_specs = MusicalBandsplitSpecification(
            nfft=n_fft, fs=fs, n_bands=n_bands
        )
        self.band_split = BandSplitModule(
            in_channels=in_channels,
            band_specs=self.band_specs.get_band_specs(),
            require_no_overlap=require_no_overlap,
            require_no_gap=require_no_gap,
            normalize_channel_independently=normalize_channel_independently,
            treat_channel_as_feature=treat_channel_as_feature,
            emb_dim=emb_dim,
        )

    def instantiate_tf_modelling(
        self,
        n_sqm_modules: int = 12,
        emb_dim: int = 128,
        rnn_dim: int = 256,
        bidirectional: bool = True,
        rnn_type: str = "LSTM",
    ):
        """Create ``self.tf_model``, wrapped in (disabled) ``torch.compile``.

        ``torch.compile`` may be unavailable on older torch builds, hence the
        best-effort fallback to the bare module.
        """
        try:
            self.tf_model = torch.compile(
                SeqBandModellingModule(
                    n_modules=n_sqm_modules,
                    emb_dim=emb_dim,
                    rnn_dim=rnn_dim,
                    bidirectional=bidirectional,
                    rnn_type=rnn_type,
                ),
                disable=True,
            )
        except Exception:
            self.tf_model = SeqBandModellingModule(
                n_modules=n_sqm_modules,
                emb_dim=emb_dim,
                rnn_dim=rnn_dim,
                bidirectional=bidirectional,
                rnn_type=rnn_type,
            )

    def mask(self, x, m):
        """Apply (complex) mask ``m`` to spectrogram ``x`` elementwise."""
        return x * m

    def forward(self, batch, mode="train"):
        """Separate ``batch`` into per-stem estimates.

        ``batch`` is either a waveform tensor — assumed (batch, channels,
        time); TODO confirm against callers — or an already-built batch dict
        containing ``batch["mixture"]["audio"]``.  For tensor input each
        channel is processed independently as mono and the result is
        re-stacked to (batch, n_stems, channels, time); for dict input the
        dict returned by :meth:`separate` is passed through.
        """
        # BUG FIX: ``init_shape`` was previously read before the isinstance
        # check, so dict input crashed on ``dict.shape``.  It is now computed
        # only on the tensor path (the only path that uses it).
        tensor_input = not isinstance(batch, dict)
        if tensor_input:
            # Model takes mono input; process each channel independently.
            init_shape = batch.shape
            mono = batch.view(-1, 1, batch.shape[-1])
            batch = {"mixture": {"audio": mono}}

        with torch.no_grad():
            mixture = batch["mixture"]["audio"]
            batch["mixture"]["spectrogram"] = self.stft(mixture)
            if "sources" in batch.keys():
                for stem in batch["sources"].keys():
                    s = batch["sources"][stem]["audio"]
                    batch["sources"][stem]["spectrogram"] = self.stft(s)

        batch = self.separate(batch)

        if tensor_input:
            # Undo the mono view (restore the original channel dim) and return
            # one stacked tensor instead of per-stem dict entries.
            # ``self.stems`` is provided by the concrete subclass.
            estimates = [
                batch["estimates"][stem]["audio"].view(
                    -1, init_shape[1], init_shape[2]
                )
                for stem in self.stems
            ]
            batch = torch.stack(estimates, dim=1)
        return batch

    def encode(self, batch):
        """Embed the mixture spectrogram; return (spec, embedding, n_samples)."""
        x = batch["mixture"]["spectrogram"]
        length = batch["mixture"]["audio"].shape[-1]
        z = self.band_split(x)  # (batch, emb_dim, n_band, n_time)
        q = self.tf_model(z)  # (batch, emb_dim, n_band, n_time)
        return x, q, length

    def separate(self, batch):
        """Fill ``batch["estimates"]``; implemented by subclasses."""
        raise NotImplementedError
class Bandit(BaseBandit):
    """Concrete Bandit separator: one overlapping mask estimator per stem.

    Extends :class:`BaseBandit` with per-stem mask estimation and implements
    :meth:`separate`.
    """

    def __init__(
        self,
        in_channels: int,
        stems: List[str],
        band_type: str = "musical",
        n_bands: int = 64,
        require_no_overlap: bool = False,
        require_no_gap: bool = True,
        normalize_channel_independently: bool = False,
        treat_channel_as_feature: bool = True,
        n_sqm_modules: int = 12,
        emb_dim: int = 128,
        rnn_dim: int = 256,
        bidirectional: bool = True,
        rnn_type: str = "LSTM",
        mlp_dim: int = 512,
        hidden_activation: str = "Tanh",
        hidden_activation_kwargs: Dict | None = None,
        complex_mask: bool = True,
        use_freq_weights: bool = True,
        n_fft: int = 2048,
        win_length: int | None = 2048,
        hop_length: int = 512,
        window_fn: str = "hann_window",
        wkwargs: Dict | None = None,
        power: int | None = None,
        center: bool = True,
        normalized: bool = True,
        pad_mode: str = "constant",
        onesided: bool = True,
        fs: int = 44100,
        stft_precisions="32",
        bandsplit_precisions="bf16",
        tf_model_precisions="bf16",
        mask_estim_precisions="bf16",
    ):
        """Build the backbone, record the stem names, and attach one mask
        estimator per stem.

        The ``*_precisions`` arguments are accepted but not used here;
        presumably consumed by external config/tooling — verify against
        callers before removing.
        """
        super().__init__(
            in_channels=in_channels,
            band_type=band_type,
            n_bands=n_bands,
            require_no_overlap=require_no_overlap,
            require_no_gap=require_no_gap,
            normalize_channel_independently=normalize_channel_independently,
            treat_channel_as_feature=treat_channel_as_feature,
            n_sqm_modules=n_sqm_modules,
            emb_dim=emb_dim,
            rnn_dim=rnn_dim,
            bidirectional=bidirectional,
            rnn_type=rnn_type,
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            window_fn=window_fn,
            wkwargs=wkwargs,
            power=power,
            center=center,
            normalized=normalized,
            pad_mode=pad_mode,
            onesided=onesided,
            fs=fs,
        )
        self.stems = stems
        self.instantiate_mask_estim(
            in_channels=in_channels,
            stems=stems,
            emb_dim=emb_dim,
            mlp_dim=mlp_dim,
            hidden_activation=hidden_activation,
            hidden_activation_kwargs=hidden_activation_kwargs,
            complex_mask=complex_mask,
            n_freq=n_fft // 2 + 1,
            use_freq_weights=use_freq_weights,
        )

    def instantiate_mask_estim(
        self,
        in_channels: int,
        stems: List[str],
        emb_dim: int,
        mlp_dim: int,
        hidden_activation: str,
        hidden_activation_kwargs: Optional[Dict] = None,
        complex_mask: bool = True,
        n_freq: Optional[int] = None,
        use_freq_weights: bool = False,
    ):
        """Create ``self.mask_estim``: one estimator per stem, sharing the
        band layout computed by the backbone."""
        if hidden_activation_kwargs is None:
            hidden_activation_kwargs = {}
        assert n_freq is not None
        estimators = {}
        for stem in stems:
            estimators[stem] = OverlappingMaskEstimationModule(
                band_specs=self.band_specs.get_band_specs(),
                freq_weights=self.band_specs.get_freq_weights(),
                n_freq=n_freq,
                emb_dim=emb_dim,
                mlp_dim=mlp_dim,
                in_channels=in_channels,
                hidden_activation=hidden_activation,
                hidden_activation_kwargs=hidden_activation_kwargs,
                complex_mask=complex_mask,
                use_freq_weights=use_freq_weights,
            )
        self.mask_estim = nn.ModuleDict(estimators)

    def separate(self, batch):
        """Estimate a mask per stem, apply it, and decode back to audio.

        Fills ``batch["estimates"][stem]`` with both the masked spectrogram
        and its inverse STFT.
        """
        batch["estimates"] = {}
        spec, embedding, n_samples = self.encode(batch)
        for name, estimator in self.mask_estim.items():
            mask = estimator(embedding)
            masked = self.mask(spec, mask.to(spec.dtype))
            masked = torch.reshape(masked, spec.shape)
            batch["estimates"][name] = {
                "audio": self.istft(masked, n_samples),
                "spectrogram": masked,
            }
        return batch