ClearSep / model /CLAPSep.py

Tianhao Wang

first commit

dbbd709 12 months ago

10.9 kB

	#!/usr/bin/env python
	# -- coding: UTF-8 --
	'''
	@Project ：Waveformer-main
	@File ：CLAPSep.py
	@IDE ：PyCharm
	@Author ：Aisaka/Hao Ma @SDU
	@Date ：2024/2/28 下午1:12
	'''

	import torch
	import laion_clap
	from torchmetrics.audio.snr import(
	scale_invariant_signal_noise_ratio as si_snr,
	signal_noise_ratio as snr)
	from torchmetrics.audio.sdr import(
	signal_distortion_ratio as sdr,
	scale_invariant_signal_distortion_ratio as si_sdr)
	import copy
	import loralib as lora
	from torchlibrosa import ISTFT, STFT, SpecAugmentation
	from torchlibrosa.stft import magphase
	import librosa
	import pytorch_lightning as pl


	def loss_fn(pred, tgt):
	return -0.9 * snr(pred, tgt).mean() - 0.1 * si_snr(pred, tgt).mean()


	def set_module(model, submodule_key, module):
	tokens = submodule_key.split('.')
	sub_tokens = tokens[:-1]
	cur_mod = model
	for s in sub_tokens:
	cur_mod = getattr(cur_mod, s)
	setattr(cur_mod, tokens[-1], module)


	def process_model(model, rank):
	for n, module in model.named_modules():
	if 'WindowAttention' in str(type(module)):
	for n_, layer in module.named_modules():
	if isinstance(layer, torch.nn.Linear):
	lora_layer = lora.Linear(layer.in_features, layer.out_features, r=rank,
	bias=hasattr(layer, 'bias'), merge_weights=False)
	lora_layer.weight = layer.weight
	if hasattr(layer, 'bias'):
	lora_layer.bias = layer.bias
	set_module(model, n+'.'+n_, lora_layer)
	return model


	class LightningModule(pl.LightningModule):
	def __init__(self, clap_model, decoder_model, lr, use_lora=False, rank=8, nfft=1024):
	super().__init__()
	self.phase = decoder_model.phase
	self.lr = lr
	self.clap_model = clap_model
	for p in self.clap_model.parameters():
	p.requires_grad = False
	self.audio_branch = copy.deepcopy(self.clap_model.model.audio_branch)
	if use_lora:
	process_model(self.audio_branch, rank)
	lora.mark_only_lora_as_trainable(self.audio_branch, bias='lora_only')

	self.decoder_model = decoder_model
	self.stft = STFT(n_fft=nfft, hop_length=320,
	win_length=nfft, window='hann', center=True, pad_mode='reflect',
	freeze_parameters=True)
	self.istft = ISTFT(n_fft=nfft, hop_length=320,
	win_length=nfft, window='hann', center=True, pad_mode='reflect',
	freeze_parameters=True)
	self.features = self.install_forward_hooks()

	def training_step(self, batch, batch_idx):
	self.clap_model.eval()
	self.audio_branch.eval()
	# print([len(x) for x in batch])
	mixed, mixed_resample, pos_cap, neg_cap, gt, pos_sample, neg_sample = batch
	real, imag = self.stft(mixed)
	mag, cos, sin = magphase(real, imag)
	with torch.no_grad():
	a = torch.rand((1,)).type_as(gt)
	embed_pos_a, embed_neg_a = torch.chunk(
	self.clap_model.get_audio_embedding_from_data(torch.concat([pos_sample, neg_sample], dim=0),
	use_tensor=True), dim=0, chunks=2)
	embed_pos_t, embed_neg_t = torch.chunk(
	self.clap_model.get_text_embedding(pos_cap + neg_cap, use_tensor=True), dim=0, chunks=2)
	embed_pos = a * embed_pos_a + (1 - a) * embed_pos_t
	embed_neg = a * embed_neg_a + (1 - a) * embed_neg_t
	del self.features[:]
	self.features.append(mag)
	self.audio_branch({"waveform": mixed_resample})
	a = torch.rand((1,))
	if a < 0.25:
	loss = self.cal_loss(embed_pos, torch.zeros_like(embed_pos), mag, cos, sin, length=mixed.size(-1), gt=gt)
	elif a < 0.5:
	loss = self.cal_loss(torch.zeros_like(embed_neg), embed_neg, mag, cos, sin, length=mixed.size(-1), gt=gt)
	else:
	loss = self.cal_loss(embed_pos, embed_neg, mag, cos, sin, length=mixed.size(-1), gt=gt)
	self.log("train_loss", loss.item(), on_epoch=True, prog_bar=True, sync_dist=True, batch_size=len(mixed))
	del self.features[:]
	return loss

	def cal_loss(self, embed_p, embed_n, mag, cos, sin, length, gt):
	embed = torch.nn.functional.normalize(torch.concat([embed_p, embed_n], dim=-1), dim=-1)
	mask = self.decoder_model(hidden_state=self.features[-1], skip_features=self.features[:-1], embed=embed)
	pred = self.wav_reconstruct(mask, mag, cos, sin, length=length)
	return loss_fn(pred, gt)

	def wav_reconstruct(self, mask, mag_x, cos_x, sin_x, length):
	# ref: https://github.com/Audio-AGI/AudioSep/blob/main/models/resunet.py
	# Y = \|Y\|cos∠Y + j\|Y\|sin∠Y
	# = \|Y\|cos(∠X + ∠M) + j\|Y\|sin(∠X + ∠M)
	# = \|Y\|(cos∠X cos∠M - sin∠X sin∠M) + j\|Y\|(sin∠X cos∠M + cos∠X sin∠M)
	if self.phase:
	mag_y = torch.nn.functional.relu_(mag_x * mask[0])
	_, mask_cos, mask_sin = magphase(mask[1], mask[2])
	cos_y = cos_x * mask_cos - sin_x * mask_sin
	sin_y = sin_x * mask_cos + cos_x * mask_sin
	else:
	mag_y = torch.nn.functional.relu_(mag_x * mask)
	cos_y = cos_x
	sin_y = sin_x
	pred = self.istft(mag_y * cos_y, mag_y * sin_y, length=length)
	return pred

	def validation_step(self, batch, batch_idx):
	mixed, mixed_resample, label, neg_label, gt, _, _ = batch
	real, imag = self.stft(mixed)
	mag, cos, sin = magphase(real, imag)
	self.features.append(mag)
	with torch.no_grad():
	embed_pos = self.clap_model.get_text_embedding(label, use_tensor=True)
	embed_neg = self.clap_model.get_text_embedding(neg_label, use_tensor=True)
	embed = torch.concat([embed_pos, embed_neg], dim=-1)
	self.audio_branch({"waveform": mixed_resample})
	mask = self.decoder_model(hidden_state=self.features[-1], skip_features=self.features[:-1], embed=embed)
	pred = self.wav_reconstruct(mask, mag, cos, sin, length=mixed.size(-1))
	loss = si_snr(pred, gt).mean() - si_snr(mixed, gt).mean()
	del self.features[:]
	self.log("val_loss", loss, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=len(mixed))
	return {"val_loss": loss}

	def on_test_start(self) -> None:
	self.sdr_vals = torch.tensor([])
	self.sdri_vals = torch.tensor([])
	self.sisdr_vals = torch.tensor([])
	self.sisdri_vals = torch.tensor([])

	def test_step(self, batch, batch_idx):
	mixed, mixed_resample, label, neg_label, gt = batch
	real, imag = self.stft(mixed)
	mag, cos, sin = magphase(real, imag)
	with torch.no_grad():
	embed_pos_bached, embed_neg_bached = torch.chunk(self.clap_model.get_text_embedding(label + neg_label, use_tensor=True), chunks=2, dim=0)
	del self.features[:]
	# only positive
	# embed = torch.concat([embed_pos_bached, torch.zeros_like(embed_neg_bached)], dim=1)
	# only negative
	# embed = torch.concat([torch.zeros_like(embed_pos_bached), embed_neg_bached], dim=1)
	# positive and negative
	embed = torch.concat([embed_pos_bached, embed_neg_bached], dim=1)
	self.features.append(mag)
	self.audio_branch({"waveform": mixed_resample})
	mask = self.decoder_model(hidden_state=self.features[-1], skip_features=self.features[:-1], embed=embed)
	pred = self.wav_reconstruct(mask, mag, cos, sin, length=mixed.size(-1))
	sisdr = si_sdr(pred, gt).cpu()
	self.sisdr_vals = torch.concat([self.sisdr_vals, sisdr])
	self.sisdri_vals = torch.concat([self.sisdri_vals, sisdr - si_sdr(mixed, gt).cpu()])
	sdr_ = sdr(pred, gt).cpu()
	self.sdr_vals = torch.concat([self.sdr_vals, sdr_])
	self.sdri_vals = torch.concat([self.sdri_vals, sdr_ - sdr(mixed, gt).cpu()])
	del self.features[:]

	def on_test_end(self) -> None:
	print(f"SDR-mean: {torch.mean(self.sdr_vals).cpu().numpy():.4f}, SDR-std: {torch.std(self.sdr_vals).cpu().numpy():.4f}")
	print(f"SDRi-mean: {torch.mean(self.sdri_vals).cpu().numpy():.4f}, SDRi-std: {torch.std(self.sdri_vals).cpu().numpy():.4f}")
	print(f"SISDR-mean: {torch.mean(self.sisdr_vals).cpu().numpy():.4f}, SISDR-std: {torch.std(self.sisdr_vals).cpu().numpy():.4f}")
	print(f"SISDRi-mean: {torch.mean(self.sisdri_vals).cpu().numpy():.4f}, SISDRi-std: {torch.std(self.sisdri_vals).cpu().numpy():.4f}")

	def configure_optimizers(self):
	optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
	schedular = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.3, patience=5,
	verbose=True, min_lr=5e-6)
	return {
	"optimizer": optimizer,
	"lr_scheduler": {
	"scheduler": schedular,
	"interval": "epoch",
	"monitor": "val_loss"
	},
	}

	def install_forward_hooks(self):
	features = []
	spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
	freq_drop_width=8, freq_stripes_num=2)

	def get_features_list(_, __, output):
	features.append(output)

	def get_features_list_basic_layer(_, __, output):
	features.append(output[0])

	def spec_augmentation_hook(_, __, out):
	out = out.transpose(1, 3)
	out = spec_augmenter(out)
	return out.transpose(1, 3)

	def spectrogram_padding(_, __, out):
	return torch.nn.functional.pad(out, (0, 0, 0, 1024 - out.size(2)))

	self.clap_model.model.audio_branch.bn0.register_forward_hook(spec_augmentation_hook)
	self.audio_branch.spectrogram_extractor.register_forward_hook(spectrogram_padding)
	self.audio_branch.patch_embed.register_forward_hook(get_features_list)
	for module in self.audio_branch.layers:
	module.register_forward_hook(get_features_list_basic_layer)
	return features

	# # this will only save tuned parameters during training
	# def on_save_checkpoint(self, checkpoint):
	# weights = checkpoint['state_dict']
	# new_dict = {}
	# for k, v in weights.items():
	# if any(e in k for e in ['lora', 'attn.qkv.bias', 'attn.proj.bias', 'decoder_model']):
	# new_dict[k] = v
	# checkpoint['state_dict'] = new_dict