Spaces:

qgyd2021
/

cc_denoise

Paused

App Files Files Community

cc_denoise / toolbox /torchaudio /models /frcrn /modeling_frcrn.py

HoneyTian

first commit

1af34cd 10 months ago

raw

history blame contribute delete

12 kB

	#!/usr/bin/python3
	# -- coding: utf-8 --
	"""
	https://arxiv.org/abs/2206.07293

	https://github.com/modelscope/modelscope/blob/master/modelscope/models/audio/ans/frcrn.py
	https://huggingface.co/spaces/alibabasglab/ClearVoice/blob/main/models/frcrn_se/frcrn.py

	https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice/clearvoice/models/frcrn_se

	"""
	import os
	from typing import Optional, Union

	import torch
	import torch.nn as nn
	from torch.nn import functional as F

	from toolbox.torchaudio.configuration_utils import CONFIG_FILE
	from toolbox.torchaudio.models.frcrn.configuration_frcrn import FRCRNConfig
	from toolbox.torchaudio.models.frcrn.conv_stft import ConviSTFT, ConvSTFT
	from toolbox.torchaudio.models.frcrn.unet import UNet


	class FRCRN(nn.Module):
	""" Frequency Recurrent CRN """

	def __init__(self,
	use_complex_networks: bool = True,
	model_complexity: int = 45,
	model_depth: int = 14,
	padding_mode: str = "zeros",
	nfft: int = 640,
	win_size: int = 640,
	hop_size: int = 320,
	win_type: str = "hann",
	):
	"""
	:param use_complex_networks: bool, Whether to use complex networks.
	:param model_complexity: int, define the model complexity with the number of layers
	:param model_depth: int, Only two options are available : 10, 20
	:param padding_mode: str, Encoder's convolution filter. 'zeros', 'reflect'
	:param nfft: int, number of Short Time Fourier Transform (STFT) points
	:param win_size: int, length of window used for defining one frame of sample points
	:param hop_size: int, length of window shifting (equivalent to hop_size)
	:param win_type: str, windowing type used in STFT, eg. 'hanning', 'hamming'
	"""
	super().__init__()
	self.freq_bins = nfft // 2 + 1

	self.nfft = nfft
	self.win_size = win_size
	self.hop_size = hop_size
	self.win_type = win_type

	self.eps = 1e-8

	self.stft = ConvSTFT(
	nfft=self.nfft,
	win_size=self.win_size,
	hop_size=self.hop_size,
	win_type=self.win_type,
	feature_type="complex",
	requires_grad=False
	)
	self.istft = ConviSTFT(
	nfft=self.nfft,
	win_size=self.win_size,
	hop_size=self.hop_size,
	win_type=self.win_type,
	feature_type="complex",
	requires_grad=False
	)
	self.unet = UNet(
	in_channels=1,
	use_complex_networks=use_complex_networks,
	model_complexity=model_complexity,
	model_depth=model_depth,
	padding_mode=padding_mode
	)
	self.unet2 = UNet(
	in_channels=1,
	use_complex_networks=use_complex_networks,
	model_complexity=model_complexity,
	model_depth=model_depth,
	padding_mode=padding_mode
	)

	def forward(self, noisy: torch.Tensor):
	"""
	:param noisy: torch.Tensor, shape: [b, n_samples] or [b, c, n_samples]
	:return:
	"""
	if noisy.dim() == 2:
	noisy = torch.unsqueeze(noisy, dim=1)
	_, _, n_samples = noisy.shape
	remainder = (n_samples - self.win_size) % self.hop_size
	if remainder > 0:
	n_samples_pad = self.hop_size - remainder
	noisy = F.pad(noisy, pad=(0, n_samples_pad), mode="constant", value=0)

	# [batch_size, freq_bins * 2, num_samples]
	cmp_spec = self.stft.forward(noisy)
	# [batch_size, 1, freq_bins * 2, time_steps]
	# time_steps = (num_samples - win_size) / hop_size + 1
	cmp_spec = torch.unsqueeze(cmp_spec, 1)

	# [batch_size, 2, freq_bins, time_steps]
	cmp_spec = torch.cat([
	cmp_spec[:, :, :self.freq_bins, :],
	cmp_spec[:, :, self.freq_bins:, :],
	], dim=1)

	# [batch_size, 2, freq_bins, time_steps, 1]
	cmp_spec = torch.unsqueeze(cmp_spec, dim=4)

	cmp_spec = torch.transpose(cmp_spec, 1, 4)
	# [batch_size, 1, freq_bins, time_steps, 2]

	unet1_out = self.unet.forward(cmp_spec)
	cmp_mask1 = torch.tanh(unet1_out)
	unet2_out = self.unet2.forward(unet1_out)
	cmp_mask2 = torch.tanh(unet2_out)

	# est_spec, est_wav, est_mask = self.apply_mask(cmp_spec, cmp_mask1)

	cmp_mask2 = cmp_mask2 + cmp_mask1
	est_spec, est_wav, est_mask = self.apply_mask(cmp_spec, cmp_mask2)
	# est_wav shape: [b, n_samples]

	est_wav = est_wav[:, :n_samples]
	return est_spec, est_wav, est_mask

	def apply_mask(self,
	cmp_spec: torch.Tensor,
	cmp_mask: torch.Tensor,
	):
	"""
	:param cmp_spec: torch.Tensor, shape: [batch_size, 1, freq_bins, time_steps, 2]
	:param cmp_mask: torch.Tensor, shape: [batch_size, 1, freq_bins, time_steps, 2]
	:return:
	"""
	est_spec = torch.cat(
	tensors=[
	cmp_spec[..., 0] * cmp_mask[..., 0] - cmp_spec[..., 1] * cmp_mask[..., 1],
	cmp_spec[..., 0] * cmp_mask[..., 1] + cmp_spec[..., 1] * cmp_mask[..., 0]
	], dim=1
	)
	# est_spec shape: [b, 2, n//2+1, t]
	est_spec = torch.cat(tensors=[est_spec[:, 0, :, :], est_spec[:, 1, :, :]], dim=1)
	# est_spec shape: [b, n+2, t]

	# cmp_mask shape: [b, 1, n//2+1, t, 2]
	cmp_mask = torch.squeeze(cmp_mask, dim=1)
	# cmp_mask shape: [b, n//2+1, t, 2]
	cmp_mask = torch.cat(tensors=[cmp_mask[:, :, :, 0], cmp_mask[:, :, :, 1]], dim=1)
	# cmp_mask shape: [b, n+2, t]

	# est_spec shape: [b, n+2, t]
	est_wav = self.istft(est_spec)
	# est_wav shape: [b, 1, n_samples]
	est_wav = torch.squeeze(est_wav, 1)
	# est_wav shape: [b, n_samples]
	return est_spec, est_wav, cmp_mask

	def get_params(self, weight_decay=0.0):
	"""
	为可训练参数配置 weight_decay (权重衰减) 的作用是实现 L2 正则化。
	1. 防止过拟合: 通过向损失函数添加参数的 L2 范数 (平方和) 作为惩罚项, weight_decay 会限制模型权重的大小.
	这使得模型倾向于学习更小的权重值, 降低对训练数据的过度敏感, 从而提高泛化能力.
	2. 控制模型复杂度: 权重衰减直接作用于优化过程, 在梯度更新时对权重进行衰减,
	公式: weight = weight - lr * (gradient + weight_decay * weight).
	这相当于在梯度下降中额外引入了一个与当前权重值成正比的衰减力, 抑制权重快速增长.
	3. 与优化器的具体实现相关
	在 SGD 等传统优化器中, weight_decay 直接等价于 L2 正则化.
	在 Adam 优化器中, 权重衰减的实现与参数更新耦合, 可能因学习率调整而效果减弱.
	在 AdamW 优化器改进了这一点, 将权重衰减与学习率解耦, 使其更符合 L2 正则化的理论效果.

	注意:
	值过大会导致欠拟合, 过小则正则化效果弱, 常用范围是 1e-4到 1e-2.
	某些场景 (如 BatchNorm 层) 可能需要通过参数分组对不同层设置不同的 weight_decay.
	:param weight_decay:
	:return:
	"""
	weights, biases = [], []
	for name, param in self.named_parameters():
	if "bias" in name:
	biases += [param]
	else:
	weights += [param]

	params = [{
	'params': weights,
	'weight_decay': weight_decay,
	}, {
	'params': biases,
	'weight_decay': 0.0,
	}]
	return params

	def mask_loss_fn(self, est_mask: torch.Tensor, clean: torch.Tensor, noisy: torch.Tensor):
	"""

	:param est_mask: torch.Tensor, shape: [b, n+2, t]
	:param clean:
	:param noisy:
	:return:
	"""
	clean_stft = self.stft(clean)
	clean_re = clean_stft[:, :self.freq_bins, :]
	clean_im = clean_stft[:, self.freq_bins:, :]

	noisy_stft = self.stft(noisy)
	noisy_re = noisy_stft[:, :self.freq_bins, :]
	noisy_im = noisy_stft[:, self.freq_bins:, :]

	noisy_power = noisy_re 2 + noisy_im 2

	sr = clean_re
	yr = noisy_re
	si = clean_im
	yi = noisy_im
	y_pow = noisy_power
	# (Sr * Yr + Si * Yi) / (Y_pow + 1e-8)
	gth_mask_re = (sr * yr + si * yi) / (y_pow + self.eps)
	# (Si * Yr - Sr * Yi) / (Y_pow + 1e-8)
	gth_mask_im = (sr * yr - si * yi) / (y_pow + self.eps)

	gth_mask_re[gth_mask_re > 2] = 1
	gth_mask_re[gth_mask_re < -2] = -1
	gth_mask_im[gth_mask_im > 2] = 1
	gth_mask_im[gth_mask_im < -2] = -1

	mask_re = est_mask[:, :self.freq_bins, :]
	mask_im = est_mask[:, self.freq_bins:, :]

	loss_re = F.mse_loss(gth_mask_re, mask_re)
	loss_im = F.mse_loss(gth_mask_im, mask_im)

	loss = loss_re + loss_im
	return loss


	MODEL_FILE = "model.pt"


	class FRCRNPretrainedModel(FRCRN):
	def __init__(self,
	config: FRCRNConfig,
	):
	super(FRCRNPretrainedModel, self).__init__(
	use_complex_networks=config.use_complex_networks,
	model_complexity=config.model_complexity,
	model_depth=config.model_depth,
	nfft=config.nfft,
	win_size=config.win_size,
	hop_size=config.hop_size,
	win_type=config.win_type,
	)
	self.config = config

	@classmethod
	def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
	config = FRCRNConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

	model = cls(config)

	if os.path.isdir(pretrained_model_name_or_path):
	ckpt_file = os.path.join(pretrained_model_name_or_path, MODEL_FILE)
	else:
	ckpt_file = pretrained_model_name_or_path

	with open(ckpt_file, "rb") as f:
	state_dict = torch.load(f, map_location="cpu", weights_only=True)
	model.load_state_dict(state_dict, strict=True)
	return model

	def save_pretrained(self,
	save_directory: Union[str, os.PathLike],
	state_dict: Optional[dict] = None,
	):

	model = self

	if state_dict is None:
	state_dict = model.state_dict()

	os.makedirs(save_directory, exist_ok=True)

	# save state dict
	model_file = os.path.join(save_directory, MODEL_FILE)
	torch.save(state_dict, model_file)

	# save config
	config_file = os.path.join(save_directory, CONFIG_FILE)
	self.config.to_yaml_file(config_file)
	return save_directory


	def main():
	model = FRCRN(
	use_complex_networks=True,
	model_complexity=-1,
	model_depth=10,
	padding_mode="zeros",
	nfft=128,
	win_size=128,
	hop_size=64,
	win_type="hann",
	)

	# model = FRCRN(
	# use_complex_networks=True,
	# model_complexity=-1,
	# model_depth=14,
	# padding_mode="zeros",
	# nfft=640,
	# win_size=640,
	# hop_size=320,
	# win_type="hann",
	# )

	# model = FRCRN(
	# use_complex_networks=True,
	# model_complexity=45,
	# model_depth=20,
	# padding_mode="zeros",
	# nfft=512,
	# win_size=512,
	# hop_size=256,
	# win_type="hann",
	# )

	mixture = torch.rand(size=(1, 32000), dtype=torch.float32)

	est_spec, est_wav, est_mask = model.forward(mixture)
	print(est_spec.shape)
	print(est_wav.shape)
	print(est_mask.shape)

	return


	if __name__ == "__main__":
	main()