import os
import pdb
import copy
import torch
import argparse
import loralib as lora
import transformers.models.wavlm.modeling_wavlm as wavlm

from speechbrain.nnet.normalization import LayerNorm
from speechbrain.lobes.models.huggingface_transformers.huggingface import make_padding_masks
from torch import nn
from torch.nn import functional as F
from transformers import Wav2Vec2FeatureExtractor
from transformers import WavLMModel

import sys
from pathlib import Path
sys.path.append(os.path.join(str(Path(os.path.realpath(__file__)).parents[1])))

from revgrad import RevGrad
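
# The two encoder-layer classes below mirror the Hugging Face WavLM encoder layers
# (post-layer-norm for the base checkpoints, stable/pre-layer-norm for wavlm-large).
# In the upper half of the encoder they swap the feed-forward dense projections for
# loralib.Linear so LoRA adapters can be trained while the backbone stays frozen.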
class WavLMEncoderLayer(nn.Module):
    def __init__(self, layer_idx, config, has_relative_position_bias: bool = True):
        super().__init__()
        self.attention = wavlm.WavLMAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            num_buckets=config.num_buckets,
            max_distance=config.max_bucket_distance,
            has_relative_position_bias=has_relative_position_bias,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = wavlm.WavLMFeedForward(config)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.config = config

        if layer_idx > config.num_hidden_layers // 2:
            if self.config.finetune_method == "lora" or self.config.finetune_method == "combined":
                self.feed_forward.intermediate_dense = lora.Linear(config.hidden_size, config.intermediate_size, r=config.lora_rank)
                self.feed_forward.output_dense = lora.Linear(config.intermediate_size, config.hidden_size, r=config.lora_rank)

    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False, index=0):
        attn_residual = hidden_states
        hidden_states, attn_weights, position_bias = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
            index=index,
        )
        hidden_states = self.dropout(hidden_states)
        hidden_states = attn_residual + hidden_states
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states, position_bias)
        if output_attentions:
            outputs += (attn_weights,)
        return outputs

class WavLMEncoderLayerStableLayerNorm(nn.Module):
    def __init__(self, layer_idx, config, has_relative_position_bias: bool = True):
        super().__init__()
        self.attention = wavlm.WavLMAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            num_buckets=config.num_buckets,
            max_distance=config.max_bucket_distance,
            has_relative_position_bias=has_relative_position_bias,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = wavlm.WavLMFeedForward(config)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.config = config

        if layer_idx > config.num_hidden_layers // 2:
            if self.config.finetune_method == "lora" or self.config.finetune_method == "combined":
                self.feed_forward.intermediate_dense = lora.Linear(config.hidden_size, config.intermediate_size, r=config.lora_rank)
                self.feed_forward.output_dense = lora.Linear(config.intermediate_size, config.hidden_size, r=config.lora_rank)

    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False):
        attn_residual = hidden_states
        hidden_states = self.layer_norm(hidden_states)
        hidden_states, attn_weights, position_bias = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
        )
        hidden_states = self.dropout(hidden_states)
        hidden_states = attn_residual + hidden_states
        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))

        outputs = (hidden_states, position_bias)
        if output_attentions:
            outputs += (attn_weights,)
        return outputs

class WavLMWrapper(nn.Module):
    def __init__(
        self,
        pretrain_model="wavlm_large",
        hidden_dim=256,
        finetune_method="lora",
        lora_rank=16,
        freeze_params=True,
        output_class_num=4,
        use_conv_output=True,
        apply_gradient_reversal=False,
        num_dataset=4
    ):
        super(WavLMWrapper, self).__init__()
        # 1. Load the backbone model with pretrained weights
        if pretrain_model == "wavlm":
            self.backbone_model = WavLMModel.from_pretrained(
                "microsoft/wavlm-base-plus",
                output_hidden_states=True,
            )
        elif pretrain_model == "wavlm_large":
            self.processor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-large')
            self.backbone_model = WavLMModel.from_pretrained(
                "microsoft/wavlm-large",
                output_hidden_states=True,
            )
        self.pretrain_model = pretrain_model
        self.finetune_method = finetune_method
        self.apply_gradient_reversal = apply_gradient_reversal
        self.use_conv_output = use_conv_output
        state_dict = self.backbone_model.state_dict()

        # 2. Read the model config
        self.model_config = self.backbone_model.config
        self.model_config.finetune_method = finetune_method
        self.model_config.lora_rank = lora_rank

        # 3. Replace the encoder layers with the LoRA-capable variants defined above
        if self.pretrain_model == "wavlm":
            self.backbone_model.encoder.layers = nn.ModuleList(
                [WavLMEncoderLayer(i, self.model_config, has_relative_position_bias=(i == 0)) for i in range(self.model_config.num_hidden_layers)]
            )
        elif self.pretrain_model == "wavlm_large":
            self.backbone_model.encoder.layers = nn.ModuleList(
                [WavLMEncoderLayerStableLayerNorm(i, self.model_config, has_relative_position_bias=(i == 0)) for i in range(self.model_config.num_hidden_layers)]
            )

        # 4. Load the pretrained weights back into the replaced layers
        msg = self.backbone_model.load_state_dict(state_dict, strict=False)

        # 5. Freeze the backbone weights (with LoRA, only the adapter weights stay trainable)
        self.freeze_params = freeze_params
        if self.freeze_params and self.finetune_method != "lora":
            for _, p in self.backbone_model.named_parameters():
                p.requires_grad = False
        elif self.freeze_params and self.finetune_method == "lora":
            for name, p in self.backbone_model.named_parameters():
                if name in msg.missing_keys:
                    p.requires_grad = True
                else:
                    p.requires_grad = False
        else:
            for _, p in self.backbone_model.named_parameters():
                p.requires_grad = True

        # 6. Downstream models
        self.model_seq = nn.Sequential(
            nn.Conv1d(self.model_config.hidden_size, hidden_dim, 1, padding=0),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Conv1d(hidden_dim, hidden_dim, 1, padding=0)
        )

        if self.use_conv_output:
            num_layers = self.model_config.num_hidden_layers + 1  # transformer layers + input embeddings
            self.weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        else:
            num_layers = self.model_config.num_hidden_layers
            self.weights = nn.Parameter(torch.zeros(num_layers))

        if apply_gradient_reversal:
            self.dataset_layer = nn.Sequential(
                RevGrad(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, num_dataset),
            )

        self.out_layer = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_class_num),
        )

    def forward(self, x, length=None, return_feature=False):
        # 1. Feature extraction and projections
        if self.pretrain_model == "wavlm_large":
            with torch.no_grad():
                signal = list()
                if length is not None:
                    attention_mask = make_padding_masks(x, wav_len=length / length.max()).to(x.device)
                else:
                    attention_mask = make_padding_masks(x, wav_len=torch.tensor([1]).to(x.device)).to(x.device)
                for idx in range(len(x)):
                    input = self.processor(x[idx], sampling_rate=16_000, return_tensors="pt", padding=True)
                    signal.append(input["input_values"][0].to(x.device))
                signal = torch.stack(signal)

        # 2. Convert sample lengths to feature-frame lengths
        if length is not None:
            length = self.get_feat_extract_output_lengths(length.detach().cpu())
            length = length.to(x.device)

        # 3. Transformer encoding: collect the hidden states of all layers
        if self.pretrain_model == "wavlm":
            x = self.backbone_model(
                x, output_hidden_states=True
            ).hidden_states
        else:
            x = self.backbone_model(
                signal,
                attention_mask=attention_mask,
                output_hidden_states=True
            ).hidden_states

        # 4. Stack the layer outputs: [num_layers, B, T, D]
        if self.use_conv_output:
            stacked_feature = torch.stack(x, dim=0)
        else:
            stacked_feature = torch.stack(x, dim=0)[1:]

        # 5. Weighted sum over layers
        _, *origin_shape = stacked_feature.shape
        if self.use_conv_output:
            stacked_feature = stacked_feature.view(self.backbone_model.config.num_hidden_layers + 1, -1)
        else:
            stacked_feature = stacked_feature.view(self.backbone_model.config.num_hidden_layers, -1)
        norm_weights = F.softmax(self.weights, dim=-1)
        weighted_feature = (norm_weights.unsqueeze(-1) * stacked_feature).sum(dim=0)
        features = weighted_feature.view(*origin_shape)

        # 6. Pass the weighted average to the point-wise 1D conv stack
        # B x T x D
        features = features.transpose(1, 2)
        features = self.model_seq(features)
        features = features.transpose(1, 2)

        # 7. Pooling over time
        if length is not None:
            mean, std = list(), list()
            for snt_id in range(features.shape[0]):
                # Avoid padded time steps
                actual_size = length[snt_id]
                mean.append(torch.mean(features[snt_id, 0:actual_size, ...], dim=0))
            features = torch.stack(mean)
        else:
            features = torch.mean(features, dim=1)

        # 8. Output predictions
        # B x D
        predicted = self.out_layer(features)
        if self.apply_gradient_reversal:
            dataset_predicted = self.dataset_layer(features)
            if return_feature:
                return predicted, dataset_predicted, features
            return predicted, dataset_predicted
        if return_feature:
            return predicted, features
        return predicted

    # From huggingface
    def get_feat_extract_output_lengths(self, input_length):
        """
        Computes the output length of the convolutional layers
        """
        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return (input_length - kernel_size) // stride + 1

        for kernel_size, stride in zip(self.backbone_model.config.conv_kernel, self.backbone_model.config.conv_stride):
            input_length = _conv_out_length(input_length, kernel_size, stride)
        return input_length

def prepare_mask(length, shape, dtype):
    # Modified from huggingface
    mask = torch.zeros(
        shape, dtype=dtype
    )
    # These two operations make sure that all values
    # before the output length indices are attended to
    mask[(torch.arange(mask.shape[0]), length.cpu() - 1)] = 1
    mask = mask.flip([-1]).cumsum(-1).flip([-1]).bool()
    return mask
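
# Minimal usage sketch: instantiates the wrapper with the base-plus backbone and
# runs a forward pass on random audio. It assumes the pretrained checkpoints can
# be downloaded and that `revgrad` is importable; the shapes below are illustrative.
if __name__ == "__main__":
    model = WavLMWrapper(
        pretrain_model="wavlm",   # base-plus path, which skips the Wav2Vec2FeatureExtractor preprocessing
        finetune_method="lora",
        lora_rank=16,
        output_class_num=4,
    )
    model.eval()

    # Two one-second utterances at 16 kHz
    wav = torch.randn(2, 16000)
    with torch.no_grad():
        logits = model(wav)
    print(logits.shape)  # torch.Size([2, 4])

    # The conv feature extractor downsamples by roughly a factor of 320,
    # e.g. 16000 samples -> 49 frames with the default kernel/stride config.
    print(model.get_feat_extract_output_lengths(torch.tensor([16000])))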