First commit

3c50954 29 days ago

6.86 kB

	# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
	# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
	# 2023 NetEase Inc
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# Modified from ESPnet(https://github.com/espnet/espnet)
	"""Encoder definition."""
	from typing import Optional, Tuple

	import torch

	from wenet.utils.mask import make_pad_mask
	from wenet.transformer.encoder import TransformerEncoder, ConformerEncoder


	class DualTransformerEncoder(TransformerEncoder):
	    """Transformer encoder that also offers a full-context forward path.

	    Construction is delegated unchanged to ``TransformerEncoder``; the only
	    addition is :meth:`forward_full`, which runs the encoder layers with
	    the mask produced by the embedding layer and no chunk mask.
	    """

	    def __init__(
	        self,
	        input_size: int,
	        output_size: int = 256,
	        attention_heads: int = 4,
	        linear_units: int = 2048,
	        num_blocks: int = 6,
	        dropout_rate: float = 0.1,
	        positional_dropout_rate: float = 0.1,
	        attention_dropout_rate: float = 0.0,
	        input_layer: str = "conv2d",
	        pos_enc_layer_type: str = "abs_pos",
	        normalize_before: bool = True,
	        static_chunk_size: int = 0,
	        use_dynamic_chunk: bool = False,
	        global_cmvn: torch.nn.Module = None,
	        use_dynamic_left_chunk: bool = False,
	        query_bias: bool = True,
	        key_bias: bool = True,
	        value_bias: bool = True,
	        activation_type: str = "relu",
	        gradient_checkpointing: bool = False,
	        use_sdpa: bool = False,
	        layer_norm_type: str = "layer_norm",
	        norm_eps: float = 1e-5,
	        n_kv_head: Optional[int] = None,
	        head_dim: Optional[int] = None,
	        selfattention_layer_type: str = "selfattn",
	        mlp_type: str = "position_wise_feed_forward",
	        mlp_bias: bool = True,
	        n_expert: int = 8,
	        n_expert_activated: int = 2,
	    ):
	        """Construct DualTransformerEncoder.

	        Supports both the full context mode and the streaming mode
	        separately. Every argument is passed straight through to the
	        parent constructor, positionally and in declaration order.
	        """
	        # NOTE: the order below must mirror the parent's positional
	        # signature; do not reorder without checking the base class.
	        super().__init__(
	            input_size,
	            output_size,
	            attention_heads,
	            linear_units,
	            num_blocks,
	            dropout_rate,
	            positional_dropout_rate,
	            attention_dropout_rate,
	            input_layer,
	            pos_enc_layer_type,
	            normalize_before,
	            static_chunk_size,
	            use_dynamic_chunk,
	            global_cmvn,
	            use_dynamic_left_chunk,
	            query_bias,
	            key_bias,
	            value_bias,
	            activation_type,
	            gradient_checkpointing,
	            use_sdpa,
	            layer_norm_type,
	            norm_eps,
	            n_kv_head,
	            head_dim,
	            selfattention_layer_type,
	            mlp_type,
	            mlp_bias,
	            n_expert,
	            n_expert_activated,
	        )

	    def forward_full(
	        self,
	        xs: torch.Tensor,
	        xs_lens: torch.Tensor,
	        decoding_chunk_size: int = 0,
	        num_decoding_left_chunks: int = -1,
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Run the encoder over the whole input without a chunk mask.

	        Args:
	            xs: Padded input batch; its dimension 1 is used as the length T.
	                Presumably (B, T, feature_dim) -- confirm with callers.
	            xs_lens: Lengths passed to ``make_pad_mask`` to build the mask.
	            decoding_chunk_size: Accepted but not read by this method.
	            num_decoding_left_chunks: Accepted but not read by this method.

	        Returns:
	            Tuple ``(output, masks)``: the encoder output (after
	            ``after_norm`` when ``normalize_before`` is set) and the mask
	            as returned by the last encoder layer.
	        """
	        # Inverted padding mask with a singleton axis inserted: (B, 1, T).
	        padding = make_pad_mask(xs_lens, xs.size(1))
	        layer_masks = ~padding.unsqueeze(1)

	        if self.global_cmvn is not None:
	            xs = self.global_cmvn(xs)

	        enc_out, pos_emb, layer_masks = self.embed(xs, layer_masks)

	        # Mask as returned by embed, (B, 1, T/subsample_rate); the same
	        # tensor is handed to every layer as its fourth argument.
	        mask_pad = layer_masks
	        for layer in self.encoders:
	            enc_out, layer_masks, _, _ = layer(enc_out, layer_masks, pos_emb,
	                                               mask_pad)

	        if self.normalize_before:
	            enc_out = self.after_norm(enc_out)
	        return enc_out, layer_masks


	class DualConformerEncoder(ConformerEncoder):
	    """Conformer encoder that also offers a full-context forward path.

	    Construction is delegated unchanged to ``ConformerEncoder``; the only
	    addition is :meth:`forward_full`, which runs the encoder layers with
	    the mask produced by the embedding layer and no chunk mask.
	    """

	    def __init__(
	        self,
	        input_size: int,
	        output_size: int = 256,
	        attention_heads: int = 4,
	        linear_units: int = 2048,
	        num_blocks: int = 6,
	        dropout_rate: float = 0.1,
	        positional_dropout_rate: float = 0.1,
	        attention_dropout_rate: float = 0.0,
	        input_layer: str = "conv2d",
	        pos_enc_layer_type: str = "rel_pos",
	        normalize_before: bool = True,
	        static_chunk_size: int = 0,
	        use_dynamic_chunk: bool = False,
	        global_cmvn: torch.nn.Module = None,
	        use_dynamic_left_chunk: bool = False,
	        positionwise_conv_kernel_size: int = 1,
	        macaron_style: bool = True,
	        selfattention_layer_type: str = "rel_selfattn",
	        activation_type: str = "swish",
	        use_cnn_module: bool = True,
	        cnn_module_kernel: int = 15,
	        causal: bool = False,
	        cnn_module_norm: str = "batch_norm",
	        query_bias: bool = True,
	        key_bias: bool = True,
	        value_bias: bool = True,
	        conv_bias: bool = True,
	        gradient_checkpointing: bool = False,
	        use_sdpa: bool = False,
	        layer_norm_type: str = "layer_norm",
	        norm_eps: float = 1e-5,
	        n_kv_head: Optional[int] = None,
	        head_dim: Optional[int] = None,
	        mlp_type: str = "position_wise_feed_forward",
	        mlp_bias: bool = True,
	        n_expert: int = 8,
	        n_expert_activated: int = 2,
	    ):
	        """Construct DualConformerEncoder.

	        Supports both the full context mode and the streaming mode
	        separately. Every argument is passed straight through to the
	        parent constructor, positionally and in declaration order.
	        """
	        # NOTE: the order below must mirror the parent's positional
	        # signature; do not reorder without checking the base class.
	        super().__init__(
	            input_size,
	            output_size,
	            attention_heads,
	            linear_units,
	            num_blocks,
	            dropout_rate,
	            positional_dropout_rate,
	            attention_dropout_rate,
	            input_layer,
	            pos_enc_layer_type,
	            normalize_before,
	            static_chunk_size,
	            use_dynamic_chunk,
	            global_cmvn,
	            use_dynamic_left_chunk,
	            positionwise_conv_kernel_size,
	            macaron_style,
	            selfattention_layer_type,
	            activation_type,
	            use_cnn_module,
	            cnn_module_kernel,
	            causal,
	            cnn_module_norm,
	            query_bias,
	            key_bias,
	            value_bias,
	            conv_bias,
	            gradient_checkpointing,
	            use_sdpa,
	            layer_norm_type,
	            norm_eps,
	            n_kv_head,
	            head_dim,
	            mlp_type,
	            mlp_bias,
	            n_expert,
	            n_expert_activated,
	        )

	    def forward_full(
	        self,
	        xs: torch.Tensor,
	        xs_lens: torch.Tensor,
	        decoding_chunk_size: int = 0,
	        num_decoding_left_chunks: int = -1,
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Run the encoder over the whole input without a chunk mask.

	        Args:
	            xs: Padded input batch; its dimension 1 is used as the length T.
	                Presumably (B, T, feature_dim) -- confirm with callers.
	            xs_lens: Lengths passed to ``make_pad_mask`` to build the mask.
	            decoding_chunk_size: Accepted but not read by this method.
	            num_decoding_left_chunks: Accepted but not read by this method.

	        Returns:
	            Tuple ``(output, masks)``: the encoder output (after
	            ``after_norm`` when ``normalize_before`` is set) and the mask
	            as returned by the last encoder layer.
	        """
	        # Inverted padding mask with a singleton axis inserted: (B, 1, T).
	        padding = make_pad_mask(xs_lens, xs.size(1))
	        layer_masks = ~padding.unsqueeze(1)

	        if self.global_cmvn is not None:
	            xs = self.global_cmvn(xs)

	        enc_out, pos_emb, layer_masks = self.embed(xs, layer_masks)

	        # Mask as returned by embed, (B, 1, T/subsample_rate); the same
	        # tensor is handed to every layer as its fourth argument.
	        mask_pad = layer_masks
	        for layer in self.encoders:
	            enc_out, layer_masks, _, _ = layer(enc_out, layer_masks, pos_emb,
	                                               mask_pad)

	        if self.normalize_before:
	            enc_out = self.after_norm(enc_out)
	        return enc_out, layer_masks