# tensorflow_tts/models/fastspeech2.py
# -*- coding: utf-8 -*-
# Copyright 2020 The FastSpeech2 Authors and Minh Nguyen (@dathudeptrai)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TensorFlow model modules for FastSpeech2."""

import tensorflow as tf

from tensorflow_tts.models.fastspeech import TFFastSpeech, get_initializer
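
# TFFastSpeech (the base class) provides the embeddings, encoder,
# length_regulator, decoder, mel_dense and postnet sub-layers that are reused
# unchanged below; only the variance adaptor and inference logic are new here.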


class TFFastSpeechVariantPredictor(tf.keras.layers.Layer):
    """FastSpeech2 variant predictor module (shared by the duration, F0, and energy heads)."""

    def __init__(self, config, **kwargs):
        """Init variables."""
        super().__init__(**kwargs)
        # each block: Conv1D -> ReLU -> LayerNorm -> Dropout
        self.conv_layers = []
        for i in range(config.variant_prediction_num_conv_layers):
            self.conv_layers.append(
                tf.keras.layers.Conv1D(
                    config.variant_predictor_filter,
                    config.variant_predictor_kernel_size,
                    padding="same",
                    name="conv_._{}".format(i),
                )
            )
            self.conv_layers.append(tf.keras.layers.Activation(tf.nn.relu))
            self.conv_layers.append(
                tf.keras.layers.LayerNormalization(
                    epsilon=config.layer_norm_eps, name="LayerNorm_._{}".format(i)
                )
            )
            self.conv_layers.append(
                tf.keras.layers.Dropout(config.variant_predictor_dropout_rate)
            )
        self.conv_layers_sequence = tf.keras.Sequential(self.conv_layers)
        self.output_layer = tf.keras.layers.Dense(1)

        if config.n_speakers > 1:
            self.decoder_speaker_embeddings = tf.keras.layers.Embedding(
                config.n_speakers,
                config.encoder_self_attention_params.hidden_size,
                embeddings_initializer=get_initializer(config.initializer_range),
                name="speaker_embeddings",
            )
            self.speaker_fc = tf.keras.layers.Dense(
                units=config.encoder_self_attention_params.hidden_size,
                name="speaker_fc",
            )

        self.config = config
    def call(self, inputs, training=False):
        """Call logic."""
        encoder_hidden_states, speaker_ids, attention_mask = inputs
        attention_mask = tf.cast(
            tf.expand_dims(attention_mask, 2), encoder_hidden_states.dtype
        )  # [batch_size, length, 1]

        if self.config.n_speakers > 1:
            speaker_embeddings = self.decoder_speaker_embeddings(speaker_ids)
            speaker_features = tf.math.softplus(self.speaker_fc(speaker_embeddings))
            # extended speaker features: [batch_size, 1, hidden_size], broadcast over length
            extended_speaker_features = speaker_features[:, tf.newaxis, :]
            encoder_hidden_states += extended_speaker_features

        # mask encoder hidden states
        masked_encoder_hidden_states = encoder_hidden_states * attention_mask

        # pass through the conv stack, then project each position to a scalar
        outputs = self.conv_layers_sequence(masked_encoder_hidden_states)
        outputs = self.output_layer(outputs)
        masked_outputs = outputs * attention_mask
        outputs = tf.squeeze(masked_outputs, -1)  # [batch_size, length]
        return outputs
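

# Note: TFFastSpeech2 below instantiates this predictor three times (duration,
# F0, energy); the heads share the same architecture but learn separate weights.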


class TFFastSpeech2(TFFastSpeech):
    """TF FastSpeech2 module."""

    def __init__(self, config, **kwargs):
        """Init layers for FastSpeech2."""
        super().__init__(config, **kwargs)
        self.f0_predictor = TFFastSpeechVariantPredictor(
            config, dtype=tf.float32, name="f0_predictor"
        )
        self.energy_predictor = TFFastSpeechVariantPredictor(
            config, dtype=tf.float32, name="energy_predictor"
        )
        self.duration_predictor = TFFastSpeechVariantPredictor(
            config, dtype=tf.float32, name="duration_predictor"
        )

        # define f0_embeddings and energy_embeddings
        self.f0_embeddings = tf.keras.layers.Conv1D(
            filters=config.encoder_self_attention_params.hidden_size,
            kernel_size=9,
            padding="same",
            name="f0_embeddings",
        )
        self.f0_dropout = tf.keras.layers.Dropout(0.5)
        self.energy_embeddings = tf.keras.layers.Conv1D(
            filters=config.encoder_self_attention_params.hidden_size,
            kernel_size=9,
            padding="same",
            name="energy_embeddings",
        )
        self.energy_dropout = tf.keras.layers.Dropout(0.5)
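
        # F0 and energy enter as scalar sequences; the kernel-size-9 convs above
        # lift them to hidden_size so they can be added directly onto the encoder
        # hidden states (see call and _inference below).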

    def _build(self):
        """Build all weights by running one forward pass on dummy inputs."""
        # fake inputs
        input_ids = tf.convert_to_tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]], tf.int32)
        speaker_ids = tf.convert_to_tensor([0], tf.int32)
        duration_gts = tf.convert_to_tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], tf.int32)
        f0_gts = tf.convert_to_tensor(
            [[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]], tf.float32
        )
        energy_gts = tf.convert_to_tensor(
            [[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]], tf.float32
        )
        self(
            input_ids=input_ids,
            speaker_ids=speaker_ids,
            duration_gts=duration_gts,
            f0_gts=f0_gts,
            energy_gts=energy_gts,
        )

    def call(
        self,
        input_ids,
        speaker_ids,
        duration_gts,
        f0_gts,
        energy_gts,
        training=False,
        **kwargs,
    ):
        """Call logic."""
        attention_mask = tf.math.not_equal(input_ids, 0)
        embedding_output = self.embeddings([input_ids, speaker_ids], training=training)
        encoder_output = self.encoder(
            [embedding_output, attention_mask], training=training
        )
        last_encoder_hidden_states = encoder_output[0]

        # The variant predictors consume last_encoder_hidden_states; you could
        # also combine several encoder hidden-state layers instead of only the last one.
        duration_outputs = self.duration_predictor(
            [last_encoder_hidden_states, speaker_ids, attention_mask]
        )  # [batch_size, char_length]
        f0_outputs = self.f0_predictor(
            [last_encoder_hidden_states, speaker_ids, attention_mask], training=training
        )
        energy_outputs = self.energy_predictor(
            [last_encoder_hidden_states, speaker_ids, attention_mask], training=training
        )

        f0_embedding = self.f0_embeddings(
            tf.expand_dims(f0_gts, 2)
        )  # [batch_size, char_length, hidden_size]
        energy_embedding = self.energy_embeddings(
            tf.expand_dims(energy_gts, 2)
        )  # [batch_size, char_length, hidden_size]

        # apply dropout in both training and inference (training=True is intentional)
        f0_embedding = self.f0_dropout(f0_embedding, training=True)
        energy_embedding = self.energy_dropout(energy_embedding, training=True)

        # sum features
        last_encoder_hidden_states += f0_embedding + energy_embedding

        length_regulator_outputs, encoder_masks = self.length_regulator(
            [last_encoder_hidden_states, duration_gts], training=training
        )

        # create decoder positional embedding
        decoder_pos = tf.range(
            1, tf.shape(length_regulator_outputs)[1] + 1, dtype=tf.int32
        )
        masked_decoder_pos = tf.expand_dims(decoder_pos, 0) * encoder_masks

        decoder_output = self.decoder(
            [length_regulator_outputs, speaker_ids, encoder_masks, masked_decoder_pos],
            training=training,
        )
        last_decoder_hidden_states = decoder_output[0]

        # you could also sum or concatenate several decoder hidden-state layers here.
        mels_before = self.mel_dense(last_decoder_hidden_states)
        mels_after = (
            self.postnet([mels_before, encoder_masks], training=training) + mels_before
        )

        outputs = (
            mels_before,
            mels_after,
            duration_outputs,
            f0_outputs,
            energy_outputs,
        )
        return outputs
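
    # In a typical FastSpeech2 training setup, duration_outputs / f0_outputs /
    # energy_outputs are regressed against the ground truths passed into call();
    # the predictor outputs are only consumed for synthesis in _inference below.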

    def _inference(
        self, input_ids, speaker_ids, speed_ratios, f0_ratios, energy_ratios, **kwargs,
    ):
        """Inference logic."""
        attention_mask = tf.math.not_equal(input_ids, 0)
        embedding_output = self.embeddings([input_ids, speaker_ids], training=False)
        encoder_output = self.encoder(
            [embedding_output, attention_mask], training=False
        )
        last_encoder_hidden_states = encoder_output[0]

        # expand ratios to [B, 1] so they broadcast over the length axis
        speed_ratios = tf.expand_dims(speed_ratios, 1)  # [B, 1]
        f0_ratios = tf.expand_dims(f0_ratios, 1)  # [B, 1]
        energy_ratios = tf.expand_dims(energy_ratios, 1)  # [B, 1]

        # The variant predictors consume last_encoder_hidden_states; you could
        # also combine several encoder hidden-state layers instead of only the last one.
        duration_outputs = self.duration_predictor(
            [last_encoder_hidden_states, speaker_ids, attention_mask]
        )  # [batch_size, char_length]
        # durations are predicted in the log domain (presumably log(d + 1) targets);
        # invert with exp(x) - 1 and clip negatives to zero before rounding.
        duration_outputs = tf.nn.relu(tf.math.exp(duration_outputs) - 1.0)
        duration_outputs = tf.cast(
            tf.math.round(duration_outputs * speed_ratios), tf.int32
        )

        f0_outputs = self.f0_predictor(
            [last_encoder_hidden_states, speaker_ids, attention_mask], training=False
        )
        f0_outputs *= f0_ratios

        energy_outputs = self.energy_predictor(
            [last_encoder_hidden_states, speaker_ids, attention_mask], training=False
        )
        energy_outputs *= energy_ratios

        # apply dropout in both training and inference (training=True is intentional)
        f0_embedding = self.f0_dropout(
            self.f0_embeddings(tf.expand_dims(f0_outputs, 2)), training=True
        )
        energy_embedding = self.energy_dropout(
            self.energy_embeddings(tf.expand_dims(energy_outputs, 2)), training=True
        )

        # sum features
        last_encoder_hidden_states += f0_embedding + energy_embedding

        length_regulator_outputs, encoder_masks = self.length_regulator(
            [last_encoder_hidden_states, duration_outputs], training=False
        )

        # create decoder positional embedding
        decoder_pos = tf.range(
            1, tf.shape(length_regulator_outputs)[1] + 1, dtype=tf.int32
        )
        masked_decoder_pos = tf.expand_dims(decoder_pos, 0) * encoder_masks

        decoder_output = self.decoder(
            [length_regulator_outputs, speaker_ids, encoder_masks, masked_decoder_pos],
            training=False,
        )
        last_decoder_hidden_states = decoder_output[0]

        # you could also sum or concatenate several decoder hidden-state layers here.
        mel_before = self.mel_dense(last_decoder_hidden_states)
        mel_after = (
            self.postnet([mel_before, encoder_masks], training=False) + mel_before
        )

        outputs = (mel_before, mel_after, duration_outputs, f0_outputs, energy_outputs)
        return outputs
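
    # _inference mirrors call() but replaces the ground-truth duration/F0/energy
    # with the predictor outputs, each scaled by a caller-supplied ratio.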

    def setup_inference_fn(self):
        """Wrap _inference as tf.functions with fixed input signatures."""
        self.inference = tf.function(
            self._inference,
            experimental_relax_shapes=True,
            input_signature=[
                tf.TensorSpec(shape=[None, None], dtype=tf.int32, name="input_ids"),
                tf.TensorSpec(shape=[None], dtype=tf.int32, name="speaker_ids"),
                tf.TensorSpec(shape=[None], dtype=tf.float32, name="speed_ratios"),
                tf.TensorSpec(shape=[None], dtype=tf.float32, name="f0_ratios"),
                tf.TensorSpec(shape=[None], dtype=tf.float32, name="energy_ratios"),
            ],
        )
        # batch-size-1 variant with static leading dims for TFLite conversion
        self.inference_tflite = tf.function(
            self._inference,
            experimental_relax_shapes=True,
            input_signature=[
                tf.TensorSpec(shape=[1, None], dtype=tf.int32, name="input_ids"),
                tf.TensorSpec(shape=[1], dtype=tf.int32, name="speaker_ids"),
                tf.TensorSpec(shape=[1], dtype=tf.float32, name="speed_ratios"),
                tf.TensorSpec(shape=[1], dtype=tf.float32, name="f0_ratios"),
                tf.TensorSpec(shape=[1], dtype=tf.float32, name="energy_ratios"),
            ],
        )
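

# A minimal usage sketch (not part of the original module). It assumes
# FastSpeech2Config is importable from tensorflow_tts.configs with workable
# defaults, as in recent TensorFlowTTS releases; adjust to match your checkout.
if __name__ == "__main__":
    from tensorflow_tts.configs import FastSpeech2Config

    config = FastSpeech2Config()
    model = TFFastSpeech2(config, name="fastspeech2")
    model._build()  # one dummy forward pass creates all weights
    model.setup_inference_fn()  # harmless if the base class already called it

    # One utterance of 10 phoneme ids with neutral speed/F0/energy ratios.
    mel_before, mel_after, durations, f0, energy = model.inference(
        tf.constant([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]], tf.int32),  # input_ids
        tf.constant([0], tf.int32),  # speaker_ids
        tf.constant([1.0], tf.float32),  # speed_ratios
        tf.constant([1.0], tf.float32),  # f0_ratios
        tf.constant([1.0], tf.float32),  # energy_ratios
    )
    print(mel_after.shape)  # (1, total_frames, num_mels)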