| from typing import Dict |
|
|
| import numpy as np |
| import torch |
| from torch import nn |
|
|
|
|
def numpy_to_torch(np_array, dtype, cuda=False, device="cpu"):
    """Convert a numpy array to a torch tensor of ``dtype`` on ``device``.

    ``None`` input passes through unchanged. The legacy ``cuda`` flag, when
    True, overrides ``device`` with ``"cuda"``.
    """
    target_device = "cuda" if cuda else device
    if np_array is None:
        return None
    return torch.as_tensor(np_array, dtype=dtype, device=target_device)
|
|
|
|
def compute_style_mel(style_wav, ap, cuda=False, device="cpu"):
    """Compute the mel spectrogram of a reference wav for style conditioning.

    Args:
        style_wav (str): Path of the reference wav file.
        ap (AudioProcessor): Audio processor used for loading and feature extraction.
        cuda (bool): Legacy flag; when True, overrides ``device`` with ``"cuda"``.
        device (str): Device to place the output tensor on.

    Returns:
        torch.Tensor: Mel spectrogram with a leading batch dimension, ``[1, C, T]``.
    """
    if cuda:
        device = "cuda"
    # The legacy ``torch.FloatTensor(data, device=...)`` constructor rejects
    # non-CPU devices, so build the tensor first and then move it.
    mel = ap.melspectrogram(ap.load_wav(style_wav, sr=ap.sample_rate))
    style_mel = torch.as_tensor(mel, dtype=torch.float32).unsqueeze(0).to(device)
    return style_mel
|
|
|
|
def run_model_torch(
    model: nn.Module,
    inputs: torch.Tensor,
    speaker_id: int = None,
    style_mel: torch.Tensor = None,
    style_text: str = None,
    d_vector: torch.Tensor = None,
    language_id: torch.Tensor = None,
) -> Dict:
    """Run a torch model for inference. It does not support batch inference.

    Args:
        model (nn.Module): The model to run inference.
        inputs (torch.Tensor): Input tensor with character ids, shape ``[1, T]``.
        speaker_id (int, optional): Input speaker ids for multi-speaker models. Defaults to None.
        style_mel (torch.Tensor, optional): Spectrograms used for voice styling. Defaults to None.
        style_text (str, optional): Transcription of ``style_mel`` for Capacitron models. Defaults to None.
        d_vector (torch.Tensor, optional): d-vector for multi-speaker models. Defaults to None.
        language_id (torch.Tensor, optional): Language id for multi-lingual models. Defaults to None.

    Returns:
        Dict: model outputs.
    """
    # Create the length tensor directly on the input's device rather than
    # allocating on CPU and moving it afterwards.
    input_lengths = torch.tensor(inputs.shape[1:2], device=inputs.device)
    # DataParallel / DistributedDataParallel wrap the real model in ``.module``.
    if hasattr(model, "module"):
        _func = model.module.inference
    else:
        _func = model.inference
    outputs = _func(
        inputs,
        aux_input={
            "x_lengths": input_lengths,
            "speaker_ids": speaker_id,
            "d_vectors": d_vector,
            "style_mel": style_mel,
            "style_text": style_text,
            "language_ids": language_id,
        },
    )
    return outputs
|
|
|
|
def trim_silence(wav, ap):
    """Cut the waveform at the endpoint detected by the audio processor."""
    endpoint = ap.find_endpoint(wav)
    return wav[:endpoint]
|
|
|
|
def inv_spectrogram(postnet_output, ap, CONFIG):
    """Invert the model's output spectrogram back to a waveform.

    Tacotron (v1) emits linear spectrograms; every other model is assumed
    to emit mel spectrograms. The output is transposed to ``[C, T]`` before
    inversion.
    """
    spec = postnet_output.T
    if CONFIG.model.lower() in ["tacotron"]:
        return ap.inv_spectrogram(spec)
    return ap.inv_melspectrogram(spec)
|
|
|
|
def id_to_torch(aux_id, cuda=False, device="cpu"):
    """Convert a speaker/language id to a torch tensor on ``device``.

    ``None`` passes through unchanged. The legacy ``cuda`` flag, when True,
    overrides ``device`` with ``"cuda"``.
    """
    target_device = "cuda" if cuda else device
    if aux_id is None:
        return None
    return torch.from_numpy(np.asarray(aux_id)).to(target_device)
|
|
|
|
def embedding_to_torch(d_vector, cuda=False, device="cpu"):
    """Convert a d-vector to a float32 tensor of shape ``[1, D]`` on ``device``.

    ``None`` passes through unchanged. The input is squeezed first, so any
    singleton dimensions collapse before the batch dimension is re-added.
    The legacy ``cuda`` flag, when True, overrides ``device`` with ``"cuda"``.
    """
    if cuda:
        device = "cuda"
    if d_vector is None:
        return None
    tensor = torch.from_numpy(np.asarray(d_vector)).type(torch.FloatTensor)
    return tensor.squeeze().unsqueeze(0).to(device)
|
|
|
|
| |
def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
    """Apply Griffin-Lim to each sample, iterating over the first dimension.

    Args:
        inputs (Tensor or np.Array): Features to be converted by GL. First dimension is the batch size.
        input_lens (Tensor or np.Array): 1D array of sample lengths.
        CONFIG (Dict): TTS config.
        ap (AudioProcessor): TTS audio processor.

    Returns:
        list: One waveform per input sample, each trimmed to its own length.
    """
    wavs = []
    for sample_idx, spec in enumerate(inputs):
        # One hop of padding is dropped from the end of each reconstruction.
        n_samples = input_lens[sample_idx] * ap.hop_length - ap.hop_length
        waveform = inv_spectrogram(spec, ap, CONFIG)
        wavs.append(waveform[:n_samples])
    return wavs
|
|
|
|
def synthesis(
    model,
    text,
    CONFIG,
    use_cuda,
    speaker_id=None,
    style_wav=None,
    style_text=None,
    use_griffin_lim=False,
    do_trim_silence=False,
    d_vector=None,
    language_id=None,
):
    """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
    the vocoder model.

    Args:
        model (TTS.tts.models):
            The TTS model to synthesize audio with.

        text (str):
            The input text to convert to speech.

        CONFIG (Coqpit):
            Model configuration.

        use_cuda (bool):
            Enable/disable CUDA. When True, overrides the device inferred from the model's parameters.

        speaker_id (int):
            Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.

        style_wav (str | Dict[str, float]):
            Path or tensor to/of a waveform used for computing the style embedding based on GST or Capacitron.
            Defaults to None, meaning that Capacitron models will sample from the prior distribution to
            generate random but realistic prosody.

        style_text (str):
            Transcription of style_wav for Capacitron models. Defaults to None.

        use_griffin_lim (bool):
            Invert output features to a waveform with Griffin-Lim. Defaults to False.

        do_trim_silence (bool):
            trim silence after synthesis (only applied when use_griffin_lim is True). Defaults to False.

        d_vector (torch.Tensor):
            d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None.

        language_id (int):
            Language ID passed to the language embedding layer in multi-langual model. Defaults to None.

    Returns:
        Dict: ``{"wav", "alignments", "text_inputs", "outputs"}``; ``wav`` is None when the model
        emits a spectrogram and Griffin-Lim is disabled.
    """
    # Run on the device holding the model's weights; ``use_cuda`` overrides it.
    device = next(model.parameters()).device
    if use_cuda:
        device = "cuda"

    # GST style conditioning: a dict of token weights is passed through
    # unchanged; a wav reference is converted to a mel spectrogram.
    style_mel = None
    if CONFIG.has("gst") and CONFIG.gst and style_wav is not None:
        if isinstance(style_wav, dict):
            style_mel = style_wav
        else:
            style_mel = compute_style_mel(style_wav, model.ap, device=device)

    # Capacitron: always recompute the reference mel and swap the last two
    # axes — presumably [B, C, T] -> [B, T, C]; TODO confirm expected layout.
    if CONFIG.has("capacitron_vae") and CONFIG.use_capacitron_vae and style_wav is not None:
        style_mel = compute_style_mel(style_wav, model.ap, device=device)
        style_mel = style_mel.transpose(1, 2)  # [1, time, depth]

    # Map the numeric language id back to its name for the tokenizer.
    language_name = None
    if language_id is not None:
        language = [k for k, v in model.language_manager.name_to_id.items() if v == language_id]
        assert len(language) == 1, "language_id must be a valid language"
        language_name = language[0]

    # Convert the input text to token ids.
    text_inputs = np.asarray(
        model.tokenizer.text_to_ids(text, language=language_name),
        dtype=np.int32,
    )
    # Move conditioning inputs to tensors on the target device.
    if speaker_id is not None:
        speaker_id = id_to_torch(speaker_id, device=device)

    if d_vector is not None:
        d_vector = embedding_to_torch(d_vector, device=device)

    if language_id is not None:
        language_id = id_to_torch(language_id, device=device)

    if not isinstance(style_mel, dict):
        # GST token-weight dicts stay as plain dicts; otherwise convert the
        # mel (or None, which passes through) to a tensor.
        style_mel = numpy_to_torch(style_mel, torch.float, device=device)
    if style_text is not None:
        # NOTE(review): passes ``language_id`` (an id / tensor at this point)
        # where the main text above passes ``language_name`` — this looks
        # inconsistent; confirm what the tokenizer expects here.
        style_text = np.asarray(
            model.tokenizer.text_to_ids(style_text, language=language_id),
            dtype=np.int32,
        )
        style_text = numpy_to_torch(style_text, torch.long, device=device)
        style_text = style_text.unsqueeze(0)

    # Add the batch dimension expected by the model.
    text_inputs = numpy_to_torch(text_inputs, torch.long, device=device)
    text_inputs = text_inputs.unsqueeze(0)
    # Run the model (no batch inference support).
    outputs = run_model_torch(
        model,
        text_inputs,
        speaker_id,
        style_mel,
        style_text,
        d_vector=d_vector,
        language_id=language_id,
    )
    model_outputs = outputs["model_outputs"]
    model_outputs = model_outputs[0].data.cpu().numpy()
    alignments = outputs["alignments"]

    # A 2D output is a spectrogram: optionally invert it with Griffin-Lim.
    # A 1D output is already a waveform.
    wav = None
    model_outputs = model_outputs.squeeze()
    if model_outputs.ndim == 2:
        if use_griffin_lim:
            wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
            # Silence trimming only applies to Griffin-Lim output.
            if do_trim_silence:
                wav = trim_silence(wav, model.ap)
    else:
        wav = model_outputs
    return_dict = {
        "wav": wav,
        "alignments": alignments,
        "text_inputs": text_inputs,
        "outputs": outputs,
    }
    return return_dict
|
|
|
|
def transfer_voice(
    model,
    CONFIG,
    use_cuda,
    reference_wav,
    speaker_id=None,
    d_vector=None,
    reference_speaker_id=None,
    reference_d_vector=None,
    do_trim_silence=False,
    use_griffin_lim=False,
):
    """Transfer the voice of a target speaker onto a reference waveform.

    Runs the model's voice-conversion inference and either returns output
    features for a vocoder or a Griffin-Lim waveform.

    Args:
        model (TTS.tts.models):
            The TTS model to synthesize audio with.

        CONFIG (Coqpit):
            Model configuration.

        use_cuda (bool):
            Enable/disable CUDA. When True, overrides the device inferred from the model's parameters.

        reference_wav (str):
            Path of reference_wav to be used to voice conversion.

        speaker_id (int):
            Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.

        d_vector (torch.Tensor):
            d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None.

        reference_speaker_id (int):
            Reference Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.

        reference_d_vector (torch.Tensor):
            Reference d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None.

        use_griffin_lim (bool):
            Invert output features to a waveform with Griffin-Lim. Defaults to False.

        do_trim_silence (bool):
            trim silence after synthesis (only applied when use_griffin_lim is True). Defaults to False.

    Returns:
        Output features (spectrogram) or a waveform; None when the model emits a
        spectrogram and Griffin-Lim is disabled.
    """
    # Run on the device holding the model's weights; ``use_cuda`` overrides it.
    device = next(model.parameters()).device
    if use_cuda:
        device = "cuda"

    # Move conditioning inputs to tensors on the target device.
    if speaker_id is not None:
        speaker_id = id_to_torch(speaker_id, device=device)

    if d_vector is not None:
        d_vector = embedding_to_torch(d_vector, device=device)

    if reference_d_vector is not None:
        reference_d_vector = embedding_to_torch(reference_d_vector, device=device)

    # Load the reference audio, resampling to the encoder's rate when one is
    # configured. NOTE(review): ``embedding_to_torch`` is reused here to get a
    # float32 ``[1, T]`` tensor — it is a wav, not an embedding.
    reference_wav = embedding_to_torch(
        model.ap.load_wav(
            reference_wav, sr=model.args.encoder_sample_rate if model.args.encoder_sample_rate else model.ap.sample_rate
        ),
        device=device,
    )

    # DataParallel / DistributedDataParallel wrap the real model in ``.module``.
    if hasattr(model, "module"):
        _func = model.module.inference_voice_conversion
    else:
        _func = model.inference_voice_conversion
    model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector)

    # A 2D output is a spectrogram: optionally invert it with Griffin-Lim.
    # A 1D output is already a waveform.
    wav = None
    model_outputs = model_outputs.squeeze()
    if model_outputs.ndim == 2:
        if use_griffin_lim:
            wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
            # Silence trimming only applies to Griffin-Lim output.
            if do_trim_silence:
                wav = trim_silence(wav, model.ap)
    else:
        wav = model_outputs

    return wav
|
|