Spaces:
Runtime error
Runtime error
| import re | |
| import os | |
| import nltk | |
| import torch | |
| import pickle | |
| import torchaudio | |
| import numpy as np | |
| from TTS.tts.models.xtts import Xtts | |
| from nltk.tokenize import sent_tokenize | |
| from TTS.tts.configs.xtts_config import XttsConfig | |
| def _load_array(filename): | |
| """ Opens a file a returns it, used with numpy files """ | |
| with open(filename, 'rb') as f: | |
| return pickle.load(f) | |
# Flag the Coqui terms of service as agreed through the environment
# (presumably avoids an interactive prompt - confirm against the TTS docs).
os.environ['COQUI_TOS_AGREED'] = '1'

# Punkt data used by nltk's sent_tokenize to split the text into sentences
nltk.download('punkt')

# Choose the device before loading the checkpoint: DeepSpeed inference is only
# requested when CUDA is available, so a CPU-only host no longer asks for it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the XTTS model used to generate audio based on a voice sample
model_path = "tts_model"
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=(device == 'cuda'),
)
model.to(device)

# Speaker latent (pre-computed and stored as a pickled object)
path_latents = 'assets/gpt_cond_latent.npy'
gpt_cond_latent = _load_array(path_latents)

# Speaker embedding (pre-computed and stored as a pickled object)
path_embedding = 'assets/speaker_embedding.npy'
speaker_embedding = _load_array(path_embedding)
def get_audio(text: str, language: str = 'es', saving_path: str = 'output') -> None:
    """
    Converts a text into speech and stores it as a wav file.

    The work is delegated to ``_save_audio``, which appends the ``.wav``
    extension to ``saving_path``.
    :param text: text to convert to audio
    :param language: 'es', 'en' or 'pt', language used for the audio file
    :param saving_path: path (without extension) where the audio is saved
    :return: None
    """
    _save_audio(text=text, language=language, path_audio=saving_path)
def _save_audio(text: str, language: str, path_audio: str, sample_rate: int = 24000) -> None:
    """
    Splits the text into sentences, cleans them and creates an audio for each one, then
    concatenates all the audios and saves them into ``<path_audio>.wav``.
    :param text: input text
    :param language: language used in the audio
    :param path_audio: saving path of the audio, without the ``.wav`` extension
    :param sample_rate: sample rate written to the file; the default keeps the previously
        hard-coded 24000 (presumably the XTTS output rate - confirm against the model config)
    :return: None
    :raises ValueError: if the text yields no sentence to synthesize
    """
    # Split the answer into sentences and clean it
    sentences = _get_clean_text(text, language)

    # torch.cat fails with an opaque error on an empty list, so report the real cause
    if not sentences:
        raise ValueError("The text does not contain any sentence to convert to audio")

    # Get the voice of each sentence as a tensor
    audio_segments = [torch.tensor(_get_voice(sentence, language)) for sentence in sentences]

    # Concatenate and save all audio segments
    # (assumes every segment is a 1-D waveform, so unsqueeze(0) adds the channel axis - TODO confirm)
    concatenated_audio = torch.cat(audio_segments, dim=0)
    torchaudio.save(f'{path_audio}.wav', concatenated_audio.unsqueeze(0), sample_rate)
def _get_voice(sentence: str, language: str) -> np.ndarray:
    """
    Runs the module-level model on one sentence and returns the generated waveform,
    conditioned on the module-level speaker latent and speaker embedding.
    :param sentence: input sentence
    :param language: language used in the audio
    :return: the 'wav' entry of the inference output
    """
    inference_options = {
        'language': language,
        'gpt_cond_latent': gpt_cond_latent,
        'speaker_embedding': speaker_embedding,
        'temperature': 0.1,
    }
    synthesis = model.inference(sentence, **inference_options)
    return synthesis['wav']
def _get_clean_text(text: str, language: str) -> list[str]:
    """
    Splits the text into smaller sentences using nltk and replaces links with a spoken
    placeholder.
    :param text: input text for the audio
    :param language: language used for the audio ('es', 'en', 'pt'); any other value is
        treated as 'pt', as before
    :return: list of non-empty sentences, each split further when longer than the
        per-language character limit
    """
    # Per-language settings: spoken replacement for links, maximum characters per sentence
    # and the Punkt model used by nltk to detect sentence boundaries
    if language == 'en':
        link_replacement = 'the following link'
        max_characters = 250
        tokenizer_language = 'english'
    elif language == 'es':
        link_replacement = 'el siguiente link'
        max_characters = 239
        tokenizer_language = 'spanish'
    else:
        link_replacement = 'o seguinte link'
        max_characters = 203
        tokenizer_language = 'portuguese'

    # Replace the links in the text with a sentence that can be read aloud
    clean_answer = re.sub(r'http[s]?://\S+', link_replacement, text)

    # Change the name from Bella to Bela
    clean_answer = clean_answer.replace('Bella', 'Bela')

    # Remove Florida and zipcode
    clean_answer = re.sub(r', FL \d+', "", clean_answer)

    # Split the answer into sentences with the tokenizer of the requested language (the
    # default English model mishandles Spanish/Portuguese text) and make sure they are
    # shorter than the maximum possible characters
    sentences = []
    for sentence in sent_tokenize(clean_answer, language=tokenizer_language):
        if len(sentence) > max_characters:
            sentences.extend(_split_sentence(sentence, max_characters))
        else:
            sentences.append(sentence)

    # Drop empty fragments so they are never sent to the voice model
    return [sentence for sentence in sentences if sentence.strip()]
| def _split_sentence(sentence: str, max_characters: int) -> list[str]: | |
| """ | |
| Used when the sentences are still to long. The split point is the nearest comma to the middle | |
| of the sentence, if there is no comma then a space is used or just the middle. If the | |
| remaining sentences are still too long, another iteration is run. | |
| :param sentence: sentence to be split | |
| :param max_characters: max number of characters a sentence can have | |
| :return: list of sentences | |
| """ | |
| # Get index of each comma | |
| sentences = [] | |
| commas = [i for i, c in enumerate(sentence) if c == ','] | |
| # No commas, search for spaces | |
| if len(commas) == 0: | |
| commas = [i for i, c in enumerate(sentence) if c == ' '] | |
| # No commas or spaces, split it in the middle | |
| if len(commas) == 0: | |
| sentences.append(sentence[:len(sentence) // 2]) | |
| sentences.append(sentence[len(sentence) // 2:]) | |
| return sentences | |
| # Nearest index to the middle | |
| split_point = min(commas, key=lambda x: abs(x - (len(sentence) // 2))) | |
| if sentence[split_point] == ',': | |
| left = sentence[:split_point] | |
| right = sentence[split_point + 2:] | |
| else: | |
| left = sentence[:split_point] | |
| right = sentence[split_point + 1:] | |
| if len(left) > max_characters: | |
| sentences.extend(_split_sentence(left, max_characters)) | |
| else: | |
| sentences.append(left) | |
| if len(right) > max_characters: | |
| sentences.extend(_split_sentence(right, max_characters)) | |
| else: | |
| sentences.append(right) | |
| return sentences | |