|
|
--- |
|
|
library_name: transformers |
|
|
license: apache-2.0 |
|
|
language: |
|
|
- en |
|
|
pipeline_tag: text-to-speech |
|
|
--- |
|
|
|
|
|
This repository contains the models from https://github.com/taylorchu/2cent-tts, converted to `.safetensors`. Example inference code:
|
|
|
|
|
```python |
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
import torch |
|
|
|
|
|
# Load the tokenizer and the converted checkpoint from the same Hub repository.
tokenizer = AutoTokenizer.from_pretrained("timBoML/2cent-tts-60m")
model = AutoModelForCausalLM.from_pretrained("timBoML/2cent-tts-60m")

# Phoneme transcription of the text to synthesize.
phones = "həlˈoʊ aɪɐm tˈuː sˈɛnt tˌiːtˌiːˈɛs"  # using espeak-ng

# Prompt layout: phoneme ids, then the ids of "<s>", then the literal id 4136.
# NOTE(review): 4136 is presumably the marker that starts audio generation;
# confirm against the upstream 2cent-tts repository.
prompt_ids = tokenizer.encode(phones, add_special_tokens=False)
prompt_ids += tokenizer.encode("<s>", add_special_tokens=False)
prompt_ids.append(4136)

# Wrap in an outer list to get a batch dimension of 1.
input_ids = torch.tensor([prompt_ids])

# Generate, then drop the batch dimension so the result is a flat sequence.
generated_ids = model.generate(
    input_ids=input_ids,
    max_new_tokens=2048,
).squeeze()

# Id of "<audio_0>"; the last element is taken in case the tokenizer
# prepends special tokens when encoding.
first_audio_token = tokenizer.encode("<audio_0>")[-1]

# Keep only what was generated after the prompt, re-based so that
# "<audio_0>" maps to code 0.
prompt_length = input_ids.shape[1]
tokens = generated_ids[prompt_length:] - first_audio_token
|
|
|
|
|
|
|
|
import locale |
|
|
import torchaudio.transforms as T |
|
|
import os |
|
|
import torch |
|
|
from snac import SNAC |
|
|
def _utf8_preferred_encoding():
    """Report UTF-8 as the preferred encoding regardless of the host locale."""
    return "UTF-8"


# Override the locale lookup so later code that queries it always sees UTF-8.
locale.getpreferredencoding = _utf8_preferred_encoding

# Pretrained SNAC codec used to turn the generated codes back into audio.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
|
|
|
|
|
def redistribute_codes(code_list):
    """Split a flat code stream into three layers and decode it with SNAC.

    The stream is consumed in frames of 7 codes. Within each frame,
    position 0 goes to the first layer, positions 1 and 4 go to the second
    layer, and positions 2, 3, 5 and 6 go to the third layer. Trailing
    codes that do not fill a complete frame are ignored.

    Args:
        code_list: Indexable sequence of integer codes (for example a list
            or a 1-D tensor) supporting ``len()`` and integer indexing.

    Returns:
        The result of the module-level ``snac_model.decode`` called on the
        three layers, each given a leading batch dimension of 1.
    """
    # Count only complete frames. The earlier ``(len + 1) // 7`` counted one
    # frame too many when ``len % 7 == 6`` and then indexed past the end.
    num_frames = len(code_list) // 7

    layer_1 = []
    layer_2 = []
    layer_3 = []
    for i in range(num_frames):
        base = 7 * i
        layer_1.append(code_list[base])
        layer_2.append(code_list[base + 1])
        layer_3.append(code_list[base + 2])
        layer_3.append(code_list[base + 3])
        layer_2.append(code_list[base + 4])
        layer_3.append(code_list[base + 5])
        layer_3.append(code_list[base + 6])

    # One tensor per layer, each with a leading batch dimension of 1.
    codes = [
        torch.tensor(layer_1).unsqueeze(0),
        torch.tensor(layer_2).unsqueeze(0),
        torch.tensor(layer_3).unsqueeze(0),
    ]
    audio_hat = snac_model.decode(codes)
    return audio_hat
|
|
|
|
|
# Decode the generated codes into an audio tensor.
sample = redistribute_codes(tokens)

from IPython.display import Audio, display

# Move the audio to a flat CPU NumPy array and play it inline at 24 kHz.
waveform = sample.detach().squeeze().to("cpu").numpy()
display(Audio(waveform, rate=24000))
|
|
``` |