---
library_name: transformers
license: apache-2.0
language:
- en
pipeline_tag: text-to-speech
---
The model weights from https://github.com/taylorchu/2cent-tts have been converted to the `.safetensors` format. Below is example inference code:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load the converted 25M-parameter 2cent-TTS checkpoint as a causal LM.
model = AutoModelForCausalLM.from_pretrained(
"timBoML/2cent-tts-25m"
)
tokenizer = AutoTokenizer.from_pretrained("timBoML/2cent-tts-25m")
# Input is IPA phonemes, produced externally (e.g. with espeak-ng),
# not raw text.
phones = "həlˈoʊ aɪɐm tˈuː sˈɛnt tˌiːtˌiːˈɛs" # using espeak-ng
# Prompt layout: phoneme ids + "<s>" separator + a fixed start id (4136).
# NOTE(review): 4136 is presumably the audio-start/speaker token for this
# checkpoint — confirm against the upstream repo's tokenizer config.
input_ids = (
tokenizer.encode(phones, add_special_tokens=False)
+ tokenizer.encode("<s>", add_special_tokens=False)
+ [4136]
)
# Add a batch dimension: shape (1, prompt_len).
input_ids = torch.tensor(input_ids).unsqueeze(0)
generated_ids = model.generate(
input_ids=input_ids,
max_new_tokens=2048,
)
generated_ids = generated_ids.squeeze()
# Keep only the newly generated tokens (drop the echoed prompt).
tokens = generated_ids[input_ids.shape[1]:]
# Shift token ids into SNAC code space: "<audio_0>" marks code 0.
# NOTE(review): `tokens` may still contain non-audio ids such as EOS,
# which would go negative after this subtraction — verify before decoding.
first_audio_token = tokenizer.encode("<audio_0>")[-1]
tokens = tokens - first_audio_token
import locale
import torchaudio.transforms as T
import os
import torch
from snac import SNAC
# Workaround for environments (e.g. Colab) whose preferred encoding breaks
# downloads; forces UTF-8 regardless of system locale.
locale.getpreferredencoding = lambda: "UTF-8"
# 24 kHz SNAC neural audio codec used to decode the generated codes.
# NOTE(review): `torchaudio.transforms` (T) and `os` appear unused in this
# snippet, and `torch` was already imported above — candidates for removal.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
def redistribute_codes(code_list):
    """Re-interleave a flat stream of SNAC codes into the codec's three
    codebook layers and decode them to audio.

    The model emits codes in frames of 7. Within each frame the positions
    map to layers as: position 0 -> layer 1; positions 1 and 4 -> layer 2;
    positions 2, 3, 5, 6 -> layer 3.

    Args:
        code_list: Flat sequence of integer audio codes (list or 1-D
            tensor), already shifted into SNAC code space.

    Returns:
        The waveform tensor produced by ``snac_model.decode``.
    """
    layer_1 = []
    layer_2 = []
    layer_3 = []
    # BUG FIX: the original used range((len(code_list)+1)//7), which counts
    # one frame too many whenever len(code_list) % 7 == 6 and then raises
    # IndexError at code_list[7*i+6]. Floor division only processes
    # complete 7-code frames, silently dropping a trailing partial frame.
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7*i])
        layer_2.append(code_list[7*i+1])
        layer_3.append(code_list[7*i+2])
        layer_3.append(code_list[7*i+3])
        layer_2.append(code_list[7*i+4])
        layer_3.append(code_list[7*i+5])
        layer_3.append(code_list[7*i+6])
    # SNAC expects a list of three (1, n) code tensors, one per layer.
    codes = [torch.tensor(layer_1).unsqueeze(0),
             torch.tensor(layer_2).unsqueeze(0),
             torch.tensor(layer_3).unsqueeze(0)]
    audio_hat = snac_model.decode(codes)
    return audio_hat
# Decode the generated SNAC codes to a 24 kHz waveform tensor.
sample = redistribute_codes(tokens)
# Play inline in a Jupyter/Colab notebook; outside a notebook, save the
# array with e.g. torchaudio or soundfile instead.
from IPython.display import Audio, display
display(Audio(sample.detach().squeeze().to("cpu").numpy(), rate=24000))
```