---
library_name: transformers
license: apache-2.0
language:
- en
pipeline_tag: text-to-speech
---
This repo contains the models from https://github.com/taylorchu/2cent-tts converted to .safetensors. Inference code:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
model = AutoModelForCausalLM.from_pretrained("timBoML/2cent-tts-60m")
tokenizer = AutoTokenizer.from_pretrained("timBoML/2cent-tts-60m")

# Build the prompt: espeak-ng IPA phonemes, then <s>, then token id 4136.
phones = "həlˈoʊ aɪɐm tˈuː sˈɛnt tˌiːtˌiːˈɛs"  # produced with espeak-ng
input_ids = (
    tokenizer.encode(phones, add_special_tokens=False)
    + tokenizer.encode("<s>", add_special_tokens=False)
    + [4136]
)
input_ids = torch.tensor(input_ids).unsqueeze(0)

generated_ids = model.generate(
    input_ids=input_ids,
    max_new_tokens=2048,
)

# Keep only the newly generated tokens and shift them into SNAC code space.
generated_ids = generated_ids.squeeze()
tokens = generated_ids[input_ids.shape[1]:]
first_audio_token = tokenizer.encode("<audio_0>")[-1]
tokens = (tokens - first_audio_token).tolist()

# Decode the SNAC codes back to a 24 kHz waveform.
from snac import SNAC
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
def redistribute_codes(code_list):
    # Each frame of 7 codes is split across SNAC's three codebook layers:
    # 1 code for layer 1, 2 codes for layer 2, 4 codes for layer 3.
    layer_1 = []
    layer_2 = []
    layer_3 = []
    for i in range(len(code_list) // 7):  # complete 7-code frames only
        layer_1.append(code_list[7 * i])
        layer_2.append(code_list[7 * i + 1])
        layer_3.append(code_list[7 * i + 2])
        layer_3.append(code_list[7 * i + 3])
        layer_2.append(code_list[7 * i + 4])
        layer_3.append(code_list[7 * i + 5])
        layer_3.append(code_list[7 * i + 6])
    codes = [
        torch.tensor(layer_1).unsqueeze(0),
        torch.tensor(layer_2).unsqueeze(0),
        torch.tensor(layer_3).unsqueeze(0),
    ]
    audio_hat = snac_model.decode(codes)
    return audio_hat
sample = redistribute_codes(tokens)
from IPython.display import Audio, display
display(Audio(sample.detach().squeeze().to("cpu").numpy(), rate=24000))
```
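
The phoneme string in the example was produced with espeak-ng. One way to phonemize arbitrary English text is the `phonemizer` package with its espeak-ng backend; this is a sketch rather than part of the upstream repo, and whether its IPA output exactly matches the conventions the model was trained on should be checked against https://github.com/taylorchu/2cent-tts:

```python
# Sketch: text -> IPA phonemes via the phonemizer package (espeak-ng backend).
# Requires `pip install phonemizer` and an espeak-ng system install.
from phonemizer import phonemize

text = "Hello, I am 2 cent TTS"
phones = phonemize(
    text,
    language="en-us",
    backend="espeak",
    with_stress=True,           # keep the stress marks (ˈ, ˌ) used in the example above
    preserve_punctuation=True,
    strip=True,
)
print(phones)  # pass this string to tokenizer.encode(...) as in the example above
```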
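
The last two lines play the result in a notebook. Outside a notebook you can write the waveform to disk instead; a minimal sketch using torchaudio (the output file name is arbitrary):

```python
import torchaudio

# `sample` is the SNAC decoder output at 24 kHz; torchaudio.save expects (channels, frames).
audio = sample.detach().squeeze().cpu()            # -> (num_samples,)
torchaudio.save("output.wav", audio.unsqueeze(0), 24000)
```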