File size: 1,911 Bytes
7ce0ce5
 
62687e3
 
 
 
7ce0ce5
 
f8ef104
7ce0ce5
25d6c49
f8ef104
 
7ce0ce5
f8ef104
 
 
 
7ce0ce5
f8ef104
7ce0ce5
f8ef104
 
 
 
 
7ce0ce5
f8ef104
7ce0ce5
f8ef104
 
 
7ce0ce5
f8ef104
7ce0ce5
f8ef104
7ce0ce5
f8ef104
7ce0ce5
f8ef104
7ce0ce5
f8ef104
7ce0ce5
 
f8ef104
 
 
 
 
 
7ce0ce5
f8ef104
7ce0ce5
f8ef104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ce0ce5
f8ef104
7ce0ce5
f8ef104
 
62687e3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
---
library_name: transformers
license: apache-2.0
language:
- en
pipeline_tag: text-to-speech
---

Converted the models from https://github.com/taylorchu/2cent-tts to the `.safetensors` format. Below is example inference code:

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Download the converted 60M-parameter TTS model and its tokenizer from the Hub.
model = AutoModelForCausalLM.from_pretrained(
    "timBoML/2cent-tts-60m"
)
tokenizer = AutoTokenizer.from_pretrained("timBoML/2cent-tts-60m")

# Input is IPA phonemes, not raw text; produce them with espeak-ng beforehand.
phones = "həlˈoʊ aɪɐm tˈuː sˈɛnt tˌiːtˌiːˈɛs" # using espeak-ng

# Prompt layout: phoneme tokens, then the <s> separator, then token 4136
# (presumably a speaker/voice or start-of-audio id — TODO confirm against the
# upstream repo).
input_ids = (
    tokenizer.encode(phones, add_special_tokens=False)
    + tokenizer.encode("<s>", add_special_tokens=False)
    + [4136]
)

# Add a batch dimension: shape (1, prompt_len).
input_ids = torch.tensor(input_ids).unsqueeze(0)

generated_ids = model.generate(
      input_ids=input_ids,
      max_new_tokens=2048,

  )

# Drop the batch dimension again.
generated_ids = generated_ids.squeeze()

# Keep only the newly generated audio tokens (everything after the prompt).
tokens = generated_ids[input_ids.shape[1]:]

# The audio vocabulary starts at the id of "<audio_0>"; subtracting it maps
# the generated ids into raw SNAC codebook indices.
first_audio_token = tokenizer.encode("<audio_0>")[-1]

tokens = tokens - first_audio_token


import locale
import torchaudio.transforms as T
import os
import torch
from snac import SNAC
# Force UTF-8 as the preferred encoding (notebook environments sometimes
# default to ASCII, which breaks the Hub download of the SNAC weights).
locale.getpreferredencoding = lambda: "UTF-8"

# 24 kHz SNAC neural audio codec used to decode the generated codes to audio.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")

def redistribute_codes(code_list):
  """Decode a flat stream of SNAC codes into an audio waveform.

  The model emits 7 codes per audio frame, interleaved across the three
  SNAC codebook layers in the fixed order
  ``[L1, L2, L3, L3, L2, L3, L3]`` (1 coarse, 2 medium, 4 fine codes).
  This de-interleaves them into the three per-layer sequences expected by
  ``snac_model.decode``.

  Args:
    code_list: 1-D sequence of integer codes (list or 1-D torch tensor),
      already offset so values are raw codebook indices.

  Returns:
    The decoded audio tensor from ``snac_model.decode``.
  """
  layer_1 = []
  layer_2 = []
  layer_3 = []
  # Floor division: generation may stop mid-frame, and the original
  # (len + 1) // 7 over-counts by one full frame when len % 7 == 6,
  # which raises IndexError below. Dropping the trailing partial frame
  # is the safe behavior; results are identical for all other lengths.
  for i in range(len(code_list) // 7):
    layer_1.append(code_list[7*i])
    layer_2.append(code_list[7*i+1])
    layer_3.append(code_list[7*i+2])
    layer_3.append(code_list[7*i+3])
    layer_2.append(code_list[7*i+4])
    layer_3.append(code_list[7*i+5])
    layer_3.append(code_list[7*i+6])
  # Each layer gets a leading batch dimension of 1 for the decoder.
  codes = [torch.tensor(layer_1).unsqueeze(0),
         torch.tensor(layer_2).unsqueeze(0),
         torch.tensor(layer_3).unsqueeze(0)]
  audio_hat = snac_model.decode(codes)
  return audio_hat

# Decode the generated SNAC codes into a waveform tensor.
sample = redistribute_codes(tokens)

# Play the result inline (Jupyter/Colab); SNAC 24khz model => 24 kHz sample rate.
from IPython.display import Audio, display
display(Audio(sample.detach().squeeze().to("cpu").numpy(), rate=24000))
```