File size: 1,911 Bytes
9415f0c
 
5fbbfb7
 
 
 
9415f0c
 
c2236e4
9415f0c
1d032ff
c2236e4
 
9415f0c
c2236e4
 
 
 
9415f0c
c2236e4
9415f0c
c2236e4
 
 
 
 
9415f0c
c2236e4
9415f0c
c2236e4
 
 
9415f0c
c2236e4
9415f0c
c2236e4
9415f0c
c2236e4
9415f0c
c2236e4
9415f0c
c2236e4
9415f0c
 
c2236e4
 
 
 
 
 
9415f0c
c2236e4
9415f0c
c2236e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9415f0c
c2236e4
9415f0c
c2236e4
 
5fbbfb7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
---
library_name: transformers
license: apache-2.0
language:
- en
pipeline_tag: text-to-speech
---

Converted the models from https://github.com/taylorchu/2cent-tts to the `.safetensors` format. Below is the inference code:

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the converted checkpoint and its tokenizer from the Hub.
MODEL_ID = "timBoML/2cent-tts-25m"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# IPA phoneme string produced with espeak-ng ("hello, I am 2cent TTS").
phones = "həlˈoʊ aɪɐm tˈuː sˈɛnt tˌiːtˌiːˈɛs" # using espeak-ng

# Prompt layout: phoneme ids, then the <s> separator, then fixed token 4136
# (presumably a voice/start-of-audio marker — verify against the upstream repo).
phone_ids = tokenizer.encode(phones, add_special_tokens=False)
separator_ids = tokenizer.encode("<s>", add_special_tokens=False)
prompt = phone_ids + separator_ids + [4136]

input_ids = torch.tensor(prompt).unsqueeze(0)  # add batch dimension

# Autoregressively generate audio tokens continuing the prompt.
generated_ids = model.generate(input_ids=input_ids, max_new_tokens=2048)

# Drop the batch dimension and keep only the newly generated tail.
generated_ids = generated_ids.squeeze()
prompt_length = input_ids.shape[1]
tokens = generated_ids[prompt_length:]

# Re-base the audio tokens so that <audio_0> maps to code 0.
first_audio_token = tokenizer.encode("<audio_0>")[-1]
tokens = tokens - first_audio_token


import locale
import torchaudio.transforms as T
import os
import torch
from snac import SNAC
# NOTE(review): common Colab workaround forcing UTF-8 as the preferred
# encoding — confirm it is still needed outside that environment.
locale.getpreferredencoding = lambda: "UTF-8"

# SNAC neural audio codec (24 kHz variant) used to turn the generated
# code stream back into a waveform.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")

def redistribute_codes(code_list):
  """Unflatten a stream of SNAC codes and decode them to audio.

  The model emits audio tokens as flat frames of 7 codes each, interleaved
  across SNAC's three codebook layers in the order
  (L1, L2, L3, L3, L2, L3, L3). This regroups them per layer and runs the
  SNAC decoder.

  Args:
    code_list: 1-D sequence (list or tensor) of rebased audio codes.

  Returns:
    The waveform tensor returned by ``snac_model.decode``.
  """
  layer_1 = []
  layer_2 = []
  layer_3 = []
  # Iterate over whole frames only: the original bound (len+1)//7 read one
  # element past the end (IndexError) whenever len(code_list) % 7 == 6.
  # A trailing partial frame is now dropped instead.
  for i in range(len(code_list) // 7):
    frame = code_list[7 * i : 7 * i + 7]
    layer_1.append(frame[0])
    layer_2.append(frame[1])
    layer_3.append(frame[2])
    layer_3.append(frame[3])
    layer_2.append(frame[4])
    layer_3.append(frame[5])
    layer_3.append(frame[6])
  codes = [torch.tensor(layer_1).unsqueeze(0),
           torch.tensor(layer_2).unsqueeze(0),
           torch.tensor(layer_3).unsqueeze(0)]
  audio_hat = snac_model.decode(codes)
  return audio_hat

# Decode the generated token stream to a 24 kHz waveform and play it inline.
sample = redistribute_codes(tokens)

from IPython.display import Audio, display
waveform = sample.detach().squeeze().to("cpu").numpy()
display(Audio(waveform, rate=24000))
```