---
library_name: transformers
license: apache-2.0
language:
- en
pipeline_tag: text-to-speech
---

Converted the models from https://github.com/taylorchu/2cent-tts to `.safetensors`. Below is inference code:

```python
import locale

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from snac import SNAC

# Colab/Jupyter workaround: force UTF-8 as the preferred encoding.
locale.getpreferredencoding = lambda: "UTF-8"

model = AutoModelForCausalLM.from_pretrained("timBoML/2cent-tts-25m")
tokenizer = AutoTokenizer.from_pretrained("timBoML/2cent-tts-25m")

# Phonemized input, produced with espeak-ng (IPA).
phones = "həlˈoʊ aɪɐm tˈuː sˈɛnt tˌiːtˌiːˈɛs"

# NOTE(review): the empty-string encode() calls below look like special
# tokens that were stripped when this card was rendered — confirm the
# intended token strings against the upstream repo before relying on them.
input_ids = (
    tokenizer.encode(phones, add_special_tokens=False)
    + tokenizer.encode("", add_special_tokens=False)
    + [4136]
)
input_ids = torch.tensor(input_ids).unsqueeze(0)

generated_ids = model.generate(
    input_ids=input_ids,
    max_new_tokens=2048,
)
generated_ids = generated_ids.squeeze()

# Keep only the newly generated tokens and shift them into SNAC code space.
tokens = generated_ids[input_ids.shape[1]:]
first_audio_token = tokenizer.encode("")[-1]
tokens = tokens - first_audio_token

snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")


def redistribute_codes(code_list):
    """De-interleave a flat 7-codes-per-frame stream into SNAC's 3 codebook
    layers and decode it to a waveform tensor.

    Per 7-token frame the layout is:
    [L1, L2, L3, L3, L2, L3, L3].
    """
    layer_1 = []
    layer_2 = []
    layer_3 = []
    # Use len // 7 (NOT (len + 1) // 7): if generation stopped mid-frame
    # (e.g. cut off by max_new_tokens), the rounded-up count would index
    # past the end of code_list and raise IndexError. Incomplete trailing
    # frames are dropped instead.
    for i in range(len(code_list) // 7):
        base = 7 * i
        layer_1.append(code_list[base])
        layer_2.append(code_list[base + 1])
        layer_3.append(code_list[base + 2])
        layer_3.append(code_list[base + 3])
        layer_2.append(code_list[base + 4])
        layer_3.append(code_list[base + 5])
        layer_3.append(code_list[base + 6])
    codes = [
        torch.tensor(layer_1).unsqueeze(0),
        torch.tensor(layer_2).unsqueeze(0),
        torch.tensor(layer_3).unsqueeze(0),
    ]
    return snac_model.decode(codes)


sample = redistribute_codes(tokens)

from IPython.display import Audio, display

display(Audio(sample.detach().squeeze().to("cpu").numpy(), rate=24000))
```