Serbian TTS
Collection
Serbian TTS
•
2 items
•
Updated
•
1
AiDA TTS HF Space *(malo strpljenja — min. 10 min. da se učita model / be patient — the model takes about 10 minutes to load)*
%%capture
# Environment setup cell (output suppressed by %%capture).
# Installs Unsloth plus the TTS-specific dependencies (SNAC vocoder, wandb).
import os
# Heuristic Colab detection: Colab sets COLAB_* environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
# Outside Colab a plain install resolves its own dependencies.
!pip install unsloth
else:
# On Colab, pin known-good versions and skip dependency resolution
# (--no-deps) to avoid clobbering Colab's preinstalled packages.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth
# SNAC: neural audio codec used to decode generated tokens into waveforms.
!pip install snac
!pip install wandb
# Upgrade datasets last so it wins over any pinned transitive version.
!pip install -U datasets
import torch
from unsloth import FastLanguageModel

# Inference configuration: dtype=None lets Unsloth auto-select the best
# precision for the detected GPU; 4-bit quantization is disabled here,
# trading VRAM for full-precision quality (flip to True to save memory).
dtype = None
load_in_4bit = False

# Load the fine-tuned Serbian TTS model and its tokenizer from the Hub.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="datatab/aida-parla-16bit-v1",
    max_seq_length=32768,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
def generate_audio_for_prompts(prompts, chosen_voice=None, model=None, tokenizer=None):
    """Generate and play back 24 kHz audio for each text prompt.

    For every prompt, the text is (optionally) prefixed with a speaker name,
    framed with model-specific marker tokens, fed through ``model.generate``,
    and the resulting audio-code tokens are decoded to a waveform with the
    SNAC codec, then displayed as an inline audio widget.

    Args:
        prompts: Iterable of text strings to synthesize.
        chosen_voice: Optional speaker name prepended as ``"{voice}: {text}"``.
        model: The loaded TTS language model (required).
        tokenizer: Its tokenizer (required).

    Raises:
        ValueError: If ``model`` or ``tokenizer`` is not supplied.
    """
    import locale
    import torch
    from snac import SNAC
    from IPython.display import display, Audio

    # Some Colab environments report a non-UTF-8 preferred encoding.
    locale.getpreferredencoding = lambda: "UTF-8"

    if model is None or tokenizer is None:
        raise ValueError("You must pass both model and tokenizer.")

    # SNAC decoding runs on CPU below, so load it there directly.
    # (The original loaded onto CUDA and immediately moved it back — a
    # wasted device transfer.)
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cpu")

    # Enable fast inference mode on the language model.
    FastLanguageModel.for_inference(model)

    def _redistribute_codes(code_list):
        """Decode a flat stream of 7-code frames into audio via SNAC.

        Each 7-token frame is split across SNAC's three hierarchical code
        layers in the pattern 1,2,3,3,2,3,3, with each slot's fixed offset
        (multiples of 4096) removed. Returns the decoded waveform tensor.
        """
        layer_1, layer_2, layer_3 = [], [], []
        # code_list length is already a multiple of 7 (trimmed by caller).
        for frame in range(len(code_list) // 7):
            base = 7 * frame
            layer_1.append(code_list[base])
            layer_2.append(code_list[base + 1] - 4096)
            layer_3.append(code_list[base + 2] - 2 * 4096)
            layer_3.append(code_list[base + 3] - 3 * 4096)
            layer_2.append(code_list[base + 4] - 4 * 4096)
            layer_3.append(code_list[base + 5] - 5 * 4096)
            layer_3.append(code_list[base + 6] - 6 * 4096)
        codes = [
            torch.tensor(layer_1).unsqueeze(0),
            torch.tensor(layer_2).unsqueeze(0),
            torch.tensor(layer_3).unsqueeze(0),
        ]
        return snac_model.decode(codes)

    for prompt in prompts:
        print(f"\n🗣 Prompt: {prompt}")
        full_prompt = f"{chosen_voice}: {prompt}" if chosen_voice else prompt
        input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids

        # Frame the prompt with the model's special marker tokens
        # (128259 / 128009, 128260 — presumably start/end-of-human markers;
        # confirm against the model card).
        start_token = torch.tensor([[128259]], dtype=torch.int64)
        end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)
        modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
        # Single unpadded sequence, so the mask is all ones.
        attention_mask = torch.ones_like(modified_input_ids)

        generated_ids = model.generate(
            input_ids=modified_input_ids.to("cuda"),
            attention_mask=attention_mask.to("cuda"),
            max_new_tokens=4096,
            do_sample=True,
            temperature=0.2,
            top_p=0.95,
            repetition_penalty=1.6,
            num_return_sequences=1,
            eos_token_id=128258,
            use_cache=True
        )

        # 128257 marks the start of the audio-code stream; keep everything
        # after its LAST occurrence. Fall back to the full output otherwise.
        audio_start_token = 128257
        terminator_token = 128258
        marker_positions = (generated_ids == audio_start_token).nonzero(as_tuple=True)
        if len(marker_positions[1]) > 0:
            last_idx = marker_positions[1][-1].item()
            cropped = generated_ids[:, last_idx + 1:]
        else:
            cropped = generated_ids

        # Drop terminator tokens (this also flattens to 1-D), then trim to a
        # whole number of 7-code SNAC frames and remove the code-id offset.
        cropped = cropped[cropped != terminator_token]
        usable_len = (cropped.size(0) // 7) * 7
        code_list = [t - 128266 for t in cropped[:usable_len]]

        try:
            samples = _redistribute_codes(code_list)
            display(Audio(samples.detach().squeeze().to("cpu").numpy(), rate=24000))
            # Free the decoded waveform before synthesizing the next prompt.
            del samples
        except Exception as e:
            # BUG FIX: the original executed `del samples` unconditionally
            # after this handler, raising NameError (and masking the real
            # error) whenever decoding failed. Cleanup now happens only on
            # the success path above.
            print("❌ Error decoding audio:", str(e))
# Sample Serbian sentences to synthesize.
prompts = [
    "Novi Sad je Evropska prestonica kulture,posle Beograda,drugi grad u Srbiji po broju stanovnika.",
    "Kragujevac je gradsko naselje i sedište istoimene teritorijalne jedinice u Srbiji.",
    "Gost emisije Među nama bio je Robert Kozma iz Zeleno levog fronta.",
    "On je čovek brojnih zanimanja, kojima je zajednička kreativnost. On je i satiričar i glumac i muzičar, imitator.",
]

# Available voices: "mila", "senka", "judita", "saska", "goga", "alek",
# "arsa", "janko", "bora", "zoki"
chosen_voice = "alek"

generate_audio_for_prompts(prompts, chosen_voice, model=model, tokenizer=tokenizer)