HF SPACE

AiDA TTS HF Space (malo strpljenja — potrebno je min. 10 min. da se učita model)

Inference kod (google colab):

%%capture
# Install dependencies. Outside Colab a plain `pip install unsloth` suffices;
# on Colab, install pinned packages with --no-deps first so unsloth does not
# clobber Colab's preinstalled torch/CUDA stack.
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
# SNAC neural audio codec (the vocoder used to turn codes into waveforms),
# Weights & Biases logging, and an up-to-date datasets release.
!pip install snac
!pip install wandb
!pip install -U datasets
from unsloth import FastLanguageModel
import torch

# dtype=None lets unsloth auto-detect the best precision for the current GPU.
dtype = None
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

# Download and load the fine-tuned Serbian TTS model plus its tokenizer.
# NOTE(review): first load can take ~10 minutes (large 4B-param checkpoint).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "datatab/aida-parla-16bit-v1",
    max_seq_length=32768,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
def generate_audio_for_prompts(prompts, chosen_voice=None, model=None, tokenizer=None):
    """Synthesize and display 24 kHz audio for each text prompt.

    For every prompt, the language model generates SNAC audio codes
    (7 codes per frame), which are then decoded to a waveform with the
    SNAC vocoder and rendered as an inline IPython audio player.

    Args:
        prompts: Iterable of text strings to synthesize.
        chosen_voice: Optional speaker name; prepended as "voice: prompt".
        model: The unsloth/transformers causal LM (required).
        tokenizer: The matching tokenizer (required).

    Raises:
        ValueError: If ``model`` or ``tokenizer`` is not provided.
    """
    import locale
    import torch
    from snac import SNAC
    from IPython.display import display, Audio

    locale.getpreferredencoding = lambda: "UTF-8"

    if model is None or tokenizer is None:
        raise ValueError("You must pass both model and tokenizer.")

    # Decoding happens on CPU, so load SNAC there directly instead of
    # loading to CUDA and immediately moving it (avoids wasted GPU memory).
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cpu")

    # Enable unsloth's fast inference path.
    FastLanguageModel.for_inference(model)

    # Special token ids in the model's extended vocabulary.
    # NOTE(review): values taken verbatim from the original script — assumed
    # to match this checkpoint's tokenizer; confirm against the model card.
    START_TOKEN = 128259            # start-of-input marker
    END_TOKENS = [128009, 128260]   # end-of-text / end-of-input markers
    AUDIO_START = 128257            # marks where audio codes begin
    AUDIO_EOS = 128258              # end-of-speech token
    CODE_OFFSET = 128266            # audio codes are shifted by this offset

    def _decode_codes(code_list):
        """Split a flat 7-codes-per-frame list into SNAC's 3 coarse-to-fine
        layers (pattern: L1, L2, L3, L3, L2, L3, L3) and decode to audio."""
        layer_1, layer_2, layer_3 = [], [], []
        # len // 7 (not (len+1) // 7): the original +1 would index past the
        # end for inputs of length 7k+6; callers always pass multiples of 7.
        for i in range(len(code_list) // 7):
            base = 7 * i
            layer_1.append(code_list[base])
            layer_2.append(code_list[base + 1] - 4096)
            layer_3.append(code_list[base + 2] - 2 * 4096)
            layer_3.append(code_list[base + 3] - 3 * 4096)
            layer_2.append(code_list[base + 4] - 4 * 4096)
            layer_3.append(code_list[base + 5] - 5 * 4096)
            layer_3.append(code_list[base + 6] - 6 * 4096)
        codes = [
            torch.tensor(layer_1).unsqueeze(0),
            torch.tensor(layer_2).unsqueeze(0),
            torch.tensor(layer_3).unsqueeze(0),
        ]
        return snac_model.decode(codes)

    for prompt in prompts:
        print(f"\n🗣 Prompt: {prompt}")

        full_prompt = f"{chosen_voice}: {prompt}" if chosen_voice else prompt
        input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids

        start = torch.tensor([[START_TOKEN]], dtype=torch.int64)
        end = torch.tensor([END_TOKENS], dtype=torch.int64)
        model_input = torch.cat([start, input_ids, end], dim=1)
        attention_mask = torch.ones_like(model_input)

        model_input = model_input.to("cuda")
        attention_mask = attention_mask.to("cuda")

        generated_ids = model.generate(
            input_ids=model_input,
            attention_mask=attention_mask,
            max_new_tokens=4096,
            do_sample=True,
            temperature=0.2,
            top_p=0.95,
            repetition_penalty=1.6,
            num_return_sequences=1,
            eos_token_id=AUDIO_EOS,
            use_cache=True,
        )

        # Keep only tokens after the LAST audio-start marker, drop the EOS
        # token, and trim to whole 7-code frames before un-offsetting.
        marker_positions = (generated_ids == AUDIO_START).nonzero(as_tuple=True)
        if len(marker_positions[1]) > 0:
            last_idx = marker_positions[1][-1].item()
            cropped = generated_ids[:, last_idx + 1:]
        else:
            cropped = generated_ids

        flat = cropped[cropped != AUDIO_EOS]  # boolean mask flattens to 1-D
        usable = (flat.size(0) // 7) * 7
        code_list = [t - CODE_OFFSET for t in flat[:usable]]

        try:
            samples = _decode_codes(code_list)
            display(Audio(samples.detach().squeeze().to("cpu").numpy(), rate=24000))
            # Release the decoded waveform before the next prompt. Deleting
            # here (inside the try, after use) fixes the original's NameError
            # when prompts was empty or the final decode failed.
            del samples
        except Exception as e:
            print("❌ Error decoding audio:", str(e))
# Example Serbian sentences to synthesize.
prompts = [
    "Novi Sad je Evropska prestonica kulture,posle Beograda,drugi grad u Srbiji po broju stanovnika.",
    "Kragujevac je gradsko naselje i sedište istoimene teritorijalne jedinice u Srbiji.",
    "Gost emisije Među nama bio je Robert Kozma iz Zeleno levog fronta.",
    "On je čovek brojnih zanimanja, kojima je zajednička kreativnost. On je i satiričar i glumac i muzičar, imitator.",
]

chosen_voice = "alek"  # voices: "mila", "senka", "judita", "saska", "goga", "alek", "arsa", "janko", "bora", "zoki"
generate_audio_for_prompts(prompts, chosen_voice, model=model, tokenizer=tokenizer)
Downloads last month
12
Safetensors
Model size
4B params
Tensor type
BF16
·
F16
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Model tree for datatab/aida-parla-16bit-v1

Quantizations
1 model

Collection including datatab/aida-parla-16bit-v1