Please update model card and upload the sample codes.

#1
by jarvishacks - opened

Please update model card and upload the sample codes.


import torch

from snac import SNAC

import onnx
import onnxruntime 
import onnxruntime_genai as og
import time
import json
# Preload CUDA runtime DLLs for ONNX Runtime from a local CUDA install.
# NOTE(review): likely unnecessary when a CUDA-enabled torch build is
# installed, since torch already bundles the CUDA DLLs — confirm before removing.
onnxruntime.preload_dlls(directory="D:\\Cuda\\12\\bin")

import soundfile as sf

# Initialize SNAC decoder: downloads/loads the 24 kHz SNAC vocoder from the
# Hugging Face Hub, puts it in eval mode, and moves it to the GPU.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().cuda()



# def generate_speech(text, speaker="kavya", temperature=0.4, top_p=0.9):
#     """Generate speech from text using specified speaker voice"""
#
#     # Prepare input with speaker token
#     prompt = f"<spk_{speaker}> {text}"
#     prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
#
#     # Construct full sequence: [HUMAN] <spk_speaker> text [/HUMAN] [AI] [SPEECH]
#     input_tokens = [
#         START_OF_HUMAN_TOKEN,
#         *prompt_tokens,
#         END_OF_HUMAN_TOKEN,
#         START_OF_AI_TOKEN,
#         START_OF_SPEECH_TOKEN
#     ]
#
#     input_ids = torch.tensor([input_tokens], device=model.device)
#
#     # Calculate max tokens based on text length
#     max_tokens = min(int(len(text) * 1.3) * 7 + 21, 700)
#
#     # Generate audio tokens
#     with torch.no_grad():
#         output = model.generate(
#             input_ids,
#             max_new_tokens=max_tokens,
#             do_sample=True,
#             temperature=temperature,
#             top_p=top_p,
#             repetition_penalty=1.05,
#             pad_token_id=tokenizer.pad_token_id,
#             eos_token_id=[END_OF_SPEECH_TOKEN, END_OF_AI_TOKEN]
#         )
#
#     # Extract SNAC tokens
#     generated_ids = output[0][len(input_tokens):].tolist()
#     snac_tokens = [
#         token_id for token_id in generated_ids
#         if AUDIO_CODE_BASE_OFFSET <= token_id < (AUDIO_CODE_BASE_OFFSET + 7 * 4096)
#     ]
#
#     if not snac_tokens:
#         raise ValueError("No audio tokens generated")
#
#     # Decode audio
#     audio = decode_snac_tokens(snac_tokens, snac_model)
#     return audio

def decode_snac_tokens(snac_tokens, snac_model):
    """De-interleave a flat stream of LLM audio-code ids and decode to audio.

    Args:
        snac_tokens: Flat list of token ids. Each group of 7 consecutive
            tokens is one SNAC frame, interleaved as
            [lvl0, lvl1, lvl2, lvl2, lvl1, lvl2, lvl2], where the token at
            interleave position ``i`` is offset by
            ``AUDIO_CODE_BASE_OFFSET + i * 4096``.
        snac_model: A loaded SNAC model whose ``decode`` accepts the three
            hierarchical code tensors.

    Returns:
        A numpy float array of audio samples clamped to [-1, 1], or ``None``
        when the input is empty or its length is not a multiple of 7.

    Raises:
        ValueError: If any de-offset code falls outside [0, 4095].
    """
    # NOTE: in the original, the docstring was placed after a debug print,
    # making it a dead string expression rather than a docstring; the print
    # has been removed and the docstring moved to the top.
    if not snac_tokens or len(snac_tokens) % 7 != 0:
        return None

    # Run on whatever device the SNAC model lives on (CPU or CUDA).
    snac_device = next(snac_model.parameters()).device

    # De-interleave tokens into SNAC's 3 hierarchical levels.
    codes_lvl = [[] for _ in range(3)]
    llm_codebook_offsets = [AUDIO_CODE_BASE_OFFSET + i * 4096 for i in range(7)]

    for i in range(0, len(snac_tokens), 7):
        # Level 0: coarse (1 token per frame)
        codes_lvl[0].append(snac_tokens[i] - llm_codebook_offsets[0])
        # Level 1: medium (2 tokens per frame)
        codes_lvl[1].append(snac_tokens[i + 1] - llm_codebook_offsets[1])
        codes_lvl[1].append(snac_tokens[i + 4] - llm_codebook_offsets[4])
        # Level 2: fine (4 tokens per frame)
        codes_lvl[2].append(snac_tokens[i + 2] - llm_codebook_offsets[2])
        codes_lvl[2].append(snac_tokens[i + 3] - llm_codebook_offsets[3])
        codes_lvl[2].append(snac_tokens[i + 5] - llm_codebook_offsets[5])
        codes_lvl[2].append(snac_tokens[i + 6] - llm_codebook_offsets[6])

    # Convert each level to a (1, T) int32 tensor, validating the code range.
    hierarchical_codes = []
    for lvl_codes in codes_lvl:
        tensor = torch.tensor(lvl_codes, dtype=torch.int32, device=snac_device).unsqueeze(0)
        if torch.any((tensor < 0) | (tensor > 4095)):
            raise ValueError("Invalid SNAC token values")
        hierarchical_codes.append(tensor)

    # Decode with SNAC (inference only, no autograd graph).
    with torch.no_grad():
        audio_hat = snac_model.decode(hierarchical_codes)

    return audio_hat.squeeze().clamp(-1, 1).cpu().numpy()

# Control token IDs (fixed for Veena)
START_OF_SPEECH_TOKEN = 128257
END_OF_SPEECH_TOKEN = 128258
START_OF_HUMAN_TOKEN = 128259
END_OF_HUMAN_TOKEN = 128260
START_OF_AI_TOKEN = 128261
END_OF_AI_TOKEN = 128262
# First id of the 7 interleaved SNAC codebooks (4096 codes each); ids in
# [AUDIO_CODE_BASE_OFFSET, AUDIO_CODE_BASE_OFFSET + 7*4096) are audio codes.
AUDIO_CODE_BASE_OFFSET = 128266

# Available speakers
speakers: list[str] = ["kavya", "agastya", "maitri", "vinaya"]

def main():
    """Generate speech with the Veena ONNX model and write it to a WAV file.

    Builds a prompt with control tokens and a speaker tag, runs
    onnxruntime-genai generation on the CUDA provider, filters the generated
    ids down to SNAC audio codes, decodes them with the SNAC vocoder, and
    writes 24 kHz audio to disk.
    """
    text =  "आज मैंने एक नई तकनीक के बारे में सीखा जो कृत्रिम बुद्धिमत्ता का उपयोग करके मानव जैसी आवाज़ उत्पन्न कर सकती है।"

    # Control tokens written as <custom_token_N> literals; presumably they map
    # to the START_OF_HUMAN/END_OF_HUMAN/START_OF_AI/START_OF_SPEECH ids
    # (128259/128260/128261/128257) — TODO confirm against the tokenizer.
    prompt = f"<custom_token_3><spk_{speakers[2]}> {text}<custom_token_4><custom_token_5><custom_token_1>"
    model_path = "onnx_model"

    config = og.Config(model_path)
    config.append_provider("cuda")
    model = og.Model(config)
    print("Model loaded")

    tokenizer = og.Tokenizer(model)
    print("Tokenizer created")

    input_tokens = tokenizer.encode(prompt)
    print(f'Prompt(s) encoded: {prompt}')
    print(f"tokenized: {input_tokens}")

    # Budget ~1.3 tokens per character, 7 SNAC codes per frame plus slack,
    # capped at 700 tokens.
    search_options = {'max_length': min(int(len(text) * 1.3) * 7 + 21, 700)}
    print(f'Search options: {search_options}')

    params = og.GeneratorParams(model)
    params.set_search_options(**search_options)
    print("GeneratorParams created")

    generator = og.Generator(model, params)
    print("Generator created")

    generator.append_tokens(input_tokens)
    print("Input tokens added")

    print("Generating tokens ...\n")
    start_time = time.time()
    while not generator.is_done():
        generator.generate_next_token()
    run_time = time.time() - start_time

    # BUG FIX: `dec` was used below but its assignment was commented out,
    # which raised a NameError at runtime.
    dec = generator.get_sequence(0)

    total_tokens = len(dec)
    print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: {total_tokens/run_time:.2f}")
    print()

    print("Hello from venna!")

    # Keep only ids that fall inside the 7 interleaved SNAC codebooks.
    snac_tokens = [
        token_id for token_id in dec
        if AUDIO_CODE_BASE_OFFSET <= token_id < (AUDIO_CODE_BASE_OFFSET + 7 * 4096)
    ]
    print(len(snac_tokens))
    if not snac_tokens:
        raise ValueError("No audio tokens generated")

    print(snac_tokens)

    # Truncate to a whole number of 7-token frames before decoding, since
    # decode_snac_tokens returns None for a partial frame.
    audio = decode_snac_tokens(snac_tokens[:7 * (len(snac_tokens) // 7)], snac_model)
    print(audio)
    sf.write(f"output_english_{speakers[2]}.wav", audio, 24000)

# --- Example Usage ---

# Hindi
# text_hindi = "आज मैंने एक नई तकनीक के बारे में सीखा जो कृत्रिम बुद्धिमत्ता का उपयोग करके मानव जैसी आवाज़ उत्पन्न कर सकती है।"
# audio = generate_speech(text_hindi, speaker="kavya")
# sf.write("output_hindi_kavya.wav", audio, 24000)
#
# # English
# text_english = "Today I learned about a new technology that uses artificial intelligence to generate human-like voices."
# audio = generate_speech(text_english, speaker="agastya")
# sf.write("output_english_agastya.wav", audio, 24000)
#
# # Code-mixed
# text_mixed = "मैं तो पूरा presentation prepare कर चुका हूं! कल रात को ही मैंने पूरा code base चेक किया।"
# audio = generate_speech(text_mixed, speaker="maitri")
# sf.write("output_mixed_maitri.wav", audio, 24000)

if __name__ == "__main__":
    main()
[project]
name = "venna"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "accelerate>=1.12.0",
    "bitsandbytes>=0.48.2",
    "hf-xet==1.2.0",
    "onnx>=1.19.1",
    "onnxruntime-genai-cuda==0.11.2",
    "onnxruntime-gpu>=1.23.2",
    "snac>=1.2.1",
    "soundfile>=0.13.1",
]

[tool.uv.sources]
torch = { index = "pytorch" }
#torchvision = { index = "pytorch" }
#xformers = { url = "https://download.pytorch.org/whl/cu130/xformers-0.0.33.post1-cp39-abi3-win_amd64.whl" }


[[tool.uv.index]]
name = "pytorch"
url = "https://download.pytorch.org/whl/cu129"
explicit = true

[dependency-groups]
dev = [
    "torch==2.9.0+cu129",
    "transformers>=4.57.1",
]

Loading a CUDA-enabled torch build already ships the required CUDA DLLs, so there is no need to preload them again for ONNX Runtime.

Thank you.

jarvishacks changed discussion status to closed
jarvishacks changed discussion status to open

Sign up or log in to comment