"""Example for using HiggsAudio for generating both the transcript and audio in an interleaved manner.""" from boson_multimodal.serve.serve_engine import HiggsAudioServeEngine, HiggsAudioResponse import torch import torchaudio import time from loguru import logger import click from input_samples import INPUT_SAMPLES MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base" AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer" @click.command() @click.argument("example", type=click.Choice(list(INPUT_SAMPLES.keys()))) def main(example: str): input_sample = INPUT_SAMPLES[example]() device = "cuda" if torch.cuda.is_available() else "cpu" logger.info(f"Using device: {device}") serve_engine = HiggsAudioServeEngine( MODEL_PATH, AUDIO_TOKENIZER_PATH, device=device, ) logger.info("Starting generation...") start_time = time.time() output: HiggsAudioResponse = serve_engine.generate( chat_ml_sample=input_sample, max_new_tokens=1024, temperature=1.0, top_p=0.95, top_k=50, stop_strings=["<|end_of_text|>", "<|eot_id|>"], ) elapsed_time = time.time() - start_time logger.info(f"Generation time: {elapsed_time:.2f} seconds") torchaudio.save(f"output_{example}.wav", torch.from_numpy(output.audio)[None, :], output.sampling_rate) logger.info(f"Generated text:\n{output.generated_text}") logger.info(f"Saved audio to output_{example}.wav") if __name__ == "__main__": main()