# Example usage script for AudioLLM

import torch
import torchaudio
from transformers import pipeline

# Load the pipeline directly (recommended).
# trust_remote_code=True is assumed here because the checkpoint ships a
# custom architecture; drop it if the repo registers a standard class.
audio_llm = pipeline(
    "text-generation",
    model="cdreetz/audio-llama-hf",
    trust_remote_code=True,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Process an audio file
result = audio_llm("path/to/audio.wav")
print(result[0]["generated_text"])

# Process audio with a custom prompt
result = audio_llm(("path/to/audio.wav", "Describe the music in this audio:"))
print(result[0]["generated_text"])

# Text-only generation
result = audio_llm("Write a poem about sound:")
print(result[0]["generated_text"])

# Advanced usage: load the model components manually
from transformers import AutoTokenizer, AutoModelForCausalLM, WhisperProcessor

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("cdreetz/audio-llama-hf")
model = AutoModelForCausalLM.from_pretrained(
    "cdreetz/audio-llama-hf",
    trust_remote_code=True,  # assumed: custom model class exposing an audio_features kwarg
)

# Load the audio and convert stereo to mono
waveform, sample_rate = torchaudio.load("path/to/audio.wav")
if waveform.shape[0] > 1:
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# Whisper features are computed at 16 kHz, so resample if needed
if sample_rate != 16000:
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

# Preprocess the audio into Whisper input features
whisper_processor = WhisperProcessor.from_pretrained(model.config.whisper_model_id)
audio_features = whisper_processor(
    waveform.squeeze().numpy(),
    sampling_rate=16000,
    return_tensors="pt",
).input_features

# Tokenize the text prompt
inputs = tokenizer("Describe the audio:", return_tensors="pt")

# Generate
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs.input_ids.to(model.device),
        attention_mask=inputs.attention_mask.to(model.device),
        audio_features=audio_features.to(model.device),
        max_new_tokens=256,
    )

# Decode
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)
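
# Optional: a small helper that packages the manual steps above into one call.
# This is a sketch, not part of the published API; describe_audio is a
# hypothetical name, and it reuses the tokenizer, model, and whisper_processor
# objects loaded above.
def describe_audio(audio_path: str, prompt: str = "Describe the audio:") -> str:
    waveform, sample_rate = torchaudio.load(audio_path)
    if waveform.shape[0] > 1:  # stereo -> mono
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sample_rate != 16000:  # Whisper features are computed at 16 kHz
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    features = whisper_processor(
        waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
    ).input_features
    enc = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        out = model.generate(
            input_ids=enc.input_ids.to(model.device),
            attention_mask=enc.attention_mask.to(model.device),
            audio_features=features.to(model.device),
            max_new_tokens=256,
        )
    return tokenizer.decode(out[0], skip_special_tokens=True)

print(describe_audio("path/to/audio.wav", "What instruments do you hear?"))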