|
|
|
|
|
|
|
|
import torch |
|
|
import torchaudio |
|
|
from transformers import pipeline |
|
|
|
|
|
|
|
|
# Build a text-generation pipeline around the audio-llama checkpoint.
# Prefer the GPU when one is available; otherwise fall back to CPU.
pipeline_device = "cuda" if torch.cuda.is_available() else "cpu"

audio_llm = pipeline(
    "text-generation",
    model="cdreetz/audio-llama-hf",
    device=pipeline_device,
)
|
|
|
|
|
|
|
|
# Example 1: call the pipeline with just an audio file path.
# The pipeline returns a list of dicts carrying "generated_text".
audio_only_output = audio_llm("path/to/audio.wav")
print(audio_only_output[0]["generated_text"])
|
|
|
|
|
|
|
|
# Example 2: pass a (audio_path, prompt) tuple — presumably the model
# conditions its generation on both the audio and the text prompt.
prompted_output = audio_llm(("path/to/audio.wav", "Describe the music in this audio:"))
print(prompted_output[0]["generated_text"])
|
|
|
|
|
|
|
|
# Example 3: plain text-only generation, no audio input at all.
text_only_output = audio_llm("Write a poem about sound:")
print(text_only_output[0]["generated_text"])
|
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, WhisperProcessor |
|
|
|
|
|
|
|
|
# Manual (non-pipeline) route: load tokenizer and causal-LM weights
# from the same checkpoint used above.
checkpoint = "cdreetz/audio-llama-hf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)
|
|
|
|
|
|
|
|
# Load the audio clip; torchaudio returns (channels, samples) at the
# file's native sample rate.
waveform, sample_rate = torchaudio.load("path/to/audio.wav")
if waveform.shape[0] > 1:
    # Average channels so multi-channel recordings become mono.
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# BUG FIX: the Whisper feature extractor below is called with a
# hard-coded sampling_rate=16000, but torchaudio.load keeps the file's
# native rate. Resample here so the features are not computed from
# mis-labelled audio when the file is not already 16 kHz.
if sample_rate != 16000:
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    sample_rate = 16000
|
|
|
|
|
|
|
|
# The model's config records which Whisper checkpoint produced its
# audio encoder; load the matching processor for feature extraction.
whisper_processor = WhisperProcessor.from_pretrained(model.config.whisper_model_id)

# Whisper expects a 1-D 16 kHz signal; drop the channel dim and hand
# the raw samples to the processor as a NumPy array.
# NOTE(review): assumes the waveform is already at 16 kHz — verify the
# loading step resamples when the source file uses a different rate.
mono_samples = waveform.squeeze().numpy()
processed = whisper_processor(
    mono_samples,
    sampling_rate=16000,
    return_tensors="pt",
)
audio_features = processed.input_features
|
|
|
|
|
|
|
|
# Tokenize the text prompt for the language model.
inputs = tokenizer("Describe the audio:", return_tensors="pt")

# Run generation with the audio features as an extra conditioning
# input (a keyword this checkpoint's generate() accepts). Everything
# is moved to the model's device first; no_grad avoids building an
# autograd graph during inference.
target_device = model.device
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs.input_ids.to(target_device),
        attention_mask=inputs.attention_mask.to(target_device),
        audio_features=audio_features.to(target_device),
        max_new_tokens=256,
    )

# Decode the first (and only) generated sequence back to text.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)
|
|
|