|
|
|
|
|
|
|
|
import torch |
|
|
import torchaudio |
|
|
from transformers import pipeline |
|
|
|
|
|
|
|
|
# Build a text-generation pipeline around the audio-llama checkpoint.
# Prefer the GPU when one is available; otherwise fall back to CPU.
pipeline_device = "cuda" if torch.cuda.is_available() else "cpu"

audio_llm = pipeline(
    "text-generation",
    model="cdreetz/audio-llama-hf",
    device=pipeline_device,
)
|
|
|
|
|
|
|
|
# Example 1: call the pipeline with just an audio file path.
# The pipeline returns a list of dicts carrying "generated_text".
audio_only_output = audio_llm("path/to/audio.wav")
print(audio_only_output[0]["generated_text"])
|
|
|
|
|
|
|
|
# Example 2: pass a (audio_path, prompt) tuple — presumably the model
# conditions its generation on both the audio and the text prompt.
prompted_output = audio_llm(("path/to/audio.wav", "Describe the music in this audio:"))
print(prompted_output[0]["generated_text"])
|
|
|
|
|
|
|
|
# Example 3: plain text-only generation, no audio input at all.
text_only_output = audio_llm("Write a poem about sound:")
print(text_only_output[0]["generated_text"])
|
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM, WhisperProcessor |
|
|
|
|
|
|
|
|
# Manual (non-pipeline) route: load tokenizer and causal-LM weights
# from the same checkpoint used above.
checkpoint = "cdreetz/audio-llama-hf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)
|
|
|
|
|
|
|
|
# Load the audio clip; torchaudio returns (channels, samples) at the
# file's native sample rate.
waveform, sample_rate = torchaudio.load("path/to/audio.wav")
if waveform.shape[0] > 1:
    # Average channels so multi-channel recordings become mono.
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# BUG FIX: the Whisper feature extractor below is called with a
# hard-coded sampling_rate=16000, but torchaudio.load keeps the file's
# native rate. Resample here so the features are not computed from
# mis-labelled audio when the file is not already 16 kHz.
if sample_rate != 16000:
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    sample_rate = 16000
|
|
|
|
|
|
|
|
# The model's config records which Whisper checkpoint produced its
# audio encoder; load the matching processor for feature extraction.
whisper_processor = WhisperProcessor.from_pretrained(model.config.whisper_model_id)

# Whisper expects a 1-D 16 kHz signal; drop the channel dim and hand
# the raw samples to the processor as a NumPy array.
# NOTE(review): assumes the waveform is already at 16 kHz — verify the
# loading step resamples when the source file uses a different rate.
mono_samples = waveform.squeeze().numpy()
processed = whisper_processor(
    mono_samples,
    sampling_rate=16000,
    return_tensors="pt",
)
audio_features = processed.input_features
|
|
|
|
|
|
|
|
# Tokenize the text prompt for the language model.
inputs = tokenizer("Describe the audio:", return_tensors="pt")

# Run generation with the audio features as an extra conditioning
# input (a keyword this checkpoint's generate() accepts). Everything
# is moved to the model's device first; no_grad avoids building an
# autograd graph during inference.
target_device = model.device
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs.input_ids.to(target_device),
        attention_mask=inputs.attention_mask.to(target_device),
        audio_features=audio_features.to(target_device),
        max_new_tokens=256,
    )

# Decode the first (and only) generated sequence back to text.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)
|
|
|