"""Smoke test: run the SPEAR speech encoder on one LibriSpeech demo utterance.

Loads a local HF-format checkpoint (trust_remote_code), feeds a single
16 kHz waveform through it, and prints the encoder outputs.
"""

import torch
import torchaudio  # noqa: F401 -- unused here; kept in case an audio backend relies on its import side effects

from transformers import AutoModel
from datasets import load_dataset

# Local checkpoint directory for the SPEAR large speech encoder.
MODEL_PATH = (
    "/mnt/shared-storage-user/housiyuan/xiaoyu/workspace/"
    "icefall_general_encoder/egs/general_audio_encoder/mtl/spear_large_speech_hf"
)

dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
)

# The encoder expects 16 kHz input; fail fast with a real exception
# (an `assert` would be silently stripped under `python -O`).
sampling_rate = dataset.features["audio"].sampling_rate
if sampling_rate != 16000:
    raise ValueError(f"expected 16 kHz audio, got {sampling_rate} Hz")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
model.eval()
model.to(device)

# BUG FIX: `datasets` yields the waveform as a numpy.ndarray, which has no
# `.to()` method -- convert to a float32 torch tensor before moving to device.
audio = torch.from_numpy(dataset[0]["audio"]["array"]).float().to(device)
audio_len = torch.tensor(audio.shape[-1]).to(device)
# NOTE(review): outputs below are documented as batched (N, T, C) while the
# input waveform here is 1-D -- presumably the model's forward() adds the
# batch dimension internally; confirm against the checkpoint's remote code.

with torch.no_grad():
    outputs = model(audio, audio_len)

encoder_out = outputs["encoder_out"]            # (N, T, C)
encoder_out_lens = outputs["encoder_out_lens"]  # (N,)
middle_out = outputs["hidden_states"]           # list of (N, T, C)

print(encoder_out)
print(encoder_out_lens)
print(middle_out[0].shape)