import torch
import torchaudio
from transformers import AutoModel
from datasets import load_dataset
# Load a small LibriSpeech demo set; the model expects 16 kHz audio.
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
sampling_rate = dataset.features["audio"].sampling_rate
assert sampling_rate == 16000

device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
# Local path to the HF-format checkpoint; trust_remote_code loads the custom model class.
model = AutoModel.from_pretrained(
    "/mnt/shared-storage-user/housiyuan/xiaoyu/workspace/icefall_general_encoder/egs/general_audio_encoder/mtl/spear_large_speech_hf",
    trust_remote_code=True,
)
model.eval()
model.to(device)
# datasets returns a NumPy array, so convert it to a float tensor before moving it to the device.
audio = torch.from_numpy(dataset[0]["audio"]["array"]).float().to(device)
audio_len = torch.tensor(audio.shape[-1]).to(device)
with torch.no_grad():
    outputs = model(audio, audio_len)
encoder_out = outputs["encoder_out"]            # (N, T, C)
encoder_out_lens = outputs["encoder_out_lens"]  # (N,)
middle_out = outputs["hidden_states"]           # list of (N, T, C)
print(encoder_out)
print(encoder_out_lens)
print(middle_out[0].shape)
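
# A minimal follow-up sketch (not part of the original example): pool the frame-level
# encoder_out into one embedding per utterance, masking padded frames with encoder_out_lens.
mask = torch.arange(encoder_out.shape[1], device=encoder_out.device)[None, :] < encoder_out_lens[:, None]  # (N, T)
pooled = (encoder_out * mask.unsqueeze(-1)).sum(dim=1) / encoder_out_lens.unsqueeze(-1)  # (N, C)
print(pooled.shape)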