# Uploaded by marcoyang using huggingface_hub (commit 3594d34, verified).
import torch
import torchaudio
from transformers import AutoModel
from datasets import load_dataset
# Smoke test: load a locally exported encoder through `AutoModel`, run it on one
# utterance of the LibriSpeech demo split, and print the outputs.
#
# Changes versus the previous revision of this script:
#   * the leftover `pdb.set_trace()` breakpoint is removed so the script runs
#     unattended;
#   * the decoded waveform is converted to a tensor before `.to(device)`;
#   * the sampling-rate check raises instead of using `assert`.

# Directory handed to `AutoModel.from_pretrained`.
# NOTE(review): this is a machine-specific absolute path -- replace it with your
# own checkpoint directory before running elsewhere.
MODEL_PATH = "/mnt/shared-storage-user/housiyuan/xiaoyu/workspace/icefall_general_encoder/egs/general_audio_encoder/mtl/spear_large_speech_hf"

# Sampling rate (Hz) this script requires from the dataset.
EXPECTED_SAMPLING_RATE = 16000

dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
)
sampling_rate = dataset.features["audio"].sampling_rate
# Explicit raise: an `assert` would be stripped when running with `python -O`.
if sampling_rate != EXPECTED_SAMPLING_RATE:
    raise ValueError(
        f"Expected {EXPECTED_SAMPLING_RATE} Hz audio, got {sampling_rate} Hz"
    )

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `trust_remote_code=True` executes the modelling code shipped with the
# checkpoint; only use it with checkpoints you trust.
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
model.eval()
model.to(device)

# `datasets` decodes audio to a NumPy array, which has no `.to()` method, so it
# is converted to a float32 tensor first (`as_tensor` also accepts a tensor if
# the dataset was configured with a torch format).
waveform = dataset[0]["audio"]["array"]
# NOTE(review): a leading batch dimension is added because the output comments
# below describe batched tensors ((N, T, C) and (N)); this assumes the model's
# forward takes (N, num_samples) audio and (N,) lengths -- confirm against the
# checkpoint's modelling code.
audio = torch.as_tensor(waveform, dtype=torch.float32).unsqueeze(0).to(device)
audio_len = torch.tensor([audio.shape[-1]], device=device)

# Inference only: skip gradient bookkeeping.
with torch.no_grad():
    outputs = model(audio, audio_len)

encoder_out = outputs["encoder_out"]  # (N,T,C)
encoder_out_lens = outputs["encoder_out_lens"]  # (N)
middle_out = outputs["hidden_states"]  # list of (N,T,C)

print(encoder_out)
print(encoder_out_lens)
print(middle_out[0].shape)