marcoyang committed
Commit 1ad17a3 · verified · 1 Parent(s): 5d423ae

Upload folder using huggingface_hub

Files changed (1)
README.md (+13 -17)
README.md CHANGED
@@ -53,27 +53,21 @@ The model achieves the following mean average precision (mAP) when fine-tuned on
 You can extract its top-layer feature (and intermediate hidden states) using the following code:
 
 ```python
-import torch
-import torchaudio
-
 from transformers import AutoModel
-from datasets import load_dataset
-
-import pdb; pdb.set_trace()
-dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
-sampling_rate = dataset.features["audio"].sampling_rate
-assert sampling_rate == 16000
+import torch
 
-device = torch.device("cpu")
+model = AutoModel.from_pretrained(
+    "marcoyang/spear-base-speech-audio",
+    trust_remote_code=True,
+    force_download=False,
+)
 if torch.cuda.is_available():
-    device = torch.device("cuda")
-
-model = AutoModel.from_pretrained("/mnt/shared-storage-user/housiyuan/xiaoyu/workspace/icefall_general_encoder/egs/general_audio_encoder/mtl/spear_large_speech_hf", trust_remote_code=True)
+    model = model.to("cuda")
 model.eval()
-model.to(device)
 
-audio = dataset[0]["audio"]["array"].to(device)
-audio_len = torch.tensor(audio.shape[-1]).to(device)
+device = next(model.parameters()).device
+audio = torch.randn(1, 160000).to(device)  # dummy audio input of 10 seconds
+audio_len = torch.tensor([160000]).to(device)
 
 with torch.no_grad():
     outputs = model(audio, audio_len)
@@ -84,5 +78,7 @@ middle_out = outputs["hidden_states"] # list of (N,T,C)
 
 print(encoder_out)
 print(encoder_out_lens)
-print(middle_out[0].shape)
+print(len(middle_out))  # 12 layers
+print(middle_out[-1].shape)
+print(middle_out[-1])
 ```
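
For reference, the snippet below consolidates the post-commit example into a single runnable script. The model id `marcoyang/spear-base-speech-audio`, the dummy-input setup, the `hidden_states` output, and the print statements all come from the diff above; the `encoder_out` and `encoder_out_lens` keys are assumptions, since the lines that unpack `outputs` sit in the unchanged region between the two hunks and only `middle_out = outputs["hidden_states"]` is visible in the hunk header.

```python
# Minimal sketch of the post-commit README example.
# Assumption: `outputs` behaves like a dict with "encoder_out" and
# "encoder_out_lens" keys; only "hidden_states" is confirmed by the diff.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "marcoyang/spear-base-speech-audio",
    trust_remote_code=True,
)
if torch.cuda.is_available():
    model = model.to("cuda")
model.eval()

device = next(model.parameters()).device
audio = torch.randn(1, 160000).to(device)      # dummy 10 s clip at 16 kHz, shape (N, samples)
audio_len = torch.tensor([160000]).to(device)  # valid lengths in samples, shape (N,)

with torch.no_grad():
    outputs = model(audio, audio_len)

encoder_out = outputs["encoder_out"]            # assumed key: top-layer features, (N, T, C)
encoder_out_lens = outputs["encoder_out_lens"]  # assumed key: output lengths, (N,)
middle_out = outputs["hidden_states"]           # per the hunk header: list of (N, T, C)

print(encoder_out.shape)
print(encoder_out_lens)
print(len(middle_out))       # 12 layers
print(middle_out[-1].shape)
```

To feed real speech instead of random noise, a hypothetical loading step could replace the dummy tensor (the file path is a placeholder; 16 kHz input matches the old example's `assert sampling_rate == 16000` and the 160000-sample/10-second dummy clip):

```python
import torchaudio

waveform, sr = torchaudio.load("sample.wav")  # (channels, samples); placeholder path
if sr != 16000:
    waveform = torchaudio.functional.resample(waveform, sr, 16000)
audio = waveform.mean(dim=0, keepdim=True).to(device)  # downmix to mono: (1, samples)
audio_len = torch.tensor([audio.shape[-1]]).to(device)
```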