marcoyang committed
Commit fa3ece6 · 1 Parent(s): 9324520

update readme

Files changed (1)
  1. README.md +13 -17
README.md CHANGED
@@ -54,27 +54,21 @@ The model acheives the following mean average precision (mAP) when fine-tuned on
  You can extract its top-layer feature (and intermediate hidden states) using the following code:

  ```python
- import torch
- import torchaudio
-
  from transformers import AutoModel
- from datasets import load_dataset
-
- import pdb; pdb.set_trace()
- dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
- sampling_rate = dataset.features["audio"].sampling_rate
- assert sampling_rate == 16000
+ import torch

- device = torch.device("cpu")
+ model = AutoModel.from_pretrained(
+     "marcoyang/spear-xlarge-speech-audio",
+     trust_remote_code=True,
+     force_download=False,
+ )
  if torch.cuda.is_available():
-     device = torch.device("cuda")
-
- model = AutoModel.from_pretrained("/mnt/shared-storage-user/housiyuan/xiaoyu/workspace/icefall_general_encoder/egs/general_audio_encoder/mtl/spear_large_speech_hf", trust_remote_code=True)
+     model = model.to("cuda")
  model.eval()
- model.to(device)

- audio = dataset[0]["audio"]["array"].to(device)
- audio_len = torch.tensor(audio.shape[-1]).to(device)
+ device = next(model.parameters()).device
+ audio = torch.randn(1, 160000).to(device) # dummy audio input of 10 seconds
+ audio_len = torch.tensor([160000]).to(device)

  with torch.no_grad():
      outputs = model(audio, audio_len)
@@ -85,5 +79,7 @@ middle_out = outputs["hidden_states"] # list of (N,T,C)

  print(encoder_out)
  print(encoder_out_lens)
- print(middle_out[0].shape)
+ print(len(middle_out)) # 13 layers
+ print(middle_out[-1].shape)
+ print(middle_out[-1])
  ```
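
To try the updated snippet on a real recording rather than the dummy tensor, here is a minimal sketch. It assumes `model` and `device` are set up exactly as in the new README code, that the model expects 16 kHz mono input (the previous snippet asserted a 16 kHz sampling rate), and it uses torchaudio only as one convenient way to load and resample audio; `example.wav` is a placeholder path.

```python
import torch
import torchaudio  # assumption: torchaudio is available purely for loading/resampling

# Load a waveform from disk; torchaudio returns (channels, num_samples) and the sample rate.
waveform, sr = torchaudio.load("example.wav")  # placeholder path

# Downmix to mono and resample to 16 kHz if necessary.
waveform = waveform.mean(dim=0, keepdim=True)
if sr != 16000:
    waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)

audio = waveform.to(device)                             # (1, T), same layout as the dummy input
audio_len = torch.tensor([audio.shape[-1]]).to(device)  # length in samples

with torch.no_grad():
    outputs = model(audio, audio_len)

middle_out = outputs["hidden_states"]  # list of (N, T, C) tensors, one per layer
```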