marcoyang committed on
Commit
2bbfdf9
·
verified ·
1 Parent(s): 75d5f67

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. README.md +13 -17
README.md CHANGED
@@ -35,27 +35,21 @@ The model achieves the following performance when fine-tuned on LibriSpeech for
35
  You can however extract its top-layer feature (and intermediate hidden states) using the following code:
36
 
37
  ```python
38
- import torch
39
- import torchaudio
40
-
41
  from transformers import AutoModel
42
- from datasets import load_dataset
43
-
44
- import pdb; pdb.set_trace()
45
- dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
46
- sampling_rate = dataset.features["audio"].sampling_rate
47
- assert sampling_rate == 16000
48
 
49
- device = torch.device("cpu")
 
 
 
 
50
  if torch.cuda.is_available():
51
- device = torch.device("cuda")
52
-
53
- model = AutoModel.from_pretrained("/mnt/shared-storage-user/housiyuan/xiaoyu/workspace/icefall_general_encoder/egs/general_audio_encoder/mtl/spear_large_speech_hf", trust_remote_code=True)
54
  model.eval()
55
- model.to(device)
56
 
57
- audio = dataset[0]["audio"]["array"].to(device)
58
- audio_len = torch.tensor(audio.shape[-1]).to(device)
 
59
 
60
  with torch.no_grad():
61
  outputs = model(audio, audio_len)
@@ -66,5 +60,7 @@ middle_out = outputs["hidden_states"] # list of (N,T,C)
66
 
67
  print(encoder_out)
68
  print(encoder_out_lens)
69
- print(middle_out[0].shape)
 
 
70
  ```
 
35
  You can however extract its top-layer feature (and intermediate hidden states) using the following code:
36
 
37
  ```python
 
 
 
38
  from transformers import AutoModel
39
+ import torch
 
 
 
 
 
40
 
41
+ model = AutoModel.from_pretrained(
42
+ "marcoyang/spear-large-speech",
43
+ trust_remote_code=True,
44
+ force_download=False,
45
+ )
46
  if torch.cuda.is_available():
47
+ model = model.to("cuda")
 
 
48
  model.eval()
 
49
 
50
+ device = next(model.parameters()).device
51
+ audio = torch.randn(1, 160000).to(device) # dummy audio input of 10 seconds
52
+ audio_len = torch.tensor([160000]).to(device)
53
 
54
  with torch.no_grad():
55
  outputs = model(audio, audio_len)
 
60
 
61
  print(encoder_out)
62
  print(encoder_out_lens)
63
+ print(len(middle_out)) # 11 layers
64
+ print(middle_out[-1].shape)
65
+ print(middle_out[-1])
66
  ```