ProAIDev commited on
Commit
7213efd
·
verified ·
1 Parent(s): 647e46f

Upload /workspace/ComfyUI/models/transformers/TencentGameMate/chinese-wav2vec2-base/README.md with huggingface_hub

Browse files
workspace/ComfyUI/models/transformers/TencentGameMate/chinese-wav2vec2-base/README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+ Pretrained on 10k hours WenetSpeech L subset. More details in [TencentGameMate/chinese_speech_pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
5
+
6
+ This model does not have a tokenizer as it was pretrained on audio alone.
7
+ In order to use this model for speech recognition, a tokenizer should be created and the model should be fine-tuned on labeled text data.
8
+
9
+ Required Python package:
10
+ transformers==4.16.2
11
+
12
+ ```python
# Example: extract features with chinese-wav2vec2-base.
# Fixes vs. the original snippet: `device` was referenced but never defined
# (NameError on a verbatim run); unused imports (torch.nn.functional,
# fairseq.checkpoint_utils) removed.

import torch
import soundfile as sf

from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForPreTraining,
    Wav2Vec2Model,
)
# Needed only for the commented Wav2Vec2ForPreTraining path below.
from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices

model_path = ""   # path to the pretrained chinese-wav2vec2-base checkpoint
wav_path = ""     # path to an input wav file (model expects 16 kHz audio)
mask_prob = 0.0
mask_length = 10

# Use GPU when available; fall back to CPU otherwise.
# NOTE(review): `.half()` below assumes fp16 support — on CPU you may need to
# drop the two `.half()` calls.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)
model = Wav2Vec2Model.from_pretrained(model_path)

# for pretrain: Wav2Vec2ForPreTraining
# model = Wav2Vec2ForPreTraining.from_pretrained(model_path)

model = model.to(device)
model = model.half()
model.eval()

wav, sr = sf.read(wav_path)
input_values = feature_extractor(wav, return_tensors="pt").input_values
input_values = input_values.half()
input_values = input_values.to(device)

# for Wav2Vec2ForPreTraining
# batch_size, raw_sequence_length = input_values.shape
# sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
# mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.0, mask_length=2)
# mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long)

with torch.no_grad():
    outputs = model(input_values)
    last_hidden_state = outputs.last_hidden_state

    # for Wav2Vec2ForPreTraining
    # outputs = model(input_values, mask_time_indices=mask_time_indices, output_hidden_states=True)
    # last_hidden_state = outputs.hidden_states[-1]
+ ```