abidanoaman committed on
Commit
747b557
·
verified ·
1 Parent(s): bf9be72

Add WhisperProcessor (tokenizer + feature extractor)

Browse files
Files changed (3) hide show
  1. processor_config.json +17 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +127 -0
processor_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_extractor": {
3
+ "chunk_length": 30,
4
+ "dither": 0.0,
5
+ "feature_extractor_type": "WhisperFeatureExtractor",
6
+ "feature_size": 80,
7
+ "hop_length": 160,
8
+ "n_fft": 400,
9
+ "n_samples": 480000,
10
+ "nb_max_frames": 3000,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ },
16
+ "processor_class": "WhisperProcessor"
17
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|endoftext|>",
5
+ "clean_up_tokenization_spaces": true,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|endoftext|>",
10
+ "<|startoftranscript|>",
11
+ "<|en|>",
12
+ "<|zh|>",
13
+ "<|de|>",
14
+ "<|es|>",
15
+ "<|ru|>",
16
+ "<|ko|>",
17
+ "<|fr|>",
18
+ "<|ja|>",
19
+ "<|pt|>",
20
+ "<|tr|>",
21
+ "<|pl|>",
22
+ "<|ca|>",
23
+ "<|nl|>",
24
+ "<|ar|>",
25
+ "<|sv|>",
26
+ "<|it|>",
27
+ "<|id|>",
28
+ "<|hi|>",
29
+ "<|fi|>",
30
+ "<|vi|>",
31
+ "<|he|>",
32
+ "<|uk|>",
33
+ "<|el|>",
34
+ "<|ms|>",
35
+ "<|cs|>",
36
+ "<|ro|>",
37
+ "<|da|>",
38
+ "<|hu|>",
39
+ "<|ta|>",
40
+ "<|no|>",
41
+ "<|th|>",
42
+ "<|ur|>",
43
+ "<|hr|>",
44
+ "<|bg|>",
45
+ "<|lt|>",
46
+ "<|la|>",
47
+ "<|mi|>",
48
+ "<|ml|>",
49
+ "<|cy|>",
50
+ "<|sk|>",
51
+ "<|te|>",
52
+ "<|fa|>",
53
+ "<|lv|>",
54
+ "<|bn|>",
55
+ "<|sr|>",
56
+ "<|az|>",
57
+ "<|sl|>",
58
+ "<|kn|>",
59
+ "<|et|>",
60
+ "<|mk|>",
61
+ "<|br|>",
62
+ "<|eu|>",
63
+ "<|is|>",
64
+ "<|hy|>",
65
+ "<|ne|>",
66
+ "<|mn|>",
67
+ "<|bs|>",
68
+ "<|kk|>",
69
+ "<|sq|>",
70
+ "<|sw|>",
71
+ "<|gl|>",
72
+ "<|mr|>",
73
+ "<|pa|>",
74
+ "<|si|>",
75
+ "<|km|>",
76
+ "<|sn|>",
77
+ "<|yo|>",
78
+ "<|so|>",
79
+ "<|af|>",
80
+ "<|oc|>",
81
+ "<|ka|>",
82
+ "<|be|>",
83
+ "<|tg|>",
84
+ "<|sd|>",
85
+ "<|gu|>",
86
+ "<|am|>",
87
+ "<|yi|>",
88
+ "<|lo|>",
89
+ "<|uz|>",
90
+ "<|fo|>",
91
+ "<|ht|>",
92
+ "<|ps|>",
93
+ "<|tk|>",
94
+ "<|nn|>",
95
+ "<|mt|>",
96
+ "<|sa|>",
97
+ "<|lb|>",
98
+ "<|my|>",
99
+ "<|bo|>",
100
+ "<|tl|>",
101
+ "<|mg|>",
102
+ "<|as|>",
103
+ "<|tt|>",
104
+ "<|haw|>",
105
+ "<|ln|>",
106
+ "<|ha|>",
107
+ "<|ba|>",
108
+ "<|jw|>",
109
+ "<|su|>",
110
+ "<|translate|>",
111
+ "<|transcribe|>",
112
+ "<|startoflm|>",
113
+ "<|startofprev|>",
114
+ "<|nocaptions|>",
115
+ "<|notimestamps|>"
116
+ ],
117
+ "is_local": true,
118
+ "language": "urdu",
119
+ "model_max_length": 1024,
120
+ "pad_token": "<|endoftext|>",
121
+ "predict_timestamps": false,
122
+ "processor_class": "WhisperProcessor",
123
+ "return_attention_mask": false,
124
+ "task": "transcribe",
125
+ "tokenizer_class": "WhisperTokenizer",
126
+ "unk_token": "<|endoftext|>"
127
+ }