Dmitriy committed on
Commit
d2ef260
·
1 Parent(s): 5e253e0

Upload processor

Browse files
preprocessor_config.json CHANGED
@@ -1,14 +1,10 @@
1
  {
2
- "chunk_length": 30,
3
- "feature_extractor_type": "WhisperFeatureExtractor",
4
- "feature_size": 80,
5
- "hop_length": 160,
6
- "n_fft": 400,
7
- "n_samples": 480000,
8
- "nb_max_frames": 3000,
9
  "padding_side": "right",
10
- "padding_value": 0.0,
11
- "processor_class": "WhisperProcessor",
12
- "return_attention_mask": false,
13
  "sampling_rate": 16000
14
  }
 
1
  {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
 
 
 
 
5
  "padding_side": "right",
6
+ "padding_value": 0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": true,
9
  "sampling_rate": 16000
10
  }
special_tokens_map.json CHANGED
@@ -1,133 +1,6 @@
1
  {
2
- "additional_special_tokens": [
3
- "<|endoftext|>",
4
- "<|startoftranscript|>",
5
- "<|en|>",
6
- "<|zh|>",
7
- "<|de|>",
8
- "<|es|>",
9
- "<|ru|>",
10
- "<|ko|>",
11
- "<|fr|>",
12
- "<|ja|>",
13
- "<|pt|>",
14
- "<|tr|>",
15
- "<|pl|>",
16
- "<|ca|>",
17
- "<|nl|>",
18
- "<|ar|>",
19
- "<|sv|>",
20
- "<|it|>",
21
- "<|id|>",
22
- "<|hi|>",
23
- "<|fi|>",
24
- "<|vi|>",
25
- "<|he|>",
26
- "<|uk|>",
27
- "<|el|>",
28
- "<|ms|>",
29
- "<|cs|>",
30
- "<|ro|>",
31
- "<|da|>",
32
- "<|hu|>",
33
- "<|ta|>",
34
- "<|no|>",
35
- "<|th|>",
36
- "<|ur|>",
37
- "<|hr|>",
38
- "<|bg|>",
39
- "<|lt|>",
40
- "<|la|>",
41
- "<|mi|>",
42
- "<|ml|>",
43
- "<|cy|>",
44
- "<|sk|>",
45
- "<|te|>",
46
- "<|fa|>",
47
- "<|lv|>",
48
- "<|bn|>",
49
- "<|sr|>",
50
- "<|az|>",
51
- "<|sl|>",
52
- "<|kn|>",
53
- "<|et|>",
54
- "<|mk|>",
55
- "<|br|>",
56
- "<|eu|>",
57
- "<|is|>",
58
- "<|hy|>",
59
- "<|ne|>",
60
- "<|mn|>",
61
- "<|bs|>",
62
- "<|kk|>",
63
- "<|sq|>",
64
- "<|sw|>",
65
- "<|gl|>",
66
- "<|mr|>",
67
- "<|pa|>",
68
- "<|si|>",
69
- "<|km|>",
70
- "<|sn|>",
71
- "<|yo|>",
72
- "<|so|>",
73
- "<|af|>",
74
- "<|oc|>",
75
- "<|ka|>",
76
- "<|be|>",
77
- "<|tg|>",
78
- "<|sd|>",
79
- "<|gu|>",
80
- "<|am|>",
81
- "<|yi|>",
82
- "<|lo|>",
83
- "<|uz|>",
84
- "<|fo|>",
85
- "<|ht|>",
86
- "<|ps|>",
87
- "<|tk|>",
88
- "<|nn|>",
89
- "<|mt|>",
90
- "<|sa|>",
91
- "<|lb|>",
92
- "<|my|>",
93
- "<|bo|>",
94
- "<|tl|>",
95
- "<|mg|>",
96
- "<|as|>",
97
- "<|tt|>",
98
- "<|haw|>",
99
- "<|ln|>",
100
- "<|ha|>",
101
- "<|ba|>",
102
- "<|jw|>",
103
- "<|su|>",
104
- "<|translate|>",
105
- "<|transcribe|>",
106
- "<|startoflm|>",
107
- "<|startofprev|>",
108
- "<|nocaptions|>",
109
- "<|notimestamps|>"
110
- ],
111
- "bos_token": {
112
- "content": "<|endoftext|>",
113
- "lstrip": false,
114
- "normalized": true,
115
- "rstrip": false,
116
- "single_word": false
117
- },
118
- "eos_token": {
119
- "content": "<|endoftext|>",
120
- "lstrip": false,
121
- "normalized": true,
122
- "rstrip": false,
123
- "single_word": false
124
- },
125
- "pad_token": "<|endoftext|>",
126
- "unk_token": {
127
- "content": "<|endoftext|>",
128
- "lstrip": false,
129
- "normalized": true,
130
- "rstrip": false,
131
- "single_word": false
132
- }
133
  }
 
1
  {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  }
tokenizer_config.json CHANGED
@@ -1,35 +1,14 @@
1
  {
2
- "add_bos_token": false,
3
- "add_prefix_space": false,
4
- "bos_token": {
5
- "__type": "AddedToken",
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
  "clean_up_tokenization_spaces": true,
13
- "eos_token": {
14
- "__type": "AddedToken",
15
- "content": "<|endoftext|>",
16
- "lstrip": false,
17
- "normalized": true,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "errors": "replace",
22
- "model_max_length": 1024,
23
- "pad_token": null,
24
- "processor_class": "WhisperProcessor",
25
- "return_attention_mask": false,
26
- "tokenizer_class": "WhisperTokenizer",
27
- "unk_token": {
28
- "__type": "AddedToken",
29
- "content": "<|endoftext|>",
30
- "lstrip": false,
31
- "normalized": true,
32
- "rstrip": false,
33
- "single_word": false
34
- }
35
  }
 
1
  {
2
+ "bos_token": "<s>",
 
 
 
 
 
 
 
 
 
3
  "clean_up_tokenization_spaces": true,
4
+ "do_lower_case": false,
5
+ "eos_token": "</s>",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "pad_token": "<pad>",
8
+ "processor_class": "Wav2Vec2Processor",
9
+ "replace_word_delimiter_char": " ",
10
+ "target_lang": null,
11
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
12
+ "unk_token": "<unk>",
13
+ "word_delimiter_token": "|"
 
 
 
 
 
 
 
 
 
 
 
 
14
  }
vocab.json CHANGED
The diff for this file is too large to render. See raw diff