Samuael committed on
Commit
9530f81
·
verified ·
1 Parent(s): 2dab322

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -98,6 +98,5 @@
98
  "<extra_id_97>": 1002,
99
  "<extra_id_98>": 1001,
100
  "<extra_id_99>": 1000,
101
- "<extra_id_9>": 1090,
102
- "<pad>": 1100
103
  }
 
98
  "<extra_id_97>": 1002,
99
  "<extra_id_98>": 1001,
100
  "<extra_id_99>": 1000,
101
+ "<extra_id_9>": 1090
 
102
  }
special_tokens_map.json CHANGED
@@ -101,6 +101,7 @@
101
  "<extra_id_98>",
102
  "<extra_id_99>"
103
  ],
 
104
  "eos_token": {
105
  "content": "</s>",
106
  "lstrip": false,
 
101
  "<extra_id_98>",
102
  "<extra_id_99>"
103
  ],
104
+ "bos_token": "<s>",
105
  "eos_token": {
106
  "content": "</s>",
107
  "lstrip": false,
spiece.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4e4c09bd68c20916dfa8472b3c979527234d7b22afa71e1ea0bb36ee79a1bbd
3
- size 253571
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d862dc059c77d41f3108bbbd7578cc6d4f80fe3306247647c77c6747a39ef717
3
+ size 253584
tokenizer_config.json CHANGED
@@ -2,6 +2,14 @@
2
  "add_prefix_space": true,
3
  "added_tokens_decoder": {
4
  "0": {
 
 
 
 
 
 
 
 
5
  "content": "<unk>",
6
  "lstrip": false,
7
  "normalized": false,
@@ -10,6 +18,14 @@
10
  "special": true
11
  },
12
  "2": {
 
 
 
 
 
 
 
 
13
  "content": "</s>",
14
  "lstrip": false,
15
  "normalized": false,
@@ -816,14 +832,6 @@
816
  "rstrip": true,
817
  "single_word": false,
818
  "special": true
819
- },
820
- "1100": {
821
- "content": "<pad>",
822
- "lstrip": false,
823
- "normalized": false,
824
- "rstrip": false,
825
- "single_word": false,
826
- "special": true
827
  }
828
  },
829
  "additional_special_tokens": [
@@ -928,12 +936,16 @@
928
  "<extra_id_98>",
929
  "<extra_id_99>"
930
  ],
 
931
  "clean_up_tokenization_spaces": true,
 
 
932
  "eos_token": "</s>",
933
  "extra_ids": 100,
934
  "legacy": true,
935
  "model_max_length": 1000000000000000019884624838656,
936
  "pad_token": "<pad>",
 
937
  "sp_model_kwargs": {},
938
  "tokenizer_class": "T5Tokenizer",
939
  "unk_token": "<unk>"
 
2
  "add_prefix_space": true,
3
  "added_tokens_decoder": {
4
  "0": {
5
+ "content": "<pad>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
  "content": "<unk>",
14
  "lstrip": false,
15
  "normalized": false,
 
18
  "special": true
19
  },
20
  "2": {
21
+ "content": "<s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
  "content": "</s>",
30
  "lstrip": false,
31
  "normalized": false,
 
832
  "rstrip": true,
833
  "single_word": false,
834
  "special": true
 
 
 
 
 
 
 
 
835
  }
836
  },
837
  "additional_special_tokens": [
 
936
  "<extra_id_98>",
937
  "<extra_id_99>"
938
  ],
939
+ "bos_token": "<s>",
940
  "clean_up_tokenization_spaces": true,
941
+ "do_basic_tokenize": true,
942
+ "do_lower_case": true,
943
  "eos_token": "</s>",
944
  "extra_ids": 100,
945
  "legacy": true,
946
  "model_max_length": 1000000000000000019884624838656,
947
  "pad_token": "<pad>",
948
+ "padding": true,
949
  "sp_model_kwargs": {},
950
  "tokenizer_class": "T5Tokenizer",
951
  "unk_token": "<unk>"