dstilesr committed on
Commit
f1bc79d
·
verified ·
1 Parent(s): f64cdaf

Tokenizer comment 'Train Existing Classifier with Contrastive Training'

Browse files
special_tokens_map.json CHANGED
@@ -34,6 +34,13 @@
34
  "rstrip": false,
35
  "single_word": false
36
  },
 
 
 
 
 
 
 
37
  "unk_token": {
38
  "content": "[UNK]",
39
  "lstrip": false,
 
34
  "rstrip": false,
35
  "single_word": false
36
  },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
  "unk_token": {
45
  "content": "[UNK]",
46
  "lstrip": false,
tokenizer.json CHANGED
@@ -1,7 +1,19 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
4
- "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
@@ -47,6 +59,15 @@
47
  "rstrip": false,
48
  "normalized": false,
49
  "special": true
 
 
 
 
 
 
 
 
 
50
  }
51
  ],
52
  "normalizer": {
@@ -70,10 +91,28 @@
70
  "use_regex": true
71
  },
72
  "post_processor": {
73
- "type": "ByteLevel",
74
- "add_prefix_space": true,
75
- "trim_offsets": true,
76
- "use_regex": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  },
78
  "decoder": {
79
  "type": "ByteLevel",
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 512,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": "BatchLongest",
11
+ "direction": "Right",
12
+ "pad_to_multiple_of": null,
13
+ "pad_id": 1,
14
+ "pad_type_id": 0,
15
+ "pad_token": "[PAD]"
16
+ },
17
  "added_tokens": [
18
  {
19
  "id": 0,
 
59
  "rstrip": false,
60
  "normalized": false,
61
  "special": true
62
+ },
63
+ {
64
+ "id": 75001,
65
+ "content": "[SEP]",
66
+ "single_word": false,
67
+ "lstrip": false,
68
+ "rstrip": false,
69
+ "normalized": false,
70
+ "special": true
71
  }
72
  ],
73
  "normalizer": {
 
91
  "use_regex": true
92
  },
93
  "post_processor": {
94
+ "type": "Sequence",
95
+ "processors": [
96
+ {
97
+ "type": "ByteLevel",
98
+ "add_prefix_space": true,
99
+ "trim_offsets": true,
100
+ "use_regex": true
101
+ },
102
+ {
103
+ "type": "RobertaProcessing",
104
+ "sep": [
105
+ "[SEP]",
106
+ 75001
107
+ ],
108
+ "cls": [
109
+ "[CLS]",
110
+ 75000
111
+ ],
112
+ "trim_offsets": true,
113
+ "add_prefix_space": true
114
+ }
115
+ ]
116
  },
117
  "decoder": {
118
  "type": "ByteLevel",
tokenizer_config.json CHANGED
@@ -39,6 +39,14 @@
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
 
 
 
 
 
 
 
 
42
  }
43
  },
44
  "bos_token": "[CLS]",
@@ -53,6 +61,7 @@
53
  "pad_token": "[PAD]",
54
  "pad_token_type_id": 0,
55
  "padding_side": "right",
 
56
  "stride": 0,
57
  "tokenizer_class": "PreTrainedTokenizerFast",
58
  "truncation_side": "right",
 
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
42
+ },
43
+ "75001": {
44
+ "content": "[SEP]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
  }
51
  },
52
  "bos_token": "[CLS]",
 
61
  "pad_token": "[PAD]",
62
  "pad_token_type_id": 0,
63
  "padding_side": "right",
64
+ "sep_token": "[SEP]",
65
  "stride": 0,
66
  "tokenizer_class": "PreTrainedTokenizerFast",
67
  "truncation_side": "right",