Tokenizer comment 'Train Existing Classifier with Contrastive Training'

Files changed (3)

special_tokens_map.json CHANGED

@@ -34,6 +34,13 @@
     "rstrip": false,
     "single_word": false
   },
   "unk_token": {
     "content": "[UNK]",
     "lstrip": false,

     "rstrip": false,
     "single_word": false
   },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "unk_token": {
     "content": "[UNK]",
     "lstrip": false,

tokenizer.json CHANGED

@@ -1,7 +1,19 @@
 {
   "version": "1.0",
-  "truncation": null,
-  "padding": null,
   "added_tokens": [
     {
       "id": 0,
@@ -47,6 +59,15 @@
       "rstrip": false,
       "normalized": false,
       "special": true
     }
   ],
   "normalizer": {
@@ -70,10 +91,28 @@
     "use_regex": true
   },
   "post_processor": {
-    "type": "ByteLevel",
-    "add_prefix_space": true,
-    "trim_offsets": true,
-    "use_regex": true
   },
   "decoder": {
     "type": "ByteLevel",

 {
   "version": "1.0",
+  "truncation": {
+    "direction": "Right",
+    "max_length": 512,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
+  "padding": {
+    "strategy": "BatchLongest",
+    "direction": "Right",
+    "pad_to_multiple_of": null,
+    "pad_id": 1,
+    "pad_type_id": 0,
+    "pad_token": "[PAD]"
+  },
   "added_tokens": [
     {
       "id": 0,
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 75001,
+      "content": "[SEP]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": {
     "use_regex": true
   },
   "post_processor": {
+    "type": "Sequence",
+    "processors": [
+      {
+        "type": "ByteLevel",
+        "add_prefix_space": true,
+        "trim_offsets": true,
+        "use_regex": true
+      },
+      {
+        "type": "RobertaProcessing",
+        "sep": [
+          "[SEP]",
+          75001
+        ],
+        "cls": [
+          "[CLS]",
+          75000
+        ],
+        "trim_offsets": true,
+        "add_prefix_space": true
+      }
+    ]
   },
   "decoder": {
     "type": "ByteLevel",

tokenizer_config.json CHANGED

@@ -39,6 +39,14 @@
       "rstrip": false,
       "single_word": false,
       "special": true
     }
   },
   "bos_token": "[CLS]",
@@ -53,6 +61,7 @@
   "pad_token": "[PAD]",
   "pad_token_type_id": 0,
   "padding_side": "right",
   "stride": 0,
   "tokenizer_class": "PreTrainedTokenizerFast",
   "truncation_side": "right",

       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "75001": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "bos_token": "[CLS]",
   "pad_token": "[PAD]",
   "pad_token_type_id": 0,
   "padding_side": "right",
+  "sep_token": "[SEP]",
   "stride": 0,
   "tokenizer_class": "PreTrainedTokenizerFast",
   "truncation_side": "right",