Pretrained smaller model
Files changed:
- special_tokens_map.json +7 -0
- tokenizer.json +31 -4
- tokenizer_config.json +9 -0
special_tokens_map.json CHANGED

```diff
@@ -34,6 +34,13 @@
     "rstrip": false,
     "single_word": false
   },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "unk_token": {
     "content": "[UNK]",
     "lstrip": false,
```
tokenizer.json CHANGED

```diff
@@ -47,6 +47,15 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 75001,
+      "content": "[SEP]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": {
@@ -70,10 +79,28 @@
     "use_regex": true
   },
   "post_processor": {
-    "type": "ByteLevel",
-    "add_prefix_space": true,
-    "trim_offsets": true,
-    "use_regex": true
+    "type": "Sequence",
+    "processors": [
+      {
+        "type": "ByteLevel",
+        "add_prefix_space": true,
+        "trim_offsets": true,
+        "use_regex": true
+      },
+      {
+        "type": "RobertaProcessing",
+        "sep": [
+          "[SEP]",
+          75001
+        ],
+        "cls": [
+          "[CLS]",
+          75000
+        ],
+        "trim_offsets": true,
+        "add_prefix_space": true
+      }
+    ]
   },
   "decoder": {
     "type": "ByteLevel",
```
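The post-processor becomes a Sequence: the original ByteLevel step stays first, and a RobertaProcessing step is appended so every encoding is framed with [CLS] (id 75000) and [SEP] (id 75001). A minimal sketch of the expected behavior, assuming the updated tokenizer.json is saved locally:

```python
# Sketch: exercise the new Sequence post-processor with the tokenizers
# library. Assumes the updated tokenizer.json is in the working directory.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")

enc = tok.encode("hello world")
# ByteLevel runs first (offset trimming); RobertaProcessing then wraps the
# ids as [CLS] ... [SEP], i.e. 75000 ... 75001.
assert enc.ids[0] == 75000 and enc.ids[-1] == 75001

pair = tok.encode("first segment", "second segment")
# RobertaProcessing joins a pair as [CLS] A [SEP] [SEP] B [SEP].
print(pair.tokens)
```

Keeping ByteLevel as the first processor preserves the offset trimming the previous post-processor performed, while RobertaProcessing adds the CLS/SEP framing on top.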
tokenizer_config.json CHANGED

```diff
@@ -39,6 +39,14 @@
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "75001": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "bos_token": "[CLS]",
@@ -49,6 +57,7 @@
   "mask_token": "[MASK]",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
   "tokenizer_class": "PreTrainedTokenizerFast",
   "unk_token": "[UNK]"
 }
```
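With the "75001" entry in the added-tokens map and the top-level "sep_token" both in place, tokenizer_config.json agrees with tokenizer.json. A quick round-trip check (a sketch; "./" assumes the updated files sit in the current directory):

```python
# Sketch: confirm the config wires [SEP] up end to end.
# "./" is an assumption: point it at the directory holding these files.
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast.from_pretrained("./")

print(tok.sep_token)                       # "[SEP]"
print(tok.convert_tokens_to_ids("[SEP]"))  # 75001
```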