Franso
/

reinvent_42M_64

@@ -4,7 +4,7 @@
   "padding": null,
   "added_tokens": [
     {
-      "id": 33,
       "content": "<pad>",
       "single_word": false,
       "lstrip": false,
@@ -13,7 +13,7 @@
       "special": true
     },
     {
-      "id": 34,
       "content": "<s>",
       "single_word": false,
       "lstrip": false,
@@ -22,7 +22,7 @@
       "special": true
     },
     {
-      "id": 35,
       "content": "</s>",
       "single_word": false,
       "lstrip": false,
@@ -31,7 +31,7 @@
       "special": true
     },
     {
-      "id": 36,
       "content": "<unk>",
       "single_word": false,
       "lstrip": false,
@@ -44,7 +44,7 @@
   "pre_tokenizer": {
     "type": "Split",
     "pattern": {
-      "Regex": "(|)"
     },
     "behavior": "Isolated",
     "invert": false
@@ -65,39 +65,175 @@
     "ignore_merges": false,
     "vocab": {
       "#": 0,
-      "(": 1,
-      ")": 2,
-      "+": 3,
-      "-": 4,
-      "/": 5,
-      "1": 6,
-      "2": 7,
-      "3": 8,
-      "4": 9,
-      "5": 10,
-      "6": 11,
-      "=": 12,
-      "@": 13,
-      "B": 14,
-      "C": 15,
-      "F": 16,
-      "H": 17,
-      "I": 18,
-      "N": 19,
-      "O": 20,
-      "P": 21,
-      "S": 22,
-      "[": 23,
-      "\\": 24,
-      "]": 25,
-      "c": 26,
-      "i": 27,
-      "l": 28,
-      "n": 29,
-      "o": 30,
-      "r": 31,
-      "s": 32
     },
-    "merges": []
   }
 }

   "padding": null,
   "added_tokens": [
     {
+      "id": 64,
       "content": "<pad>",
       "single_word": false,
       "lstrip": false,
       "special": true
     },
     {
+      "id": 65,
       "content": "<s>",
       "single_word": false,
       "lstrip": false,
       "special": true
     },
     {
+      "id": 66,
       "content": "</s>",
       "single_word": false,
       "lstrip": false,
       "special": true
     },
     {
+      "id": 67,
       "content": "<unk>",
       "single_word": false,
       "lstrip": false,
   "pre_tokenizer": {
     "type": "Split",
     "pattern": {
+      "Regex": "\\(|\\)"
     },
     "behavior": "Isolated",
     "invert": false
     "ignore_merges": false,
     "vocab": {
       "#": 0,
+      "%": 1,
+      "(": 2,
+      ")": 3,
+      "+": 4,
+      "-": 5,
+      "/": 6,
+      "0": 7,
+      "1": 8,
+      "2": 9,
+      "3": 10,
+      "4": 11,
+      "5": 12,
+      "6": 13,
+      "7": 14,
+      "8": 15,
+      "9": 16,
+      "=": 17,
+      "@": 18,
+      "B": 19,
+      "C": 20,
+      "F": 21,
+      "H": 22,
+      "I": 23,
+      "N": 24,
+      "O": 25,
+      "P": 26,
+      "S": 27,
+      "[": 28,
+      "\\": 29,
+      "]": 30,
+      "c": 31,
+      "i": 32,
+      "l": 33,
+      "n": 34,
+      "o": 35,
+      "r": 36,
+      "s": 37,
+      "cc": 38,
+      "CC": 39,
+      "c1": 40,
+      "=O": 41,
+      "c2": 42,
+      "H]": 43,
+      "[C": 44,
+      "[C@": 45,
+      "c1cc": 46,
+      "[C@@": 47,
+      "c3": 48,
+      "c2cc": 49,
+      "[C@H]": 50,
+      "[C@@H]": 51,
+      "NC": 52,
+      "c1ccc": 53,
+      "CCC": 54,
+      "CO": 55,
+      "cc1": 56,
+      "=C": 57,
+      "c1cccc": 58,
+      "n1": 59,
+      "N1": 60,
+      "nc": 61,
+      "c2cccc": 62,
+      "OC": 63
     },
+    "merges": [
+      [
+        "c",
+        "c"
+      ],
+      [
+        "C",
+        "C"
+      ],
+      [
+        "c",
+        "1"
+      ],
+      [
+        "=",
+        "O"
+      ],
+      [
+        "c",
+        "2"
+      ],
+      [
+        "H",
+        "]"
+      ],
+      [
+        "[",
+        "C"
+      ],
+      [
+        "[C",
+        "@"
+      ],
+      [
+        "c1",
+        "cc"
+      ],
+      [
+        "[C@",
+        "@"
+      ],
+      [
+        "c",
+        "3"
+      ],
+      [
+        "c2",
+        "cc"
+      ],
+      [
+        "[C@",
+        "H]"
+      ],
+      [
+        "[C@@",
+        "H]"
+      ],
+      [
+        "N",
+        "C"
+      ],
+      [
+        "c1cc",
+        "c"
+      ],
+      [
+        "CC",
+        "C"
+      ],
+      [
+        "C",
+        "O"
+      ],
+      [
+        "cc",
+        "1"
+      ],
+      [
+        "=",
+        "C"
+      ],
+      [
+        "c1cc",
+        "cc"
+      ],
+      [
+        "n",
+        "1"
+      ],
+      [
+        "N",
+        "1"
+      ],
+      [
+        "n",
+        "c"
+      ],
+      [
+        "c2cc",
+        "cc"
+      ],
+      [
+        "O",
+        "C"
+      ]
+    ]
   }
 }

tokenizer_config.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "added_tokens_decoder": {
-    "33": {
       "content": "<pad>",
       "lstrip": false,
       "normalized": false,
@@ -8,7 +8,7 @@
       "single_word": false,
       "special": true
     },
-    "34": {
       "content": "<s>",
       "lstrip": false,
       "normalized": false,
@@ -16,7 +16,7 @@
       "single_word": false,
       "special": true
     },
-    "35": {
       "content": "</s>",
       "lstrip": false,
       "normalized": false,
@@ -24,7 +24,7 @@
       "single_word": false,
       "special": true
     },
-    "36": {
       "content": "<unk>",
       "lstrip": false,
       "normalized": false,

 {
   "added_tokens_decoder": {
+    "64": {
       "content": "<pad>",
       "lstrip": false,
       "normalized": false,
       "single_word": false,
       "special": true
     },
+    "65": {
       "content": "<s>",
       "lstrip": false,
       "normalized": false,
       "single_word": false,
       "special": true
     },
+    "66": {
       "content": "</s>",
       "lstrip": false,
       "normalized": false,
       "single_word": false,
       "special": true
     },
+    "67": {
       "content": "<unk>",
       "lstrip": false,
       "normalized": false,