Upload tokenizer_config.json with huggingface_hub
Browse files- tokenizer_config.json +12 -11
tokenizer_config.json
CHANGED
|
@@ -1,16 +1,13 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
| 3 |
-
"
|
|
|
|
|
|
|
| 4 |
"vocab_size": 12000,
|
| 5 |
"pad_id": 0,
|
| 6 |
"bos_id": 1,
|
| 7 |
"eos_id": 2,
|
| 8 |
"unk_id": 3,
|
| 9 |
-
"character_coverage": 0.9999,
|
| 10 |
-
"byte_fallback": false,
|
| 11 |
-
"split_digits": false,
|
| 12 |
-
"normalization_rule_name": "identity",
|
| 13 |
-
"reserved_symbols_count": 6806,
|
| 14 |
"domains": [
|
| 15 |
"ROS",
|
| 16 |
"HTTP",
|
|
@@ -23,13 +20,11 @@
|
|
| 23 |
"CAL",
|
| 24 |
"FILE"
|
| 25 |
],
|
| 26 |
-
"
|
| 27 |
"<SCHEMA>",
|
| 28 |
"</SCHEMA>",
|
| 29 |
"<TASK>",
|
| 30 |
"</TASK>",
|
| 31 |
-
"<DOMAIN>",
|
| 32 |
-
"</DOMAIN>",
|
| 33 |
"<JSON>",
|
| 34 |
"</JSON>",
|
| 35 |
"<ACTION>",
|
|
@@ -37,6 +32,12 @@
|
|
| 37 |
"<META>",
|
| 38 |
"</META>"
|
| 39 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
"license": "apache-2.0",
|
| 41 |
-
"
|
| 42 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"name": "SAM Tokenizer",
|
| 3 |
+
"architecture": "NexusBPE",
|
| 4 |
+
"organization": "AMFORGE",
|
| 5 |
+
"version": "1.0",
|
| 6 |
"vocab_size": 12000,
|
| 7 |
"pad_id": 0,
|
| 8 |
"bos_id": 1,
|
| 9 |
"eos_id": 2,
|
| 10 |
"unk_id": 3,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"domains": [
|
| 12 |
"ROS",
|
| 13 |
"HTTP",
|
|
|
|
| 20 |
"CAL",
|
| 21 |
"FILE"
|
| 22 |
],
|
| 23 |
+
"structural_markers": [
|
| 24 |
"<SCHEMA>",
|
| 25 |
"</SCHEMA>",
|
| 26 |
"<TASK>",
|
| 27 |
"</TASK>",
|
|
|
|
|
|
|
| 28 |
"<JSON>",
|
| 29 |
"</JSON>",
|
| 30 |
"<ACTION>",
|
|
|
|
| 32 |
"<META>",
|
| 33 |
"</META>"
|
| 34 |
],
|
| 35 |
+
"guarantees": [
|
| 36 |
+
"atomic numerics in supported ranges",
|
| 37 |
+
"atomic domain markers and structural tags",
|
| 38 |
+
"deterministic encoding",
|
| 39 |
+
"bit-perfect roundtrip on structured payloads"
|
| 40 |
+
],
|
| 41 |
"license": "apache-2.0",
|
| 42 |
+
"homepage": "https://huggingface.co/AMFORGE/sam_tokenizer"
|
| 43 |
}
|