ameforge committed on
Commit
ae968c7
·
verified ·
1 Parent(s): 319d2da

Upload tokenizer_config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +12 -11
tokenizer_config.json CHANGED
@@ -1,16 +1,13 @@
1
  {
2
- "tokenizer_class": "SentencePieceProcessor",
3
- "model_type": "bpe",
 
 
4
  "vocab_size": 12000,
5
  "pad_id": 0,
6
  "bos_id": 1,
7
  "eos_id": 2,
8
  "unk_id": 3,
9
- "character_coverage": 0.9999,
10
- "byte_fallback": false,
11
- "split_digits": false,
12
- "normalization_rule_name": "identity",
13
- "reserved_symbols_count": 6806,
14
  "domains": [
15
  "ROS",
16
  "HTTP",
@@ -23,13 +20,11 @@
23
  "CAL",
24
  "FILE"
25
  ],
26
- "structural_tags": [
27
  "<SCHEMA>",
28
  "</SCHEMA>",
29
  "<TASK>",
30
  "</TASK>",
31
- "<DOMAIN>",
32
- "</DOMAIN>",
33
  "<JSON>",
34
  "</JSON>",
35
  "<ACTION>",
@@ -37,6 +32,12 @@
37
  "<META>",
38
  "</META>"
39
  ],
 
 
 
 
 
 
40
  "license": "apache-2.0",
41
- "organization": "AMFORGE"
42
  }
 
1
  {
2
+ "name": "SAM Tokenizer",
3
+ "architecture": "NexusBPE",
4
+ "organization": "AMFORGE",
5
+ "version": "1.0",
6
  "vocab_size": 12000,
7
  "pad_id": 0,
8
  "bos_id": 1,
9
  "eos_id": 2,
10
  "unk_id": 3,
 
 
 
 
 
11
  "domains": [
12
  "ROS",
13
  "HTTP",
 
20
  "CAL",
21
  "FILE"
22
  ],
23
+ "structural_markers": [
24
  "<SCHEMA>",
25
  "</SCHEMA>",
26
  "<TASK>",
27
  "</TASK>",
 
 
28
  "<JSON>",
29
  "</JSON>",
30
  "<ACTION>",
 
32
  "<META>",
33
  "</META>"
34
  ],
35
+ "guarantees": [
36
+ "atomic numerics in supported ranges",
37
+ "atomic domain markers and structural tags",
38
+ "deterministic encoding",
39
+ "bit-perfect roundtrip on structured payloads"
40
+ ],
41
  "license": "apache-2.0",
42
+ "homepage": "https://huggingface.co/AMFORGE/sam_tokenizer"
43
  }