gsaltintas committed on
Commit
23c9c8f
·
verified ·
1 Parent(s): 3d30a20

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. README.md +5 -0
  2. vocab.json +11 -11
README.md CHANGED
@@ -42,3 +42,8 @@ tokens = tokenizer.encode("Hello, world!")
42
  - `tokenizer.json` — Full HuggingFace tokenizer
43
  - `vocab.json` — Vocabulary mapping
44
  - `merges.txt` — BPE merge rules
 
 
 
 
 
 
42
  - `tokenizer.json` — Full HuggingFace tokenizer
43
  - `vocab.json` — Vocabulary mapping
44
  - `merges.txt` — BPE merge rules
45
+
46
+ ## Sample Encoding
47
+ | Text | Tokens | Token IDs |
48
+ |------|--------|-----------|
49
+ | `12345009 mod 67` | `1, 2, 3, 4, 5, 0, 0, 9, ' ', mod, ' ', 6, 7` | `8, 9, 10, 11, 12, 7, 7, 16, 6, 4, 6, 13, 14` |
vocab.json CHANGED
@@ -1,19 +1,19 @@
1
  {
2
- "6": 13,
3
- "4": 11,
4
- "1": 8,
5
- "3": 10,
6
  "<unk>": 0,
 
7
  "2": 9,
 
8
  "0": 7,
9
- "9": 16,
10
- "5": 12,
11
- "<pad>": 3,
12
  "</s>": 2,
13
- "7": 14,
14
- "=": 5,
15
  "<s>": 1,
16
- " ": 6,
 
17
  "mod": 4,
18
- "8": 15
 
 
 
 
 
19
  }
 
1
  {
 
 
 
 
2
  "<unk>": 0,
3
+ "8": 15,
4
  "2": 9,
5
+ "1": 8,
6
  "0": 7,
7
+ "6": 13,
 
 
8
  "</s>": 2,
 
 
9
  "<s>": 1,
10
+ "9": 16,
11
+ "7": 14,
12
  "mod": 4,
13
+ "<pad>": 3,
14
+ " ": 6,
15
+ "=": 5,
16
+ "5": 12,
17
+ "3": 10,
18
+ "4": 11
19
  }