Upload folder using huggingface_hub
Browse files
- README.md +5 -0
- vocab.json +11 -11
README.md
CHANGED
|
@@ -42,3 +42,8 @@ tokens = tokenizer.encode("Hello, world!")
|
|
| 42 |
- `tokenizer.json` — Full HuggingFace tokenizer
|
| 43 |
- `vocab.json` — Vocabulary mapping
|
| 44 |
- `merges.txt` — BPE merge rules
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
- `tokenizer.json` — Full HuggingFace tokenizer
|
| 43 |
- `vocab.json` — Vocabulary mapping
|
| 44 |
- `merges.txt` — BPE merge rules
|
| 45 |
+
|
| 46 |
+
## Sample Encoding
|
| 47 |
+
| Text | Tokens | Token IDs |
|
| 48 |
+
|------|--------|-----------|
|
| 49 |
+
| `12345009 mod 67` | `1, 2, 3, 4, 5, 0, 0, 9, " ", mod, " ", 6, 7` | `8, 9, 10, 11, 12, 7, 7, 16, 6, 4, 6, 13, 14` |
|
vocab.json
CHANGED
|
@@ -1,19 +1,19 @@
|
|
| 1 |
{
|
| 2 |
-
"6": 13,
|
| 3 |
-
"4": 11,
|
| 4 |
-
"1": 8,
|
| 5 |
-
"3": 10,
|
| 6 |
"<unk>": 0,
|
|
|
|
| 7 |
"2": 9,
|
|
|
|
| 8 |
"0": 7,
|
| 9 |
-
"
|
| 10 |
-
"5": 12,
|
| 11 |
-
"<pad>": 3,
|
| 12 |
"</s>": 2,
|
| 13 |
-
"7": 14,
|
| 14 |
-
"=": 5,
|
| 15 |
"<s>": 1,
|
| 16 |
-
"
|
|
|
|
| 17 |
"mod": 4,
|
| 18 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
}
|
|
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"<unk>": 0,
|
| 3 |
+
"8": 15,
|
| 4 |
"2": 9,
|
| 5 |
+
"1": 8,
|
| 6 |
"0": 7,
|
| 7 |
+
"6": 13,
|
|
|
|
|
|
|
| 8 |
"</s>": 2,
|
|
|
|
|
|
|
| 9 |
"<s>": 1,
|
| 10 |
+
"9": 16,
|
| 11 |
+
"7": 14,
|
| 12 |
"mod": 4,
|
| 13 |
+
"<pad>": 3,
|
| 14 |
+
" ": 6,
|
| 15 |
+
"=": 5,
|
| 16 |
+
"5": 12,
|
| 17 |
+
"3": 10,
|
| 18 |
+
"4": 11
|
| 19 |
}
|