Ezekiel999 committed on
Commit
6688fc3
·
verified ·
1 Parent(s): c562521

Upload tokenizer_config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +60 -0
tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "AksaraTokenizer",
3
+ "model_type": "aksarallm",
4
+ "vocab_size": 32768,
5
+ "special_tokens": {
6
+ "[PAD]": 0,
7
+ "[EOS]": 1,
8
+ "[BOS]": 2,
9
+ "[UNK]": 3,
10
+ "[SEP]": 4,
11
+ "[MASK]": 5,
12
+ "[SYSTEM]": 6,
13
+ "[USER]": 7,
14
+ "[ASST]": 8,
15
+ "[INST]": 9,
16
+ "[/INST]": 10,
17
+ "[LANG_ID]": 11,
18
+ "[LANG_JV]": 12,
19
+ "[LANG_SU]": 13,
20
+ "[LANG_BL]": 14,
21
+ "[LANG_BT]": 15,
22
+ "[LANG_BG]": 16,
23
+ "[LANG_MN]": 17,
24
+ "[LANG_MD]": 18,
25
+ "[LANG_AC]": 19,
26
+ "[LANG_BJ]": 20,
27
+ "[LANG_EN]": 21,
28
+ "[TURN]": 22,
29
+ "[THINK]": 23,
30
+ "[/THINK]": 24,
31
+ "[CODE]": 25,
32
+ "[/CODE]": 26,
33
+ "[CITE]": 27,
34
+ "[NEWLINE]": 28
35
+ },
36
+ "pad_token": "[PAD]",
37
+ "eos_token": "[EOS]",
38
+ "bos_token": "[BOS]",
39
+ "unk_token": "[UNK]",
40
+ "sep_token": "[SEP]",
41
+ "mask_token": "[MASK]",
42
+ "max_length": 512,
43
+ "pre_tokenizer": "ByteLevel",
44
+ "model": "BPE",
45
+ "version": "1.0.0",
46
+ "language": [
47
+ "id",
48
+ "jv",
49
+ "su",
50
+ "ban",
51
+ "btk",
52
+ "bug",
53
+ "min",
54
+ "mad",
55
+ "ace",
56
+ "bjn",
57
+ "en"
58
+ ],
59
+ "description": "Custom BPE tokenizer for AksaraLLM, optimized for Indonesian and local languages"
60
+ }