GoshKolotyan committed on
Commit
b97b002
·
verified ·
1 Parent(s): da7dd17

Initial upload of perovskite tokenizer - tokenizer_config.json

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +60 -0
tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 5000,
3
+ "elements": [
4
+ "DMA",
5
+ "Na",
6
+ "Te",
7
+ "Bi",
8
+ "Sb",
9
+ "Ti",
10
+ "Tb",
11
+ "Mg",
12
+ "Li",
13
+ "Cd",
14
+ "Yb",
15
+ "Ni",
16
+ "Nb",
17
+ "Mn",
18
+ "Cu",
19
+ "Tl",
20
+ "Zn",
21
+ "Br",
22
+ "Sr",
23
+ "Hg",
24
+ "Pd",
25
+ "Ge",
26
+ "Cl",
27
+ "In",
28
+ "Sn",
29
+ "Au",
30
+ "Fe",
31
+ "La",
32
+ "Pb",
33
+ "Rb",
34
+ "FA",
35
+ "Co",
36
+ "Ba",
37
+ "Cs",
38
+ "Ag",
39
+ "Ga",
40
+ "Se",
41
+ "MA",
42
+ "P",
43
+ "Y",
44
+ "S",
45
+ "K",
46
+ "I",
47
+ "F"
48
+ ],
49
+ "num_elements": 44,
50
+ "tokenizer_type": "BPE",
51
+ "special_tokens": [
52
+ "[PAD]",
53
+ "[UNK]",
54
+ "[CLS]",
55
+ "[SEP]",
56
+ "[MASK]"
57
+ ],
58
+ "description": "Chemical-aware tokenizer for perovskite formulas",
59
+ "usage": "Tokenizes chemical formulas while preserving chemical entity boundaries"
60
+ }