vojtam committed on
Commit
a810689
·
verified ·
1 Parent(s): 19ed2d3

Upload tokenizer.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer.json +115 -0
tokenizer.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<unk>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[PAD]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<|endoftext|>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": null,
35
+ "pre_tokenizer": {
36
+ "type": "ByteLevel",
37
+ "add_prefix_space": false,
38
+ "trim_offsets": true,
39
+ "use_regex": true
40
+ },
41
+ "post_processor": {
42
+ "type": "ByteLevel",
43
+ "add_prefix_space": true,
44
+ "trim_offsets": false,
45
+ "use_regex": true
46
+ },
47
+ "decoder": {
48
+ "type": "ByteLevel",
49
+ "add_prefix_space": true,
50
+ "trim_offsets": true,
51
+ "use_regex": true
52
+ },
53
+ "model": {
54
+ "type": "BPE",
55
+ "dropout": null,
56
+ "unk_token": null,
57
+ "continuing_subword_prefix": "",
58
+ "end_of_word_suffix": "",
59
+ "fuse_unk": false,
60
+ "byte_fallback": false,
61
+ "ignore_merges": false,
62
+ "vocab": {
63
+ "<unk>": 0,
64
+ "[PAD]": 1,
65
+ "<|endoftext|>": 2,
66
+ "AA": 3,
67
+ "TT": 4,
68
+ "TG": 5,
69
+ "CA": 6,
70
+ "CC": 7,
71
+ "TA": 8,
72
+ "GG": 9,
73
+ "TC": 10,
74
+ "A": 11,
75
+ "T": 12,
76
+ "G": 13,
77
+ "C": 14,
78
+ "▁": 15
79
+ },
80
+ "merges": [
81
+ [
82
+ "A",
83
+ "A"
84
+ ],
85
+ [
86
+ "T",
87
+ "T"
88
+ ],
89
+ [
90
+ "T",
91
+ "G"
92
+ ],
93
+ [
94
+ "C",
95
+ "A"
96
+ ],
97
+ [
98
+ "C",
99
+ "C"
100
+ ],
101
+ [
102
+ "T",
103
+ "A"
104
+ ],
105
+ [
106
+ "G",
107
+ "G"
108
+ ],
109
+ [
110
+ "T",
111
+ "C"
112
+ ]
113
+ ]
114
+ }
115
+ }