MarkGG committed on
Commit
5cc2898
·
1 Parent(s): 0b51d77

Upload tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[ex0]": 31972,
3
+ "[ex10]": 31982,
4
+ "[ex11]": 31983,
5
+ "[ex12]": 31984,
6
+ "[ex13]": 31985,
7
+ "[ex14]": 31986,
8
+ "[ex15]": 31987,
9
+ "[ex16]": 31988,
10
+ "[ex17]": 31989,
11
+ "[ex18]": 31990,
12
+ "[ex19]": 31991,
13
+ "[ex1]": 31973,
14
+ "[ex20]": 31992,
15
+ "[ex21]": 31993,
16
+ "[ex22]": 31994,
17
+ "[ex23]": 31995,
18
+ "[ex24]": 31996,
19
+ "[ex25]": 31997,
20
+ "[ex26]": 31998,
21
+ "[ex27]": 31999,
22
+ "[ex28]": 32000,
23
+ "[ex29]": 32001,
24
+ "[ex2]": 31974,
25
+ "[ex30]": 32002,
26
+ "[ex31]": 32003,
27
+ "[ex32]": 32004,
28
+ "[ex33]": 32005,
29
+ "[ex34]": 32006,
30
+ "[ex35]": 32007,
31
+ "[ex36]": 32008,
32
+ "[ex37]": 32009,
33
+ "[ex38]": 32010,
34
+ "[ex39]": 32011,
35
+ "[ex3]": 31975,
36
+ "[ex40]": 32012,
37
+ "[ex41]": 32013,
38
+ "[ex42]": 32014,
39
+ "[ex43]": 32015,
40
+ "[ex44]": 32016,
41
+ "[ex45]": 32017,
42
+ "[ex46]": 32018,
43
+ "[ex47]": 32019,
44
+ "[ex48]": 32020,
45
+ "[ex49]": 32021,
46
+ "[ex4]": 31976,
47
+ "[ex50]": 32022,
48
+ "[ex51]": 32023,
49
+ "[ex52]": 32024,
50
+ "[ex53]": 32025,
51
+ "[ex54]": 32026,
52
+ "[ex55]": 32027,
53
+ "[ex56]": 32028,
54
+ "[ex57]": 32029,
55
+ "[ex58]": 32030,
56
+ "[ex59]": 32031,
57
+ "[ex5]": 31977,
58
+ "[ex60]": 32032,
59
+ "[ex61]": 32033,
60
+ "[ex62]": 32034,
61
+ "[ex63]": 32035,
62
+ "[ex64]": 32036,
63
+ "[ex65]": 32037,
64
+ "[ex66]": 32038,
65
+ "[ex67]": 32039,
66
+ "[ex68]": 32040,
67
+ "[ex69]": 32041,
68
+ "[ex6]": 31978,
69
+ "[ex70]": 32042,
70
+ "[ex71]": 32043,
71
+ "[ex72]": 32044,
72
+ "[ex73]": 32045,
73
+ "[ex74]": 32046,
74
+ "[ex75]": 32047,
75
+ "[ex76]": 32048,
76
+ "[ex77]": 32049,
77
+ "[ex78]": 32050,
78
+ "[ex79]": 32051,
79
+ "[ex7]": 31979,
80
+ "[ex80]": 32052,
81
+ "[ex81]": 32053,
82
+ "[ex82]": 32054,
83
+ "[ex83]": 32055,
84
+ "[ex84]": 32056,
85
+ "[ex85]": 32057,
86
+ "[ex86]": 32058,
87
+ "[ex87]": 32059,
88
+ "[ex88]": 32060,
89
+ "[ex89]": 32061,
90
+ "[ex8]": 31980,
91
+ "[ex90]": 32062,
92
+ "[ex91]": 32063,
93
+ "[ex92]": 32064,
94
+ "[ex93]": 32065,
95
+ "[ex94]": 32066,
96
+ "[ex95]": 32067,
97
+ "[ex96]": 32068,
98
+ "[ex97]": 32069,
99
+ "[ex98]": 32070,
100
+ "[ex99]": 32071,
101
+ "[ex9]": 31981,
102
+ "[frl]": 31970,
103
+ "[mrl]": 31971,
104
+ "ext.": 32072,
105
+ "int.": 32073
106
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "unk_token": "<|endoftext|>"
5
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1024,
6
+ "name_or_path": "gpt2",
7
+ "special_tokens_map_file": null,
8
+ "tokenizer_class": "GPT2Tokenizer",
9
+ "unk_token": "<|endoftext|>"
10
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff