rigarashi committed
Commit f9df6a2 · verified · 1 Parent(s): 30fe0f3

Upload tokenizer

added_tokens.json CHANGED
@@ -1,3 +1,3 @@
 {
-  "<|endoftext|>": 25597
+  "<|endoftext|>": 25629
 }
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -1,24 +1,6 @@
 {
-  "bos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
   "pad_token": "<|endoftext|>",
-  "unk_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  }
+  "unk_token": "<|endoftext|>"
 }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -41,7 +41,7 @@
     "single_word": false,
     "special": true
   },
-  "25597": {
+  "25629": {
     "content": "<|endoftext|>",
     "lstrip": false,
     "normalized": false,
@@ -53,15 +53,8 @@
   "bos_token": "<|endoftext|>",
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|endoftext|>",
-  "max_length": 128,
   "model_max_length": 1000000000000000019884624838656,
-  "pad_to_multiple_of": null,
   "pad_token": "<|endoftext|>",
-  "pad_token_type_id": 0,
-  "padding_side": "right",
-  "stride": 0,
   "tokenizer_class": "GPT2Tokenizer",
-  "truncation_side": "right",
-  "truncation_strategy": "longest_first",
   "unk_token": "<|endoftext|>"
 }
vocab.json CHANGED
The diff for this file is too large to render. See raw diff
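Taken together, this commit re-maps the <|endoftext|> special token from id 25597 to id 25629 and collapses the bos/eos/pad/unk entries to their plain string form. A minimal sketch of loading the uploaded tokenizer and checking the new id follows; the repository id below is a placeholder assumption, not part of this commit.

from transformers import AutoTokenizer

# Placeholder repo id -- replace with the repository this tokenizer was uploaded to.
tokenizer = AutoTokenizer.from_pretrained("username/model-name")

# After this commit, <|endoftext|> serves as bos/eos/pad/unk and maps to id 25629.
print(tokenizer.eos_token)                               # <|endoftext|>
print(tokenizer.convert_tokens_to_ids("<|endoftext|>"))  # expected: 25629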