awni committed on
Commit
919175d
·
verified ·
1 Parent(s): add7c7c

Upload 2 files

Browse files
Files changed (2) hide show
  1. tokenization_kimi.py +1 -1
  2. tokenizer_config.json +48 -8
tokenization_kimi.py CHANGED
@@ -16,7 +16,7 @@ from shutil import copyfile
16
  from tiktoken.load import load_tiktoken_bpe
17
  from tokenizers import AddedToken, pre_tokenizers, Regex
18
  from transformers.tokenization_utils import PreTrainedTokenizer
19
- from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
20
  from typing import Any
21
 
22
 
 
16
  from tiktoken.load import load_tiktoken_bpe
17
  from tokenizers import AddedToken, pre_tokenizers, Regex
18
  from transformers.tokenization_utils import PreTrainedTokenizer
19
+ from transformers.convert_slow_tokenizer import bytes_to_unicode
20
  from typing import Any
21
 
22
 
tokenizer_config.json CHANGED
@@ -72,6 +72,46 @@
72
  "single_word": false,
73
  "special": true
74
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  "163601": {
76
  "content": "<|im_middle|>",
77
  "lstrip": false,
@@ -107,12 +147,6 @@
107
  "<|im_system|>",
108
  "<|im_middle|>"
109
  ],
110
- "auto_map": {
111
- "AutoTokenizer": [
112
- "tokenization_kimi.TikTokenTokenizer",
113
- null
114
- ]
115
- },
116
  "bos_token": "[BOS]",
117
  "clean_up_tokenization_spaces": false,
118
  "eos_token": "[EOS]",
@@ -120,5 +154,11 @@
120
  "model_max_length": 1000000000000000019884624838656,
121
  "pad_token": "[PAD]",
122
  "tokenizer_class": "TikTokenTokenizer",
123
- "unk_token": "[UNK]"
124
- }
 
 
 
 
 
 
 
72
  "single_word": false,
73
  "special": true
74
  },
75
+ "163595": {
76
+ "content": "<|tool_calls_section_begin|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "163596": {
84
+ "content": "<|tool_calls_section_end|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "163597": {
92
+ "content": "<|tool_call_begin|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "163598": {
100
+ "content": "<|tool_call_argument_begin|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "163599": {
108
+ "content": "<|tool_call_end|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
  "163601": {
116
  "content": "<|im_middle|>",
117
  "lstrip": false,
 
147
  "<|im_system|>",
148
  "<|im_middle|>"
149
  ],
 
 
 
 
 
 
150
  "bos_token": "[BOS]",
151
  "clean_up_tokenization_spaces": false,
152
  "eos_token": "[EOS]",
 
154
  "model_max_length": 1000000000000000019884624838656,
155
  "pad_token": "[PAD]",
156
  "tokenizer_class": "TikTokenTokenizer",
157
+ "unk_token": "[UNK]",
158
+ "auto_map": {
159
+ "AutoTokenizer": [
160
+ "tokenization_kimi.TikTokenTokenizer",
161
+ null
162
+ ]
163
+ }
164
+ }