Pinkstack committed on
Commit
a8c9c83
·
verified ·
1 Parent(s): a10ae1b

updated tokenizer for testing may not work properly

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +19 -52
tokenizer_config.json CHANGED
@@ -26,7 +26,7 @@
26
  "special": true
27
  },
28
  "3": {
29
- "content": "<repo_name>",
30
  "lstrip": false,
31
  "normalized": false,
32
  "rstrip": false,
@@ -34,7 +34,7 @@
34
  "special": true
35
  },
36
  "4": {
37
- "content": "<reponame>",
38
  "lstrip": false,
39
  "normalized": false,
40
  "rstrip": false,
@@ -42,7 +42,7 @@
42
  "special": true
43
  },
44
  "5": {
45
- "content": "<file_sep>",
46
  "lstrip": false,
47
  "normalized": false,
48
  "rstrip": false,
@@ -50,7 +50,7 @@
50
  "special": true
51
  },
52
  "6": {
53
- "content": "<filename>",
54
  "lstrip": false,
55
  "normalized": false,
56
  "rstrip": false,
@@ -58,7 +58,7 @@
58
  "special": true
59
  },
60
  "7": {
61
- "content": "<gh_stars>",
62
  "lstrip": false,
63
  "normalized": false,
64
  "rstrip": false,
@@ -66,7 +66,7 @@
66
  "special": true
67
  },
68
  "8": {
69
- "content": "<issue_start>",
70
  "lstrip": false,
71
  "normalized": false,
72
  "rstrip": false,
@@ -74,7 +74,7 @@
74
  "special": true
75
  },
76
  "9": {
77
- "content": "<issue_comment>",
78
  "lstrip": false,
79
  "normalized": false,
80
  "rstrip": false,
@@ -82,7 +82,7 @@
82
  "special": true
83
  },
84
  "10": {
85
- "content": "<issue_closed>",
86
  "lstrip": false,
87
  "normalized": false,
88
  "rstrip": false,
@@ -90,47 +90,7 @@
90
  "special": true
91
  },
92
  "11": {
93
- "content": "<jupyter_start>",
94
- "lstrip": false,
95
- "normalized": false,
96
- "rstrip": false,
97
- "single_word": false,
98
- "special": true
99
- },
100
- "12": {
101
- "content": "<jupyter_text>",
102
- "lstrip": false,
103
- "normalized": false,
104
- "rstrip": false,
105
- "single_word": false,
106
- "special": true
107
- },
108
- "13": {
109
- "content": "<jupyter_code>",
110
- "lstrip": false,
111
- "normalized": false,
112
- "rstrip": false,
113
- "single_word": false,
114
- "special": true
115
- },
116
- "14": {
117
- "content": "<jupyter_output>",
118
- "lstrip": false,
119
- "normalized": false,
120
- "rstrip": false,
121
- "single_word": false,
122
- "special": true
123
- },
124
- "15": {
125
- "content": "<jupyter_script>",
126
- "lstrip": false,
127
- "normalized": false,
128
- "rstrip": false,
129
- "single_word": false,
130
- "special": true
131
- },
132
- "16": {
133
- "content": "<empty_output>",
134
  "lstrip": false,
135
  "normalized": false,
136
  "rstrip": false,
@@ -139,12 +99,19 @@
139
  }
140
  },
141
  "bos_token": "<|im_start|>",
142
- "chat_template": "{% if 'role' in messages[0] %}{% for message in messages %}{% if message['role'] == 'user' %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}{% else %}{% for message in messages %}{% if message['from'] == 'human' %}{{'<|im_start|>user\n' + message['value'] + '<|im_end|>\n'}}{% elif message['from'] == 'gpt' %}{{'<|im_start|>assistant\n' + message['value'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['value'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}{% endif %}",
143
  "clean_up_tokenization_spaces": false,
144
  "eos_token": "<|im_end|>",
145
- "extra_special_tokens": {},
 
 
 
 
 
 
 
146
  "model_max_length": 1000000000000000019884624838656,
147
  "pad_token": "<|endoftext|>",
148
  "tokenizer_class": "GPT2Tokenizer",
149
  "unk_token": "<|endoftext|>"
150
- }
 
26
  "special": true
27
  },
28
  "3": {
29
+ "content": "<think>",
30
  "lstrip": false,
31
  "normalized": false,
32
  "rstrip": false,
 
34
  "special": true
35
  },
36
  "4": {
37
+ "content": "</think>",
38
  "lstrip": false,
39
  "normalized": false,
40
  "rstrip": false,
 
42
  "special": true
43
  },
44
  "5": {
45
+ "content": "<output>",
46
  "lstrip": false,
47
  "normalized": false,
48
  "rstrip": false,
 
50
  "special": true
51
  },
52
  "6": {
53
+ "content": "</output>",
54
  "lstrip": false,
55
  "normalized": false,
56
  "rstrip": false,
 
58
  "special": true
59
  },
60
  "7": {
61
+ "content": "<model_identity>",
62
  "lstrip": false,
63
  "normalized": false,
64
  "rstrip": false,
 
66
  "special": true
67
  },
68
  "8": {
69
+ "content": "</model_identity>",
70
  "lstrip": false,
71
  "normalized": false,
72
  "rstrip": false,
 
74
  "special": true
75
  },
76
  "9": {
77
+ "content": "<repo_name>",
78
  "lstrip": false,
79
  "normalized": false,
80
  "rstrip": false,
 
82
  "special": true
83
  },
84
  "10": {
85
+ "content": "<reponame>",
86
  "lstrip": false,
87
  "normalized": false,
88
  "rstrip": false,
 
90
  "special": true
91
  },
92
  "11": {
93
+ "content": "<file_sep>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  "lstrip": false,
95
  "normalized": false,
96
  "rstrip": false,
 
99
  }
100
  },
101
  "bos_token": "<|im_start|>",
102
+ "chat_template": "{% if 'role' in messages[0] and messages[0]['role'] == 'system' %}<model_identity>Your name is Superthoughts lite by Pinkstack. You are an open weights AI model released in 2025 with built-in information up to 2024.</model_identity>\n{% endif %}{% if 'role' in messages[0] %}{% for message in messages %}{% if message['role'] == 'user' %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|im_start|>assistant\n<think>\n' + message['thinking_content'] + '</think>\n<output>\n' + message['content'] + '</output>\n<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}{% else %}{% for message in messages %}{% if message['from'] == 'human' %}{{'<|im_start|>user\n' + message['value'] + '<|im_end|>\n'}}{% elif message['from'] == 'gpt' %}{{'<|im_start|>assistant\n<think>\n' + message['thinking_value'] + '</think>\n<output>\n' + message['value'] + '</output>\n<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['value'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}{% endif %}",
103
  "clean_up_tokenization_spaces": false,
104
  "eos_token": "<|im_end|>",
105
+ "extra_special_tokens": {
106
+ "think_token": "<think>",
107
+ "think_end_token": "</think>",
108
+ "output_token": "<output>",
109
+ "output_end_token": "</output>",
110
+ "model_identity_token": "<model_identity>",
111
+ "model_identity_end_token": "</model_identity>"
112
+ },
113
  "model_max_length": 1000000000000000019884624838656,
114
  "pad_token": "<|endoftext|>",
115
  "tokenizer_class": "GPT2Tokenizer",
116
  "unk_token": "<|endoftext|>"
117
+ }