qchapp commited on
Commit
0218dff
·
verified ·
1 Parent(s): 8f107e2

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-2800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-2871/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-2800/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-2800/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151643,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "max_position_embeddings": 32768,
15
+ "max_window_layers": 28,
16
+ "model_type": "qwen3",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 28,
19
+ "num_key_value_heads": 8,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": null,
22
+ "rope_theta": 1000000,
23
+ "sliding_window": null,
24
+ "tie_word_embeddings": true,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.51.3",
27
+ "use_cache": true,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151936
30
+ }
checkpoint-2800/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.51.3"
6
+ }
checkpoint-2800/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2800/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7457ab2fe2401fcf9d0e503c89ae96efe6561e34a70ef69b228912ecb5aba18f
3
+ size 2384234968
checkpoint-2800/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:988e94edaeeed4ae6dc0267d8bca5a6a84353f883e4be58ddd2d23d457d7f670
3
+ size 4768662910
checkpoint-2800/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3fcb8b7132fdda989f7bbb14a5bf464435849629fe731ccbc64c4724068a57e
3
+ size 14244
checkpoint-2800/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7b5a7636a42c2a6084d8cb51bcf6c918bdd46c55757f7d35eaae15c4b9a5dec
3
+ size 988
checkpoint-2800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05442058f80174c35e48cdab408a0ddd35a3c882923c67467ac2efab6773f71c
3
+ size 1064
checkpoint-2800/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-2800/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:352a863cd2761388ccc58f1432467ba6a1037bf12df9069889b142fa246471f6
3
+ size 11422752
checkpoint-2800/tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|endoftext|>",
233
+ "errors": "replace",
234
+ "extra_special_tokens": {},
235
+ "model_max_length": 131072,
236
+ "pad_token": "<|endoftext|>",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
checkpoint-2800/trainer_state.json ADDED
@@ -0,0 +1,2106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2800,
3
+ "best_metric": 0.94057297706604,
4
+ "best_model_checkpoint": "models/MNLP_M3_rag_model_test/checkpoint-2800",
5
+ "epoch": 2.923237597911227,
6
+ "eval_steps": 200,
7
+ "global_step": 2800,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.010443864229765013,
14
+ "grad_norm": 10.195783615112305,
15
+ "learning_rate": 9.000000000000001e-07,
16
+ "loss": 3.4273,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.020887728459530026,
21
+ "grad_norm": 7.810600280761719,
22
+ "learning_rate": 1.9000000000000002e-06,
23
+ "loss": 3.2543,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.031331592689295036,
28
+ "grad_norm": 5.423489570617676,
29
+ "learning_rate": 2.9e-06,
30
+ "loss": 3.0848,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.04177545691906005,
35
+ "grad_norm": 6.003882884979248,
36
+ "learning_rate": 3.900000000000001e-06,
37
+ "loss": 3.0378,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.05221932114882506,
42
+ "grad_norm": 5.395635604858398,
43
+ "learning_rate": 4.9000000000000005e-06,
44
+ "loss": 2.935,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.06266318537859007,
49
+ "grad_norm": 5.4613823890686035,
50
+ "learning_rate": 5.9e-06,
51
+ "loss": 2.8095,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.0731070496083551,
56
+ "grad_norm": 5.638515472412109,
57
+ "learning_rate": 6.9e-06,
58
+ "loss": 2.8053,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.0835509138381201,
63
+ "grad_norm": 5.723353385925293,
64
+ "learning_rate": 7.9e-06,
65
+ "loss": 2.7206,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.09399477806788512,
70
+ "grad_norm": 6.676548480987549,
71
+ "learning_rate": 8.900000000000001e-06,
72
+ "loss": 2.6614,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.10443864229765012,
77
+ "grad_norm": 6.09738302230835,
78
+ "learning_rate": 9.9e-06,
79
+ "loss": 2.4909,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.11488250652741515,
84
+ "grad_norm": 6.762812614440918,
85
+ "learning_rate": 9.967520750631542e-06,
86
+ "loss": 2.4897,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.12532637075718014,
91
+ "grad_norm": 6.7795796394348145,
92
+ "learning_rate": 9.931432695777698e-06,
93
+ "loss": 2.3994,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.13577023498694518,
98
+ "grad_norm": 6.7266669273376465,
99
+ "learning_rate": 9.895344640923855e-06,
100
+ "loss": 2.395,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.1462140992167102,
105
+ "grad_norm": 6.685121536254883,
106
+ "learning_rate": 9.859256586070011e-06,
107
+ "loss": 2.3435,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.1566579634464752,
112
+ "grad_norm": 7.3826985359191895,
113
+ "learning_rate": 9.823168531216168e-06,
114
+ "loss": 2.1847,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.1671018276762402,
119
+ "grad_norm": 6.167830467224121,
120
+ "learning_rate": 9.787080476362326e-06,
121
+ "loss": 2.2339,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.17754569190600522,
126
+ "grad_norm": 9.586319923400879,
127
+ "learning_rate": 9.750992421508482e-06,
128
+ "loss": 2.1565,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.18798955613577023,
133
+ "grad_norm": 9.096244812011719,
134
+ "learning_rate": 9.714904366654639e-06,
135
+ "loss": 2.2398,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.19843342036553524,
140
+ "grad_norm": 9.149866104125977,
141
+ "learning_rate": 9.68242511728618e-06,
142
+ "loss": 2.0402,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.20887728459530025,
147
+ "grad_norm": 7.662464141845703,
148
+ "learning_rate": 9.646337062432336e-06,
149
+ "loss": 1.853,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.20887728459530025,
154
+ "eval_loss": 1.9208216667175293,
155
+ "eval_runtime": 23.2773,
156
+ "eval_samples_per_second": 36.559,
157
+ "eval_steps_per_second": 4.597,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 0.2193211488250653,
162
+ "grad_norm": 7.436556339263916,
163
+ "learning_rate": 9.610249007578492e-06,
164
+ "loss": 1.9207,
165
+ "step": 210
166
+ },
167
+ {
168
+ "epoch": 0.2297650130548303,
169
+ "grad_norm": 7.291353225708008,
170
+ "learning_rate": 9.574160952724649e-06,
171
+ "loss": 1.9513,
172
+ "step": 220
173
+ },
174
+ {
175
+ "epoch": 0.2402088772845953,
176
+ "grad_norm": 7.357730865478516,
177
+ "learning_rate": 9.538072897870805e-06,
178
+ "loss": 1.8785,
179
+ "step": 230
180
+ },
181
+ {
182
+ "epoch": 0.2506527415143603,
183
+ "grad_norm": 7.417892932891846,
184
+ "learning_rate": 9.501984843016962e-06,
185
+ "loss": 1.8304,
186
+ "step": 240
187
+ },
188
+ {
189
+ "epoch": 0.26109660574412535,
190
+ "grad_norm": 7.092027187347412,
191
+ "learning_rate": 9.46589678816312e-06,
192
+ "loss": 1.7244,
193
+ "step": 250
194
+ },
195
+ {
196
+ "epoch": 0.27154046997389036,
197
+ "grad_norm": 8.140602111816406,
198
+ "learning_rate": 9.429808733309276e-06,
199
+ "loss": 1.9412,
200
+ "step": 260
201
+ },
202
+ {
203
+ "epoch": 0.2819843342036554,
204
+ "grad_norm": 6.528462886810303,
205
+ "learning_rate": 9.393720678455433e-06,
206
+ "loss": 1.7654,
207
+ "step": 270
208
+ },
209
+ {
210
+ "epoch": 0.2924281984334204,
211
+ "grad_norm": 6.523935794830322,
212
+ "learning_rate": 9.35763262360159e-06,
213
+ "loss": 1.8401,
214
+ "step": 280
215
+ },
216
+ {
217
+ "epoch": 0.3028720626631854,
218
+ "grad_norm": 6.85488224029541,
219
+ "learning_rate": 9.321544568747746e-06,
220
+ "loss": 1.8402,
221
+ "step": 290
222
+ },
223
+ {
224
+ "epoch": 0.3133159268929504,
225
+ "grad_norm": 6.1414875984191895,
226
+ "learning_rate": 9.285456513893902e-06,
227
+ "loss": 1.682,
228
+ "step": 300
229
+ },
230
+ {
231
+ "epoch": 0.3237597911227154,
232
+ "grad_norm": 7.549343109130859,
233
+ "learning_rate": 9.249368459040059e-06,
234
+ "loss": 1.5124,
235
+ "step": 310
236
+ },
237
+ {
238
+ "epoch": 0.3342036553524804,
239
+ "grad_norm": 7.781056880950928,
240
+ "learning_rate": 9.213280404186215e-06,
241
+ "loss": 1.6474,
242
+ "step": 320
243
+ },
244
+ {
245
+ "epoch": 0.34464751958224543,
246
+ "grad_norm": 6.674058437347412,
247
+ "learning_rate": 9.177192349332372e-06,
248
+ "loss": 1.6971,
249
+ "step": 330
250
+ },
251
+ {
252
+ "epoch": 0.35509138381201044,
253
+ "grad_norm": 8.93964958190918,
254
+ "learning_rate": 9.14110429447853e-06,
255
+ "loss": 1.5545,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 0.36553524804177545,
260
+ "grad_norm": 7.935058116912842,
261
+ "learning_rate": 9.105016239624686e-06,
262
+ "loss": 1.6633,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 0.37597911227154046,
267
+ "grad_norm": 6.446653842926025,
268
+ "learning_rate": 9.068928184770843e-06,
269
+ "loss": 1.6209,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 0.38642297650130547,
274
+ "grad_norm": 9.611429214477539,
275
+ "learning_rate": 9.036448935402382e-06,
276
+ "loss": 1.4615,
277
+ "step": 370
278
+ },
279
+ {
280
+ "epoch": 0.3968668407310705,
281
+ "grad_norm": 8.988059043884277,
282
+ "learning_rate": 9.000360880548538e-06,
283
+ "loss": 1.5815,
284
+ "step": 380
285
+ },
286
+ {
287
+ "epoch": 0.4073107049608355,
288
+ "grad_norm": 7.07716178894043,
289
+ "learning_rate": 8.964272825694695e-06,
290
+ "loss": 1.6719,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 0.4177545691906005,
295
+ "grad_norm": 6.789717674255371,
296
+ "learning_rate": 8.928184770840851e-06,
297
+ "loss": 1.4354,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 0.4177545691906005,
302
+ "eval_loss": 1.4963077306747437,
303
+ "eval_runtime": 23.1593,
304
+ "eval_samples_per_second": 36.746,
305
+ "eval_steps_per_second": 4.62,
306
+ "step": 400
307
+ },
308
+ {
309
+ "epoch": 0.4281984334203655,
310
+ "grad_norm": 6.676051139831543,
311
+ "learning_rate": 8.89209671598701e-06,
312
+ "loss": 1.695,
313
+ "step": 410
314
+ },
315
+ {
316
+ "epoch": 0.4386422976501306,
317
+ "grad_norm": 6.711126327514648,
318
+ "learning_rate": 8.856008661133166e-06,
319
+ "loss": 1.6562,
320
+ "step": 420
321
+ },
322
+ {
323
+ "epoch": 0.4490861618798956,
324
+ "grad_norm": 7.0567731857299805,
325
+ "learning_rate": 8.819920606279322e-06,
326
+ "loss": 1.6646,
327
+ "step": 430
328
+ },
329
+ {
330
+ "epoch": 0.4595300261096606,
331
+ "grad_norm": 6.944768905639648,
332
+ "learning_rate": 8.783832551425479e-06,
333
+ "loss": 1.4864,
334
+ "step": 440
335
+ },
336
+ {
337
+ "epoch": 0.4699738903394256,
338
+ "grad_norm": 6.025828838348389,
339
+ "learning_rate": 8.747744496571635e-06,
340
+ "loss": 1.4261,
341
+ "step": 450
342
+ },
343
+ {
344
+ "epoch": 0.4804177545691906,
345
+ "grad_norm": 7.318626880645752,
346
+ "learning_rate": 8.711656441717792e-06,
347
+ "loss": 1.4183,
348
+ "step": 460
349
+ },
350
+ {
351
+ "epoch": 0.4908616187989556,
352
+ "grad_norm": 9.207475662231445,
353
+ "learning_rate": 8.675568386863948e-06,
354
+ "loss": 1.6308,
355
+ "step": 470
356
+ },
357
+ {
358
+ "epoch": 0.5013054830287206,
359
+ "grad_norm": 7.152507781982422,
360
+ "learning_rate": 8.639480332010105e-06,
361
+ "loss": 1.5118,
362
+ "step": 480
363
+ },
364
+ {
365
+ "epoch": 0.5117493472584856,
366
+ "grad_norm": 6.89570426940918,
367
+ "learning_rate": 8.603392277156261e-06,
368
+ "loss": 1.6088,
369
+ "step": 490
370
+ },
371
+ {
372
+ "epoch": 0.5221932114882507,
373
+ "grad_norm": 8.016292572021484,
374
+ "learning_rate": 8.567304222302419e-06,
375
+ "loss": 1.6461,
376
+ "step": 500
377
+ },
378
+ {
379
+ "epoch": 0.5326370757180157,
380
+ "grad_norm": 5.700518608093262,
381
+ "learning_rate": 8.531216167448576e-06,
382
+ "loss": 1.5497,
383
+ "step": 510
384
+ },
385
+ {
386
+ "epoch": 0.5430809399477807,
387
+ "grad_norm": 7.127388000488281,
388
+ "learning_rate": 8.495128112594732e-06,
389
+ "loss": 1.4443,
390
+ "step": 520
391
+ },
392
+ {
393
+ "epoch": 0.5535248041775457,
394
+ "grad_norm": 9.201033592224121,
395
+ "learning_rate": 8.459040057740888e-06,
396
+ "loss": 1.3384,
397
+ "step": 530
398
+ },
399
+ {
400
+ "epoch": 0.5639686684073107,
401
+ "grad_norm": 9.565522193908691,
402
+ "learning_rate": 8.422952002887045e-06,
403
+ "loss": 1.3732,
404
+ "step": 540
405
+ },
406
+ {
407
+ "epoch": 0.5744125326370757,
408
+ "grad_norm": 6.29095458984375,
409
+ "learning_rate": 8.386863948033201e-06,
410
+ "loss": 1.5768,
411
+ "step": 550
412
+ },
413
+ {
414
+ "epoch": 0.5848563968668408,
415
+ "grad_norm": 7.279628753662109,
416
+ "learning_rate": 8.350775893179358e-06,
417
+ "loss": 1.5237,
418
+ "step": 560
419
+ },
420
+ {
421
+ "epoch": 0.5953002610966057,
422
+ "grad_norm": 5.866252899169922,
423
+ "learning_rate": 8.314687838325514e-06,
424
+ "loss": 1.491,
425
+ "step": 570
426
+ },
427
+ {
428
+ "epoch": 0.6057441253263708,
429
+ "grad_norm": 6.8628621101379395,
430
+ "learning_rate": 8.27859978347167e-06,
431
+ "loss": 1.3991,
432
+ "step": 580
433
+ },
434
+ {
435
+ "epoch": 0.6161879895561357,
436
+ "grad_norm": 5.9102654457092285,
437
+ "learning_rate": 8.242511728617829e-06,
438
+ "loss": 1.3842,
439
+ "step": 590
440
+ },
441
+ {
442
+ "epoch": 0.6266318537859008,
443
+ "grad_norm": 6.6509199142456055,
444
+ "learning_rate": 8.206423673763985e-06,
445
+ "loss": 1.4015,
446
+ "step": 600
447
+ },
448
+ {
449
+ "epoch": 0.6266318537859008,
450
+ "eval_loss": 1.3116086721420288,
451
+ "eval_runtime": 23.131,
452
+ "eval_samples_per_second": 36.79,
453
+ "eval_steps_per_second": 4.626,
454
+ "step": 600
455
+ },
456
+ {
457
+ "epoch": 0.6370757180156658,
458
+ "grad_norm": 10.004154205322266,
459
+ "learning_rate": 8.170335618910142e-06,
460
+ "loss": 1.4046,
461
+ "step": 610
462
+ },
463
+ {
464
+ "epoch": 0.6475195822454308,
465
+ "grad_norm": 6.851074695587158,
466
+ "learning_rate": 8.134247564056298e-06,
467
+ "loss": 1.3358,
468
+ "step": 620
469
+ },
470
+ {
471
+ "epoch": 0.6579634464751958,
472
+ "grad_norm": 6.676290512084961,
473
+ "learning_rate": 8.098159509202455e-06,
474
+ "loss": 1.2649,
475
+ "step": 630
476
+ },
477
+ {
478
+ "epoch": 0.6684073107049608,
479
+ "grad_norm": 6.854121685028076,
480
+ "learning_rate": 8.062071454348611e-06,
481
+ "loss": 1.4563,
482
+ "step": 640
483
+ },
484
+ {
485
+ "epoch": 0.6788511749347258,
486
+ "grad_norm": 7.104318141937256,
487
+ "learning_rate": 8.025983399494768e-06,
488
+ "loss": 1.2667,
489
+ "step": 650
490
+ },
491
+ {
492
+ "epoch": 0.6892950391644909,
493
+ "grad_norm": 6.3163957595825195,
494
+ "learning_rate": 7.989895344640924e-06,
495
+ "loss": 1.4532,
496
+ "step": 660
497
+ },
498
+ {
499
+ "epoch": 0.6997389033942559,
500
+ "grad_norm": 7.5210652351379395,
501
+ "learning_rate": 7.95380728978708e-06,
502
+ "loss": 1.179,
503
+ "step": 670
504
+ },
505
+ {
506
+ "epoch": 0.7101827676240209,
507
+ "grad_norm": 7.099525451660156,
508
+ "learning_rate": 7.917719234933237e-06,
509
+ "loss": 1.2406,
510
+ "step": 680
511
+ },
512
+ {
513
+ "epoch": 0.720626631853786,
514
+ "grad_norm": 6.814334392547607,
515
+ "learning_rate": 7.881631180079395e-06,
516
+ "loss": 1.1178,
517
+ "step": 690
518
+ },
519
+ {
520
+ "epoch": 0.7310704960835509,
521
+ "grad_norm": 6.895037651062012,
522
+ "learning_rate": 7.845543125225551e-06,
523
+ "loss": 1.3291,
524
+ "step": 700
525
+ },
526
+ {
527
+ "epoch": 0.741514360313316,
528
+ "grad_norm": 6.383569717407227,
529
+ "learning_rate": 7.809455070371708e-06,
530
+ "loss": 1.2887,
531
+ "step": 710
532
+ },
533
+ {
534
+ "epoch": 0.7519582245430809,
535
+ "grad_norm": 8.13758373260498,
536
+ "learning_rate": 7.773367015517864e-06,
537
+ "loss": 1.2576,
538
+ "step": 720
539
+ },
540
+ {
541
+ "epoch": 0.762402088772846,
542
+ "grad_norm": 5.2739362716674805,
543
+ "learning_rate": 7.73727896066402e-06,
544
+ "loss": 1.2835,
545
+ "step": 730
546
+ },
547
+ {
548
+ "epoch": 0.7728459530026109,
549
+ "grad_norm": 7.7005934715271,
550
+ "learning_rate": 7.701190905810177e-06,
551
+ "loss": 1.2633,
552
+ "step": 740
553
+ },
554
+ {
555
+ "epoch": 0.783289817232376,
556
+ "grad_norm": 6.738245964050293,
557
+ "learning_rate": 7.665102850956334e-06,
558
+ "loss": 1.2129,
559
+ "step": 750
560
+ },
561
+ {
562
+ "epoch": 0.793733681462141,
563
+ "grad_norm": 5.725705146789551,
564
+ "learning_rate": 7.629014796102491e-06,
565
+ "loss": 1.2501,
566
+ "step": 760
567
+ },
568
+ {
569
+ "epoch": 0.804177545691906,
570
+ "grad_norm": 6.256880760192871,
571
+ "learning_rate": 7.5929267412486475e-06,
572
+ "loss": 1.2567,
573
+ "step": 770
574
+ },
575
+ {
576
+ "epoch": 0.814621409921671,
577
+ "grad_norm": 6.835810661315918,
578
+ "learning_rate": 7.556838686394804e-06,
579
+ "loss": 1.2408,
580
+ "step": 780
581
+ },
582
+ {
583
+ "epoch": 0.825065274151436,
584
+ "grad_norm": 5.811525821685791,
585
+ "learning_rate": 7.52075063154096e-06,
586
+ "loss": 1.2409,
587
+ "step": 790
588
+ },
589
+ {
590
+ "epoch": 0.835509138381201,
591
+ "grad_norm": 8.554996490478516,
592
+ "learning_rate": 7.484662576687118e-06,
593
+ "loss": 1.2575,
594
+ "step": 800
595
+ },
596
+ {
597
+ "epoch": 0.835509138381201,
598
+ "eval_loss": 1.2006852626800537,
599
+ "eval_runtime": 23.1272,
600
+ "eval_samples_per_second": 36.796,
601
+ "eval_steps_per_second": 4.627,
602
+ "step": 800
603
+ },
604
+ {
605
+ "epoch": 0.8459530026109661,
606
+ "grad_norm": 7.091418266296387,
607
+ "learning_rate": 7.448574521833274e-06,
608
+ "loss": 1.0256,
609
+ "step": 810
610
+ },
611
+ {
612
+ "epoch": 0.856396866840731,
613
+ "grad_norm": 7.5542683601379395,
614
+ "learning_rate": 7.4124864669794306e-06,
615
+ "loss": 1.0605,
616
+ "step": 820
617
+ },
618
+ {
619
+ "epoch": 0.8668407310704961,
620
+ "grad_norm": 7.177179336547852,
621
+ "learning_rate": 7.376398412125587e-06,
622
+ "loss": 1.1747,
623
+ "step": 830
624
+ },
625
+ {
626
+ "epoch": 0.8772845953002611,
627
+ "grad_norm": 6.226783752441406,
628
+ "learning_rate": 7.3403103572717434e-06,
629
+ "loss": 1.1518,
630
+ "step": 840
631
+ },
632
+ {
633
+ "epoch": 0.8877284595300261,
634
+ "grad_norm": 7.062104225158691,
635
+ "learning_rate": 7.304222302417901e-06,
636
+ "loss": 1.2322,
637
+ "step": 850
638
+ },
639
+ {
640
+ "epoch": 0.8981723237597912,
641
+ "grad_norm": 8.756392478942871,
642
+ "learning_rate": 7.268134247564057e-06,
643
+ "loss": 1.271,
644
+ "step": 860
645
+ },
646
+ {
647
+ "epoch": 0.9086161879895561,
648
+ "grad_norm": 6.364953994750977,
649
+ "learning_rate": 7.232046192710214e-06,
650
+ "loss": 1.1853,
651
+ "step": 870
652
+ },
653
+ {
654
+ "epoch": 0.9190600522193212,
655
+ "grad_norm": 6.629589080810547,
656
+ "learning_rate": 7.19595813785637e-06,
657
+ "loss": 1.2279,
658
+ "step": 880
659
+ },
660
+ {
661
+ "epoch": 0.9295039164490861,
662
+ "grad_norm": 5.4719977378845215,
663
+ "learning_rate": 7.1598700830025265e-06,
664
+ "loss": 1.0789,
665
+ "step": 890
666
+ },
667
+ {
668
+ "epoch": 0.9399477806788512,
669
+ "grad_norm": 5.824141025543213,
670
+ "learning_rate": 7.123782028148684e-06,
671
+ "loss": 1.0435,
672
+ "step": 900
673
+ },
674
+ {
675
+ "epoch": 0.9503916449086162,
676
+ "grad_norm": 9.214339256286621,
677
+ "learning_rate": 7.08769397329484e-06,
678
+ "loss": 1.0518,
679
+ "step": 910
680
+ },
681
+ {
682
+ "epoch": 0.9608355091383812,
683
+ "grad_norm": 6.3007941246032715,
684
+ "learning_rate": 7.051605918440997e-06,
685
+ "loss": 1.0297,
686
+ "step": 920
687
+ },
688
+ {
689
+ "epoch": 0.9712793733681462,
690
+ "grad_norm": 5.431386947631836,
691
+ "learning_rate": 7.015517863587153e-06,
692
+ "loss": 1.1292,
693
+ "step": 930
694
+ },
695
+ {
696
+ "epoch": 0.9817232375979112,
697
+ "grad_norm": 8.941716194152832,
698
+ "learning_rate": 6.9794298087333105e-06,
699
+ "loss": 1.1617,
700
+ "step": 940
701
+ },
702
+ {
703
+ "epoch": 0.9921671018276762,
704
+ "grad_norm": 8.926597595214844,
705
+ "learning_rate": 6.943341753879467e-06,
706
+ "loss": 1.3497,
707
+ "step": 950
708
+ },
709
+ {
710
+ "epoch": 1.002088772845953,
711
+ "grad_norm": 6.556091785430908,
712
+ "learning_rate": 6.907253699025623e-06,
713
+ "loss": 1.176,
714
+ "step": 960
715
+ },
716
+ {
717
+ "epoch": 1.012532637075718,
718
+ "grad_norm": 12.022808074951172,
719
+ "learning_rate": 6.87116564417178e-06,
720
+ "loss": 0.9251,
721
+ "step": 970
722
+ },
723
+ {
724
+ "epoch": 1.022976501305483,
725
+ "grad_norm": 7.0122389793396,
726
+ "learning_rate": 6.835077589317936e-06,
727
+ "loss": 0.9231,
728
+ "step": 980
729
+ },
730
+ {
731
+ "epoch": 1.033420365535248,
732
+ "grad_norm": 6.355090141296387,
733
+ "learning_rate": 6.7989895344640936e-06,
734
+ "loss": 1.0371,
735
+ "step": 990
736
+ },
737
+ {
738
+ "epoch": 1.0438642297650131,
739
+ "grad_norm": 7.36827278137207,
740
+ "learning_rate": 6.76290147961025e-06,
741
+ "loss": 1.0666,
742
+ "step": 1000
743
+ },
744
+ {
745
+ "epoch": 1.0438642297650131,
746
+ "eval_loss": 1.1358416080474854,
747
+ "eval_runtime": 23.1354,
748
+ "eval_samples_per_second": 36.783,
749
+ "eval_steps_per_second": 4.625,
750
+ "step": 1000
751
+ },
752
+ {
753
+ "epoch": 1.054308093994778,
754
+ "grad_norm": 5.1998772621154785,
755
+ "learning_rate": 6.7268134247564065e-06,
756
+ "loss": 0.847,
757
+ "step": 1010
758
+ },
759
+ {
760
+ "epoch": 1.064751958224543,
761
+ "grad_norm": 6.828693866729736,
762
+ "learning_rate": 6.690725369902563e-06,
763
+ "loss": 1.0755,
764
+ "step": 1020
765
+ },
766
+ {
767
+ "epoch": 1.0751958224543081,
768
+ "grad_norm": 6.408750534057617,
769
+ "learning_rate": 6.65463731504872e-06,
770
+ "loss": 0.9039,
771
+ "step": 1030
772
+ },
773
+ {
774
+ "epoch": 1.0856396866840732,
775
+ "grad_norm": 7.197463035583496,
776
+ "learning_rate": 6.618549260194877e-06,
777
+ "loss": 0.959,
778
+ "step": 1040
779
+ },
780
+ {
781
+ "epoch": 1.096083550913838,
782
+ "grad_norm": 5.504906177520752,
783
+ "learning_rate": 6.582461205341033e-06,
784
+ "loss": 0.9467,
785
+ "step": 1050
786
+ },
787
+ {
788
+ "epoch": 1.106527415143603,
789
+ "grad_norm": 7.88711404800415,
790
+ "learning_rate": 6.5463731504871896e-06,
791
+ "loss": 1.0635,
792
+ "step": 1060
793
+ },
794
+ {
795
+ "epoch": 1.1169712793733682,
796
+ "grad_norm": 5.1561055183410645,
797
+ "learning_rate": 6.510285095633346e-06,
798
+ "loss": 0.9635,
799
+ "step": 1070
800
+ },
801
+ {
802
+ "epoch": 1.1274151436031332,
803
+ "grad_norm": 5.088303565979004,
804
+ "learning_rate": 6.474197040779503e-06,
805
+ "loss": 0.9605,
806
+ "step": 1080
807
+ },
808
+ {
809
+ "epoch": 1.137859007832898,
810
+ "grad_norm": 4.595157146453857,
811
+ "learning_rate": 6.43810898592566e-06,
812
+ "loss": 1.0494,
813
+ "step": 1090
814
+ },
815
+ {
816
+ "epoch": 1.1483028720626631,
817
+ "grad_norm": 6.682077884674072,
818
+ "learning_rate": 6.402020931071816e-06,
819
+ "loss": 0.888,
820
+ "step": 1100
821
+ },
822
+ {
823
+ "epoch": 1.1587467362924282,
824
+ "grad_norm": 9.08073902130127,
825
+ "learning_rate": 6.365932876217973e-06,
826
+ "loss": 1.1239,
827
+ "step": 1110
828
+ },
829
+ {
830
+ "epoch": 1.1691906005221933,
831
+ "grad_norm": 7.480021953582764,
832
+ "learning_rate": 6.329844821364129e-06,
833
+ "loss": 1.0337,
834
+ "step": 1120
835
+ },
836
+ {
837
+ "epoch": 1.1796344647519583,
838
+ "grad_norm": 5.750502109527588,
839
+ "learning_rate": 6.293756766510286e-06,
840
+ "loss": 1.132,
841
+ "step": 1130
842
+ },
843
+ {
844
+ "epoch": 1.1900783289817232,
845
+ "grad_norm": 6.151831150054932,
846
+ "learning_rate": 6.257668711656443e-06,
847
+ "loss": 0.9985,
848
+ "step": 1140
849
+ },
850
+ {
851
+ "epoch": 1.2005221932114882,
852
+ "grad_norm": 6.567698001861572,
853
+ "learning_rate": 6.221580656802599e-06,
854
+ "loss": 1.1177,
855
+ "step": 1150
856
+ },
857
+ {
858
+ "epoch": 1.2109660574412533,
859
+ "grad_norm": 6.1963887214660645,
860
+ "learning_rate": 6.185492601948756e-06,
861
+ "loss": 0.9328,
862
+ "step": 1160
863
+ },
864
+ {
865
+ "epoch": 1.2214099216710184,
866
+ "grad_norm": 5.044003009796143,
867
+ "learning_rate": 6.149404547094913e-06,
868
+ "loss": 0.8986,
869
+ "step": 1170
870
+ },
871
+ {
872
+ "epoch": 1.2318537859007832,
873
+ "grad_norm": 6.648619651794434,
874
+ "learning_rate": 6.113316492241068e-06,
875
+ "loss": 0.9905,
876
+ "step": 1180
877
+ },
878
+ {
879
+ "epoch": 1.2422976501305483,
880
+ "grad_norm": 9.110396385192871,
881
+ "learning_rate": 6.077228437387225e-06,
882
+ "loss": 1.0126,
883
+ "step": 1190
884
+ },
885
+ {
886
+ "epoch": 1.2527415143603133,
887
+ "grad_norm": 8.707374572753906,
888
+ "learning_rate": 6.0411403825333815e-06,
889
+ "loss": 0.8611,
890
+ "step": 1200
891
+ },
892
+ {
893
+ "epoch": 1.2527415143603133,
894
+ "eval_loss": 1.0788438320159912,
895
+ "eval_runtime": 23.1258,
896
+ "eval_samples_per_second": 36.799,
897
+ "eval_steps_per_second": 4.627,
898
+ "step": 1200
899
+ },
900
+ {
901
+ "epoch": 1.2631853785900784,
902
+ "grad_norm": 5.457468509674072,
903
+ "learning_rate": 6.005052327679538e-06,
904
+ "loss": 0.8327,
905
+ "step": 1210
906
+ },
907
+ {
908
+ "epoch": 1.2736292428198435,
909
+ "grad_norm": 6.050765037536621,
910
+ "learning_rate": 5.9689642728256944e-06,
911
+ "loss": 1.0782,
912
+ "step": 1220
913
+ },
914
+ {
915
+ "epoch": 1.2840731070496083,
916
+ "grad_norm": 6.254447937011719,
917
+ "learning_rate": 5.932876217971852e-06,
918
+ "loss": 0.9388,
919
+ "step": 1230
920
+ },
921
+ {
922
+ "epoch": 1.2945169712793734,
923
+ "grad_norm": 5.181304931640625,
924
+ "learning_rate": 5.896788163118008e-06,
925
+ "loss": 0.9711,
926
+ "step": 1240
927
+ },
928
+ {
929
+ "epoch": 1.3049608355091384,
930
+ "grad_norm": 6.832638263702393,
931
+ "learning_rate": 5.860700108264165e-06,
932
+ "loss": 1.0333,
933
+ "step": 1250
934
+ },
935
+ {
936
+ "epoch": 1.3154046997389033,
937
+ "grad_norm": 8.406023025512695,
938
+ "learning_rate": 5.824612053410321e-06,
939
+ "loss": 1.0902,
940
+ "step": 1260
941
+ },
942
+ {
943
+ "epoch": 1.3258485639686683,
944
+ "grad_norm": 6.346268653869629,
945
+ "learning_rate": 5.7885239985564775e-06,
946
+ "loss": 0.8651,
947
+ "step": 1270
948
+ },
949
+ {
950
+ "epoch": 1.3362924281984334,
951
+ "grad_norm": 8.447615623474121,
952
+ "learning_rate": 5.752435943702635e-06,
953
+ "loss": 1.046,
954
+ "step": 1280
955
+ },
956
+ {
957
+ "epoch": 1.3467362924281985,
958
+ "grad_norm": 8.351264953613281,
959
+ "learning_rate": 5.716347888848791e-06,
960
+ "loss": 1.029,
961
+ "step": 1290
962
+ },
963
+ {
964
+ "epoch": 1.3571801566579635,
965
+ "grad_norm": 6.036417007446289,
966
+ "learning_rate": 5.680259833994948e-06,
967
+ "loss": 1.0089,
968
+ "step": 1300
969
+ },
970
+ {
971
+ "epoch": 1.3676240208877284,
972
+ "grad_norm": 5.646811485290527,
973
+ "learning_rate": 5.644171779141104e-06,
974
+ "loss": 0.9346,
975
+ "step": 1310
976
+ },
977
+ {
978
+ "epoch": 1.3780678851174935,
979
+ "grad_norm": 6.4004950523376465,
980
+ "learning_rate": 5.608083724287261e-06,
981
+ "loss": 0.9489,
982
+ "step": 1320
983
+ },
984
+ {
985
+ "epoch": 1.3885117493472585,
986
+ "grad_norm": 5.746732234954834,
987
+ "learning_rate": 5.571995669433418e-06,
988
+ "loss": 0.9356,
989
+ "step": 1330
990
+ },
991
+ {
992
+ "epoch": 1.3989556135770234,
993
+ "grad_norm": 5.3405632972717285,
994
+ "learning_rate": 5.535907614579574e-06,
995
+ "loss": 0.7099,
996
+ "step": 1340
997
+ },
998
+ {
999
+ "epoch": 1.4093994778067884,
1000
+ "grad_norm": 6.424510955810547,
1001
+ "learning_rate": 5.499819559725731e-06,
1002
+ "loss": 0.6955,
1003
+ "step": 1350
1004
+ },
1005
+ {
1006
+ "epoch": 1.4198433420365535,
1007
+ "grad_norm": 5.751818656921387,
1008
+ "learning_rate": 5.463731504871887e-06,
1009
+ "loss": 0.9094,
1010
+ "step": 1360
1011
+ },
1012
+ {
1013
+ "epoch": 1.4302872062663186,
1014
+ "grad_norm": 7.787084579467773,
1015
+ "learning_rate": 5.4276434500180445e-06,
1016
+ "loss": 0.9618,
1017
+ "step": 1370
1018
+ },
1019
+ {
1020
+ "epoch": 1.4407310704960836,
1021
+ "grad_norm": 6.467785835266113,
1022
+ "learning_rate": 5.391555395164201e-06,
1023
+ "loss": 0.8549,
1024
+ "step": 1380
1025
+ },
1026
+ {
1027
+ "epoch": 1.4511749347258487,
1028
+ "grad_norm": 6.860886573791504,
1029
+ "learning_rate": 5.3554673403103574e-06,
1030
+ "loss": 0.8342,
1031
+ "step": 1390
1032
+ },
1033
+ {
1034
+ "epoch": 1.4616187989556135,
1035
+ "grad_norm": 5.627669811248779,
1036
+ "learning_rate": 5.319379285456514e-06,
1037
+ "loss": 0.9012,
1038
+ "step": 1400
1039
+ },
1040
+ {
1041
+ "epoch": 1.4616187989556135,
1042
+ "eval_loss": 1.0375572443008423,
1043
+ "eval_runtime": 23.1318,
1044
+ "eval_samples_per_second": 36.789,
1045
+ "eval_steps_per_second": 4.626,
1046
+ "step": 1400
1047
+ },
1048
+ {
1049
+ "epoch": 1.4720626631853786,
1050
+ "grad_norm": 7.144196033477783,
1051
+ "learning_rate": 5.28329123060267e-06,
1052
+ "loss": 0.9949,
1053
+ "step": 1410
1054
+ },
1055
+ {
1056
+ "epoch": 1.4825065274151437,
1057
+ "grad_norm": 6.208961486816406,
1058
+ "learning_rate": 5.247203175748828e-06,
1059
+ "loss": 1.0719,
1060
+ "step": 1420
1061
+ },
1062
+ {
1063
+ "epoch": 1.4929503916449085,
1064
+ "grad_norm": 7.110988140106201,
1065
+ "learning_rate": 5.211115120894984e-06,
1066
+ "loss": 0.9162,
1067
+ "step": 1430
1068
+ },
1069
+ {
1070
+ "epoch": 1.5033942558746736,
1071
+ "grad_norm": 6.903599739074707,
1072
+ "learning_rate": 5.1750270660411405e-06,
1073
+ "loss": 0.9974,
1074
+ "step": 1440
1075
+ },
1076
+ {
1077
+ "epoch": 1.5138381201044386,
1078
+ "grad_norm": 5.059232234954834,
1079
+ "learning_rate": 5.138939011187297e-06,
1080
+ "loss": 0.9602,
1081
+ "step": 1450
1082
+ },
1083
+ {
1084
+ "epoch": 1.5242819843342037,
1085
+ "grad_norm": 3.6045143604278564,
1086
+ "learning_rate": 5.1028509563334534e-06,
1087
+ "loss": 0.8448,
1088
+ "step": 1460
1089
+ },
1090
+ {
1091
+ "epoch": 1.5347258485639688,
1092
+ "grad_norm": 8.628995895385742,
1093
+ "learning_rate": 5.066762901479611e-06,
1094
+ "loss": 0.914,
1095
+ "step": 1470
1096
+ },
1097
+ {
1098
+ "epoch": 1.5451697127937338,
1099
+ "grad_norm": 7.227733612060547,
1100
+ "learning_rate": 5.030674846625767e-06,
1101
+ "loss": 0.9206,
1102
+ "step": 1480
1103
+ },
1104
+ {
1105
+ "epoch": 1.5556135770234987,
1106
+ "grad_norm": 7.930648326873779,
1107
+ "learning_rate": 4.994586791771924e-06,
1108
+ "loss": 1.1221,
1109
+ "step": 1490
1110
+ },
1111
+ {
1112
+ "epoch": 1.5660574412532637,
1113
+ "grad_norm": 6.340338706970215,
1114
+ "learning_rate": 4.95849873691808e-06,
1115
+ "loss": 0.8298,
1116
+ "step": 1500
1117
+ },
1118
+ {
1119
+ "epoch": 1.5765013054830286,
1120
+ "grad_norm": 5.558096408843994,
1121
+ "learning_rate": 4.922410682064237e-06,
1122
+ "loss": 0.9457,
1123
+ "step": 1510
1124
+ },
1125
+ {
1126
+ "epoch": 1.5869451697127936,
1127
+ "grad_norm": 7.608903408050537,
1128
+ "learning_rate": 4.886322627210394e-06,
1129
+ "loss": 0.9164,
1130
+ "step": 1520
1131
+ },
1132
+ {
1133
+ "epoch": 1.5973890339425587,
1134
+ "grad_norm": 9.46885871887207,
1135
+ "learning_rate": 4.85023457235655e-06,
1136
+ "loss": 0.9498,
1137
+ "step": 1530
1138
+ },
1139
+ {
1140
+ "epoch": 1.6078328981723238,
1141
+ "grad_norm": 7.6691107749938965,
1142
+ "learning_rate": 4.814146517502707e-06,
1143
+ "loss": 0.9764,
1144
+ "step": 1540
1145
+ },
1146
+ {
1147
+ "epoch": 1.6182767624020888,
1148
+ "grad_norm": 8.231538772583008,
1149
+ "learning_rate": 4.778058462648863e-06,
1150
+ "loss": 0.8314,
1151
+ "step": 1550
1152
+ },
1153
+ {
1154
+ "epoch": 1.628720626631854,
1155
+ "grad_norm": 5.868668556213379,
1156
+ "learning_rate": 4.7419704077950205e-06,
1157
+ "loss": 0.9127,
1158
+ "step": 1560
1159
+ },
1160
+ {
1161
+ "epoch": 1.6391644908616188,
1162
+ "grad_norm": 7.236291885375977,
1163
+ "learning_rate": 4.705882352941177e-06,
1164
+ "loss": 0.9485,
1165
+ "step": 1570
1166
+ },
1167
+ {
1168
+ "epoch": 1.6496083550913838,
1169
+ "grad_norm": 6.871162414550781,
1170
+ "learning_rate": 4.669794298087333e-06,
1171
+ "loss": 1.0603,
1172
+ "step": 1580
1173
+ },
1174
+ {
1175
+ "epoch": 1.6600522193211487,
1176
+ "grad_norm": 6.303982734680176,
1177
+ "learning_rate": 4.63370624323349e-06,
1178
+ "loss": 0.945,
1179
+ "step": 1590
1180
+ },
1181
+ {
1182
+ "epoch": 1.6704960835509137,
1183
+ "grad_norm": 7.167915344238281,
1184
+ "learning_rate": 4.597618188379647e-06,
1185
+ "loss": 0.7351,
1186
+ "step": 1600
1187
+ },
1188
+ {
1189
+ "epoch": 1.6704960835509137,
1190
+ "eval_loss": 1.0095878839492798,
1191
+ "eval_runtime": 23.1917,
1192
+ "eval_samples_per_second": 36.694,
1193
+ "eval_steps_per_second": 4.614,
1194
+ "step": 1600
1195
+ },
1196
+ {
1197
+ "epoch": 1.6809399477806788,
1198
+ "grad_norm": 6.626043796539307,
1199
+ "learning_rate": 4.5615301335258035e-06,
1200
+ "loss": 0.9369,
1201
+ "step": 1610
1202
+ },
1203
+ {
1204
+ "epoch": 1.6913838120104439,
1205
+ "grad_norm": 6.645303726196289,
1206
+ "learning_rate": 4.52544207867196e-06,
1207
+ "loss": 0.8509,
1208
+ "step": 1620
1209
+ },
1210
+ {
1211
+ "epoch": 1.701827676240209,
1212
+ "grad_norm": 7.132906913757324,
1213
+ "learning_rate": 4.4893540238181164e-06,
1214
+ "loss": 0.9522,
1215
+ "step": 1630
1216
+ },
1217
+ {
1218
+ "epoch": 1.712271540469974,
1219
+ "grad_norm": 6.155941009521484,
1220
+ "learning_rate": 4.453265968964273e-06,
1221
+ "loss": 0.9662,
1222
+ "step": 1640
1223
+ },
1224
+ {
1225
+ "epoch": 1.722715404699739,
1226
+ "grad_norm": 5.0147705078125,
1227
+ "learning_rate": 4.41717791411043e-06,
1228
+ "loss": 0.8755,
1229
+ "step": 1650
1230
+ },
1231
+ {
1232
+ "epoch": 1.733159268929504,
1233
+ "grad_norm": 7.039682388305664,
1234
+ "learning_rate": 4.381089859256587e-06,
1235
+ "loss": 0.8761,
1236
+ "step": 1660
1237
+ },
1238
+ {
1239
+ "epoch": 1.743603133159269,
1240
+ "grad_norm": 8.536678314208984,
1241
+ "learning_rate": 4.345001804402743e-06,
1242
+ "loss": 0.8054,
1243
+ "step": 1670
1244
+ },
1245
+ {
1246
+ "epoch": 1.7540469973890338,
1247
+ "grad_norm": 6.860482215881348,
1248
+ "learning_rate": 4.3089137495488995e-06,
1249
+ "loss": 0.9095,
1250
+ "step": 1680
1251
+ },
1252
+ {
1253
+ "epoch": 1.7644908616187989,
1254
+ "grad_norm": 6.3796563148498535,
1255
+ "learning_rate": 4.272825694695056e-06,
1256
+ "loss": 1.08,
1257
+ "step": 1690
1258
+ },
1259
+ {
1260
+ "epoch": 1.774934725848564,
1261
+ "grad_norm": 11.013219833374023,
1262
+ "learning_rate": 4.236737639841213e-06,
1263
+ "loss": 0.834,
1264
+ "step": 1700
1265
+ },
1266
+ {
1267
+ "epoch": 1.785378590078329,
1268
+ "grad_norm": 6.231220722198486,
1269
+ "learning_rate": 4.20064958498737e-06,
1270
+ "loss": 0.8488,
1271
+ "step": 1710
1272
+ },
1273
+ {
1274
+ "epoch": 1.795822454308094,
1275
+ "grad_norm": 7.019144535064697,
1276
+ "learning_rate": 4.164561530133526e-06,
1277
+ "loss": 0.7922,
1278
+ "step": 1720
1279
+ },
1280
+ {
1281
+ "epoch": 1.8062663185378591,
1282
+ "grad_norm": 6.586852550506592,
1283
+ "learning_rate": 4.128473475279683e-06,
1284
+ "loss": 0.8326,
1285
+ "step": 1730
1286
+ },
1287
+ {
1288
+ "epoch": 1.816710182767624,
1289
+ "grad_norm": 6.34022855758667,
1290
+ "learning_rate": 4.09238542042584e-06,
1291
+ "loss": 0.906,
1292
+ "step": 1740
1293
+ },
1294
+ {
1295
+ "epoch": 1.827154046997389,
1296
+ "grad_norm": 7.541686058044434,
1297
+ "learning_rate": 4.056297365571996e-06,
1298
+ "loss": 0.9029,
1299
+ "step": 1750
1300
+ },
1301
+ {
1302
+ "epoch": 1.837597911227154,
1303
+ "grad_norm": 5.867885589599609,
1304
+ "learning_rate": 4.020209310718153e-06,
1305
+ "loss": 0.8072,
1306
+ "step": 1760
1307
+ },
1308
+ {
1309
+ "epoch": 1.848041775456919,
1310
+ "grad_norm": 4.805484771728516,
1311
+ "learning_rate": 3.984121255864309e-06,
1312
+ "loss": 0.9158,
1313
+ "step": 1770
1314
+ },
1315
+ {
1316
+ "epoch": 1.858485639686684,
1317
+ "grad_norm": 5.949447154998779,
1318
+ "learning_rate": 3.948033201010466e-06,
1319
+ "loss": 0.8116,
1320
+ "step": 1780
1321
+ },
1322
+ {
1323
+ "epoch": 1.868929503916449,
1324
+ "grad_norm": 7.641289234161377,
1325
+ "learning_rate": 3.911945146156623e-06,
1326
+ "loss": 0.8824,
1327
+ "step": 1790
1328
+ },
1329
+ {
1330
+ "epoch": 1.8793733681462141,
1331
+ "grad_norm": 5.530484199523926,
1332
+ "learning_rate": 3.8758570913027795e-06,
1333
+ "loss": 0.9264,
1334
+ "step": 1800
1335
+ },
1336
+ {
1337
+ "epoch": 1.8793733681462141,
1338
+ "eval_loss": 0.9786838293075562,
1339
+ "eval_runtime": 23.1205,
1340
+ "eval_samples_per_second": 36.807,
1341
+ "eval_steps_per_second": 4.628,
1342
+ "step": 1800
1343
+ },
1344
+ {
1345
+ "epoch": 1.8898172323759792,
1346
+ "grad_norm": 5.981470584869385,
1347
+ "learning_rate": 3.839769036448936e-06,
1348
+ "loss": 0.7929,
1349
+ "step": 1810
1350
+ },
1351
+ {
1352
+ "epoch": 1.9002610966057443,
1353
+ "grad_norm": 8.019466400146484,
1354
+ "learning_rate": 3.8036809815950928e-06,
1355
+ "loss": 0.7616,
1356
+ "step": 1820
1357
+ },
1358
+ {
1359
+ "epoch": 1.9107049608355091,
1360
+ "grad_norm": 7.649405479431152,
1361
+ "learning_rate": 3.7675929267412492e-06,
1362
+ "loss": 0.8661,
1363
+ "step": 1830
1364
+ },
1365
+ {
1366
+ "epoch": 1.9211488250652742,
1367
+ "grad_norm": 8.259872436523438,
1368
+ "learning_rate": 3.7315048718874057e-06,
1369
+ "loss": 1.0303,
1370
+ "step": 1840
1371
+ },
1372
+ {
1373
+ "epoch": 1.931592689295039,
1374
+ "grad_norm": 5.947850227355957,
1375
+ "learning_rate": 3.695416817033562e-06,
1376
+ "loss": 0.9326,
1377
+ "step": 1850
1378
+ },
1379
+ {
1380
+ "epoch": 1.942036553524804,
1381
+ "grad_norm": 5.193607330322266,
1382
+ "learning_rate": 3.6593287621797186e-06,
1383
+ "loss": 0.8527,
1384
+ "step": 1860
1385
+ },
1386
+ {
1387
+ "epoch": 1.9524804177545692,
1388
+ "grad_norm": 6.412321090698242,
1389
+ "learning_rate": 3.623240707325875e-06,
1390
+ "loss": 0.8768,
1391
+ "step": 1870
1392
+ },
1393
+ {
1394
+ "epoch": 1.9629242819843342,
1395
+ "grad_norm": 6.859325408935547,
1396
+ "learning_rate": 3.587152652472032e-06,
1397
+ "loss": 0.8131,
1398
+ "step": 1880
1399
+ },
1400
+ {
1401
+ "epoch": 1.9733681462140993,
1402
+ "grad_norm": 4.910820484161377,
1403
+ "learning_rate": 3.5510645976181883e-06,
1404
+ "loss": 0.7232,
1405
+ "step": 1890
1406
+ },
1407
+ {
1408
+ "epoch": 1.9838120104438643,
1409
+ "grad_norm": 6.052480220794678,
1410
+ "learning_rate": 3.5149765427643452e-06,
1411
+ "loss": 0.7567,
1412
+ "step": 1900
1413
+ },
1414
+ {
1415
+ "epoch": 1.9942558746736292,
1416
+ "grad_norm": 6.609030246734619,
1417
+ "learning_rate": 3.4788884879105017e-06,
1418
+ "loss": 0.8219,
1419
+ "step": 1910
1420
+ },
1421
+ {
1422
+ "epoch": 2.004177545691906,
1423
+ "grad_norm": 4.539740562438965,
1424
+ "learning_rate": 3.4428004330566585e-06,
1425
+ "loss": 0.8498,
1426
+ "step": 1920
1427
+ },
1428
+ {
1429
+ "epoch": 2.014621409921671,
1430
+ "grad_norm": 5.981167316436768,
1431
+ "learning_rate": 3.406712378202815e-06,
1432
+ "loss": 0.7024,
1433
+ "step": 1930
1434
+ },
1435
+ {
1436
+ "epoch": 2.025065274151436,
1437
+ "grad_norm": 6.600665092468262,
1438
+ "learning_rate": 3.3706243233489714e-06,
1439
+ "loss": 0.7467,
1440
+ "step": 1940
1441
+ },
1442
+ {
1443
+ "epoch": 2.035509138381201,
1444
+ "grad_norm": 5.560609340667725,
1445
+ "learning_rate": 3.3345362684951283e-06,
1446
+ "loss": 0.7434,
1447
+ "step": 1950
1448
+ },
1449
+ {
1450
+ "epoch": 2.045953002610966,
1451
+ "grad_norm": 4.679533004760742,
1452
+ "learning_rate": 3.2984482136412848e-06,
1453
+ "loss": 0.7125,
1454
+ "step": 1960
1455
+ },
1456
+ {
1457
+ "epoch": 2.056396866840731,
1458
+ "grad_norm": 7.177086353302002,
1459
+ "learning_rate": 3.2623601587874416e-06,
1460
+ "loss": 0.6745,
1461
+ "step": 1970
1462
+ },
1463
+ {
1464
+ "epoch": 2.066840731070496,
1465
+ "grad_norm": 6.860986709594727,
1466
+ "learning_rate": 3.226272103933598e-06,
1467
+ "loss": 0.6814,
1468
+ "step": 1980
1469
+ },
1470
+ {
1471
+ "epoch": 2.077284595300261,
1472
+ "grad_norm": 8.40719223022461,
1473
+ "learning_rate": 3.190184049079755e-06,
1474
+ "loss": 0.7077,
1475
+ "step": 1990
1476
+ },
1477
+ {
1478
+ "epoch": 2.0877284595300263,
1479
+ "grad_norm": 5.830367088317871,
1480
+ "learning_rate": 3.1540959942259114e-06,
1481
+ "loss": 0.8435,
1482
+ "step": 2000
1483
+ },
1484
+ {
1485
+ "epoch": 2.0877284595300263,
1486
+ "eval_loss": 0.9753687381744385,
1487
+ "eval_runtime": 23.1964,
1488
+ "eval_samples_per_second": 36.687,
1489
+ "eval_steps_per_second": 4.613,
1490
+ "step": 2000
1491
+ },
1492
+ {
1493
+ "epoch": 2.0981723237597913,
1494
+ "grad_norm": 6.964289665222168,
1495
+ "learning_rate": 3.118007939372068e-06,
1496
+ "loss": 0.6964,
1497
+ "step": 2010
1498
+ },
1499
+ {
1500
+ "epoch": 2.108616187989556,
1501
+ "grad_norm": 6.764989852905273,
1502
+ "learning_rate": 3.0819198845182247e-06,
1503
+ "loss": 0.7107,
1504
+ "step": 2020
1505
+ },
1506
+ {
1507
+ "epoch": 2.119060052219321,
1508
+ "grad_norm": 7.992194652557373,
1509
+ "learning_rate": 3.045831829664381e-06,
1510
+ "loss": 0.7146,
1511
+ "step": 2030
1512
+ },
1513
+ {
1514
+ "epoch": 2.129503916449086,
1515
+ "grad_norm": 7.102138996124268,
1516
+ "learning_rate": 3.009743774810538e-06,
1517
+ "loss": 0.686,
1518
+ "step": 2040
1519
+ },
1520
+ {
1521
+ "epoch": 2.139947780678851,
1522
+ "grad_norm": 7.446751117706299,
1523
+ "learning_rate": 2.9736557199566945e-06,
1524
+ "loss": 0.7681,
1525
+ "step": 2050
1526
+ },
1527
+ {
1528
+ "epoch": 2.1503916449086162,
1529
+ "grad_norm": 7.091776371002197,
1530
+ "learning_rate": 2.9375676651028514e-06,
1531
+ "loss": 0.7674,
1532
+ "step": 2060
1533
+ },
1534
+ {
1535
+ "epoch": 2.1608355091383813,
1536
+ "grad_norm": 7.994192123413086,
1537
+ "learning_rate": 2.901479610249008e-06,
1538
+ "loss": 0.7187,
1539
+ "step": 2070
1540
+ },
1541
+ {
1542
+ "epoch": 2.1712793733681464,
1543
+ "grad_norm": 4.8329386711120605,
1544
+ "learning_rate": 2.8653915553951643e-06,
1545
+ "loss": 0.7501,
1546
+ "step": 2080
1547
+ },
1548
+ {
1549
+ "epoch": 2.1817232375979114,
1550
+ "grad_norm": 6.802753925323486,
1551
+ "learning_rate": 2.829303500541321e-06,
1552
+ "loss": 0.5658,
1553
+ "step": 2090
1554
+ },
1555
+ {
1556
+ "epoch": 2.192167101827676,
1557
+ "grad_norm": 7.07351541519165,
1558
+ "learning_rate": 2.7932154456874776e-06,
1559
+ "loss": 0.8106,
1560
+ "step": 2100
1561
+ },
1562
+ {
1563
+ "epoch": 2.202610966057441,
1564
+ "grad_norm": 6.761138916015625,
1565
+ "learning_rate": 2.7571273908336344e-06,
1566
+ "loss": 0.7434,
1567
+ "step": 2110
1568
+ },
1569
+ {
1570
+ "epoch": 2.213054830287206,
1571
+ "grad_norm": 4.220724582672119,
1572
+ "learning_rate": 2.721039335979791e-06,
1573
+ "loss": 0.7526,
1574
+ "step": 2120
1575
+ },
1576
+ {
1577
+ "epoch": 2.2234986945169712,
1578
+ "grad_norm": 6.14243745803833,
1579
+ "learning_rate": 2.6849512811259478e-06,
1580
+ "loss": 0.6486,
1581
+ "step": 2130
1582
+ },
1583
+ {
1584
+ "epoch": 2.2339425587467363,
1585
+ "grad_norm": 8.640827178955078,
1586
+ "learning_rate": 2.6488632262721042e-06,
1587
+ "loss": 0.5949,
1588
+ "step": 2140
1589
+ },
1590
+ {
1591
+ "epoch": 2.2443864229765014,
1592
+ "grad_norm": 6.576625823974609,
1593
+ "learning_rate": 2.6127751714182607e-06,
1594
+ "loss": 0.6923,
1595
+ "step": 2150
1596
+ },
1597
+ {
1598
+ "epoch": 2.2548302872062664,
1599
+ "grad_norm": 6.136504173278809,
1600
+ "learning_rate": 2.5766871165644175e-06,
1601
+ "loss": 0.6255,
1602
+ "step": 2160
1603
+ },
1604
+ {
1605
+ "epoch": 2.2652741514360315,
1606
+ "grad_norm": 7.2910847663879395,
1607
+ "learning_rate": 2.540599061710574e-06,
1608
+ "loss": 0.6843,
1609
+ "step": 2170
1610
+ },
1611
+ {
1612
+ "epoch": 2.275718015665796,
1613
+ "grad_norm": 6.936916351318359,
1614
+ "learning_rate": 2.504511006856731e-06,
1615
+ "loss": 0.6751,
1616
+ "step": 2180
1617
+ },
1618
+ {
1619
+ "epoch": 2.286161879895561,
1620
+ "grad_norm": 5.899853229522705,
1621
+ "learning_rate": 2.4684229520028873e-06,
1622
+ "loss": 0.6584,
1623
+ "step": 2190
1624
+ },
1625
+ {
1626
+ "epoch": 2.2966057441253263,
1627
+ "grad_norm": 4.891731262207031,
1628
+ "learning_rate": 2.4323348971490438e-06,
1629
+ "loss": 0.6373,
1630
+ "step": 2200
1631
+ },
1632
+ {
1633
+ "epoch": 2.2966057441253263,
1634
+ "eval_loss": 0.958043098449707,
1635
+ "eval_runtime": 23.1881,
1636
+ "eval_samples_per_second": 36.7,
1637
+ "eval_steps_per_second": 4.614,
1638
+ "step": 2200
1639
+ },
1640
+ {
1641
+ "epoch": 2.3070496083550913,
1642
+ "grad_norm": 6.206886291503906,
1643
+ "learning_rate": 2.3962468422952e-06,
1644
+ "loss": 0.6725,
1645
+ "step": 2210
1646
+ },
1647
+ {
1648
+ "epoch": 2.3174934725848564,
1649
+ "grad_norm": 4.663551330566406,
1650
+ "learning_rate": 2.360158787441357e-06,
1651
+ "loss": 0.5578,
1652
+ "step": 2220
1653
+ },
1654
+ {
1655
+ "epoch": 2.3279373368146214,
1656
+ "grad_norm": 6.175649166107178,
1657
+ "learning_rate": 2.3240707325875135e-06,
1658
+ "loss": 0.6835,
1659
+ "step": 2230
1660
+ },
1661
+ {
1662
+ "epoch": 2.3383812010443865,
1663
+ "grad_norm": 6.676774501800537,
1664
+ "learning_rate": 2.2879826777336704e-06,
1665
+ "loss": 0.7966,
1666
+ "step": 2240
1667
+ },
1668
+ {
1669
+ "epoch": 2.3488250652741516,
1670
+ "grad_norm": 8.847614288330078,
1671
+ "learning_rate": 2.251894622879827e-06,
1672
+ "loss": 0.7679,
1673
+ "step": 2250
1674
+ },
1675
+ {
1676
+ "epoch": 2.3592689295039166,
1677
+ "grad_norm": 6.491757869720459,
1678
+ "learning_rate": 2.2158065680259837e-06,
1679
+ "loss": 0.6274,
1680
+ "step": 2260
1681
+ },
1682
+ {
1683
+ "epoch": 2.3697127937336813,
1684
+ "grad_norm": 6.540876388549805,
1685
+ "learning_rate": 2.17971851317214e-06,
1686
+ "loss": 0.674,
1687
+ "step": 2270
1688
+ },
1689
+ {
1690
+ "epoch": 2.3801566579634463,
1691
+ "grad_norm": 7.067712306976318,
1692
+ "learning_rate": 2.1436304583182966e-06,
1693
+ "loss": 0.6716,
1694
+ "step": 2280
1695
+ },
1696
+ {
1697
+ "epoch": 2.3906005221932114,
1698
+ "grad_norm": 4.959332466125488,
1699
+ "learning_rate": 2.1075424034644535e-06,
1700
+ "loss": 0.6729,
1701
+ "step": 2290
1702
+ },
1703
+ {
1704
+ "epoch": 2.4010443864229765,
1705
+ "grad_norm": 4.016025066375732,
1706
+ "learning_rate": 2.07145434861061e-06,
1707
+ "loss": 0.6358,
1708
+ "step": 2300
1709
+ },
1710
+ {
1711
+ "epoch": 2.4114882506527415,
1712
+ "grad_norm": 4.044537544250488,
1713
+ "learning_rate": 2.035366293756767e-06,
1714
+ "loss": 0.6867,
1715
+ "step": 2310
1716
+ },
1717
+ {
1718
+ "epoch": 2.4219321148825066,
1719
+ "grad_norm": 4.88841438293457,
1720
+ "learning_rate": 1.9992782389029233e-06,
1721
+ "loss": 0.7561,
1722
+ "step": 2320
1723
+ },
1724
+ {
1725
+ "epoch": 2.4323759791122717,
1726
+ "grad_norm": 7.33749532699585,
1727
+ "learning_rate": 1.96319018404908e-06,
1728
+ "loss": 0.6579,
1729
+ "step": 2330
1730
+ },
1731
+ {
1732
+ "epoch": 2.4428198433420367,
1733
+ "grad_norm": 6.818521976470947,
1734
+ "learning_rate": 1.9271021291952366e-06,
1735
+ "loss": 0.7322,
1736
+ "step": 2340
1737
+ },
1738
+ {
1739
+ "epoch": 2.453263707571802,
1740
+ "grad_norm": 5.549405097961426,
1741
+ "learning_rate": 1.8910140743413932e-06,
1742
+ "loss": 0.634,
1743
+ "step": 2350
1744
+ },
1745
+ {
1746
+ "epoch": 2.4637075718015664,
1747
+ "grad_norm": 6.154874801635742,
1748
+ "learning_rate": 1.85492601948755e-06,
1749
+ "loss": 0.6044,
1750
+ "step": 2360
1751
+ },
1752
+ {
1753
+ "epoch": 2.4741514360313315,
1754
+ "grad_norm": 5.5303521156311035,
1755
+ "learning_rate": 1.8188379646337066e-06,
1756
+ "loss": 0.6833,
1757
+ "step": 2370
1758
+ },
1759
+ {
1760
+ "epoch": 2.4845953002610965,
1761
+ "grad_norm": 6.135169982910156,
1762
+ "learning_rate": 1.7827499097798632e-06,
1763
+ "loss": 0.7765,
1764
+ "step": 2380
1765
+ },
1766
+ {
1767
+ "epoch": 2.4950391644908616,
1768
+ "grad_norm": 7.397289752960205,
1769
+ "learning_rate": 1.7466618549260197e-06,
1770
+ "loss": 0.6748,
1771
+ "step": 2390
1772
+ },
1773
+ {
1774
+ "epoch": 2.5054830287206267,
1775
+ "grad_norm": 5.909689426422119,
1776
+ "learning_rate": 1.7105738000721763e-06,
1777
+ "loss": 0.6791,
1778
+ "step": 2400
1779
+ },
1780
+ {
1781
+ "epoch": 2.5054830287206267,
1782
+ "eval_loss": 0.955007791519165,
1783
+ "eval_runtime": 23.2476,
1784
+ "eval_samples_per_second": 36.606,
1785
+ "eval_steps_per_second": 4.603,
1786
+ "step": 2400
1787
+ },
1788
+ {
1789
+ "epoch": 2.5159268929503917,
1790
+ "grad_norm": 6.320558547973633,
1791
+ "learning_rate": 1.6744857452183328e-06,
1792
+ "loss": 0.6219,
1793
+ "step": 2410
1794
+ },
1795
+ {
1796
+ "epoch": 2.526370757180157,
1797
+ "grad_norm": 7.978168487548828,
1798
+ "learning_rate": 1.6383976903644894e-06,
1799
+ "loss": 0.8254,
1800
+ "step": 2420
1801
+ },
1802
+ {
1803
+ "epoch": 2.5368146214099214,
1804
+ "grad_norm": 6.5808210372924805,
1805
+ "learning_rate": 1.602309635510646e-06,
1806
+ "loss": 0.7243,
1807
+ "step": 2430
1808
+ },
1809
+ {
1810
+ "epoch": 2.547258485639687,
1811
+ "grad_norm": 4.769480228424072,
1812
+ "learning_rate": 1.5662215806568025e-06,
1813
+ "loss": 0.7497,
1814
+ "step": 2440
1815
+ },
1816
+ {
1817
+ "epoch": 2.5577023498694516,
1818
+ "grad_norm": 5.738780975341797,
1819
+ "learning_rate": 1.5301335258029592e-06,
1820
+ "loss": 0.7048,
1821
+ "step": 2450
1822
+ },
1823
+ {
1824
+ "epoch": 2.5681462140992166,
1825
+ "grad_norm": 5.658013343811035,
1826
+ "learning_rate": 1.4940454709491159e-06,
1827
+ "loss": 0.739,
1828
+ "step": 2460
1829
+ },
1830
+ {
1831
+ "epoch": 2.5785900783289817,
1832
+ "grad_norm": 6.587325096130371,
1833
+ "learning_rate": 1.4579574160952725e-06,
1834
+ "loss": 0.7076,
1835
+ "step": 2470
1836
+ },
1837
+ {
1838
+ "epoch": 2.5890339425587467,
1839
+ "grad_norm": 5.956645965576172,
1840
+ "learning_rate": 1.4218693612414292e-06,
1841
+ "loss": 0.6372,
1842
+ "step": 2480
1843
+ },
1844
+ {
1845
+ "epoch": 2.599477806788512,
1846
+ "grad_norm": 5.966655731201172,
1847
+ "learning_rate": 1.3857813063875859e-06,
1848
+ "loss": 0.6934,
1849
+ "step": 2490
1850
+ },
1851
+ {
1852
+ "epoch": 2.609921671018277,
1853
+ "grad_norm": 5.313653945922852,
1854
+ "learning_rate": 1.3496932515337425e-06,
1855
+ "loss": 0.7163,
1856
+ "step": 2500
1857
+ },
1858
+ {
1859
+ "epoch": 2.620365535248042,
1860
+ "grad_norm": 6.935596466064453,
1861
+ "learning_rate": 1.313605196679899e-06,
1862
+ "loss": 0.7104,
1863
+ "step": 2510
1864
+ },
1865
+ {
1866
+ "epoch": 2.6308093994778066,
1867
+ "grad_norm": 4.822442054748535,
1868
+ "learning_rate": 1.2775171418260556e-06,
1869
+ "loss": 0.6378,
1870
+ "step": 2520
1871
+ },
1872
+ {
1873
+ "epoch": 2.641253263707572,
1874
+ "grad_norm": 5.288422107696533,
1875
+ "learning_rate": 1.2414290869722123e-06,
1876
+ "loss": 0.6463,
1877
+ "step": 2530
1878
+ },
1879
+ {
1880
+ "epoch": 2.6516971279373367,
1881
+ "grad_norm": 6.668851852416992,
1882
+ "learning_rate": 1.205341032118369e-06,
1883
+ "loss": 0.7505,
1884
+ "step": 2540
1885
+ },
1886
+ {
1887
+ "epoch": 2.6621409921671018,
1888
+ "grad_norm": 5.71054220199585,
1889
+ "learning_rate": 1.1692529772645256e-06,
1890
+ "loss": 0.5856,
1891
+ "step": 2550
1892
+ },
1893
+ {
1894
+ "epoch": 2.672584856396867,
1895
+ "grad_norm": 6.284550666809082,
1896
+ "learning_rate": 1.1331649224106823e-06,
1897
+ "loss": 0.8122,
1898
+ "step": 2560
1899
+ },
1900
+ {
1901
+ "epoch": 2.683028720626632,
1902
+ "grad_norm": 8.781463623046875,
1903
+ "learning_rate": 1.097076867556839e-06,
1904
+ "loss": 0.7063,
1905
+ "step": 2570
1906
+ },
1907
+ {
1908
+ "epoch": 2.693472584856397,
1909
+ "grad_norm": 7.29454231262207,
1910
+ "learning_rate": 1.0609888127029954e-06,
1911
+ "loss": 0.7429,
1912
+ "step": 2580
1913
+ },
1914
+ {
1915
+ "epoch": 2.703916449086162,
1916
+ "grad_norm": 5.689371109008789,
1917
+ "learning_rate": 1.024900757849152e-06,
1918
+ "loss": 0.6658,
1919
+ "step": 2590
1920
+ },
1921
+ {
1922
+ "epoch": 2.714360313315927,
1923
+ "grad_norm": 7.286506175994873,
1924
+ "learning_rate": 9.888127029953087e-07,
1925
+ "loss": 0.748,
1926
+ "step": 2600
1927
+ },
1928
+ {
1929
+ "epoch": 2.714360313315927,
1930
+ "eval_loss": 0.9431054592132568,
1931
+ "eval_runtime": 23.1747,
1932
+ "eval_samples_per_second": 36.721,
1933
+ "eval_steps_per_second": 4.617,
1934
+ "step": 2600
1935
+ },
1936
+ {
1937
+ "epoch": 2.7248041775456917,
1938
+ "grad_norm": 5.635782241821289,
1939
+ "learning_rate": 9.527246481414652e-07,
1940
+ "loss": 0.673,
1941
+ "step": 2610
1942
+ },
1943
+ {
1944
+ "epoch": 2.7352480417754568,
1945
+ "grad_norm": 5.282413959503174,
1946
+ "learning_rate": 9.166365932876219e-07,
1947
+ "loss": 0.9037,
1948
+ "step": 2620
1949
+ },
1950
+ {
1951
+ "epoch": 2.745691906005222,
1952
+ "grad_norm": 7.922749042510986,
1953
+ "learning_rate": 8.805485384337785e-07,
1954
+ "loss": 0.6862,
1955
+ "step": 2630
1956
+ },
1957
+ {
1958
+ "epoch": 2.756135770234987,
1959
+ "grad_norm": 5.463962078094482,
1960
+ "learning_rate": 8.444604835799351e-07,
1961
+ "loss": 0.6,
1962
+ "step": 2640
1963
+ },
1964
+ {
1965
+ "epoch": 2.766579634464752,
1966
+ "grad_norm": 8.0007963180542,
1967
+ "learning_rate": 8.083724287260918e-07,
1968
+ "loss": 0.6688,
1969
+ "step": 2650
1970
+ },
1971
+ {
1972
+ "epoch": 2.777023498694517,
1973
+ "grad_norm": 7.617900371551514,
1974
+ "learning_rate": 7.722843738722483e-07,
1975
+ "loss": 0.7277,
1976
+ "step": 2660
1977
+ },
1978
+ {
1979
+ "epoch": 2.787467362924282,
1980
+ "grad_norm": 5.969784259796143,
1981
+ "learning_rate": 7.36196319018405e-07,
1982
+ "loss": 0.7085,
1983
+ "step": 2670
1984
+ },
1985
+ {
1986
+ "epoch": 2.7979112271540467,
1987
+ "grad_norm": 5.169407367706299,
1988
+ "learning_rate": 7.001082641645617e-07,
1989
+ "loss": 0.7344,
1990
+ "step": 2680
1991
+ },
1992
+ {
1993
+ "epoch": 2.8083550913838122,
1994
+ "grad_norm": 8.009687423706055,
1995
+ "learning_rate": 6.640202093107181e-07,
1996
+ "loss": 0.6457,
1997
+ "step": 2690
1998
+ },
1999
+ {
2000
+ "epoch": 2.818798955613577,
2001
+ "grad_norm": 7.4137187004089355,
2002
+ "learning_rate": 6.279321544568748e-07,
2003
+ "loss": 0.6416,
2004
+ "step": 2700
2005
+ },
2006
+ {
2007
+ "epoch": 2.829242819843342,
2008
+ "grad_norm": 5.35453462600708,
2009
+ "learning_rate": 5.918440996030314e-07,
2010
+ "loss": 0.7867,
2011
+ "step": 2710
2012
+ },
2013
+ {
2014
+ "epoch": 2.839686684073107,
2015
+ "grad_norm": 7.469908237457275,
2016
+ "learning_rate": 5.557560447491881e-07,
2017
+ "loss": 0.7955,
2018
+ "step": 2720
2019
+ },
2020
+ {
2021
+ "epoch": 2.850130548302872,
2022
+ "grad_norm": 6.326605319976807,
2023
+ "learning_rate": 5.196679898953446e-07,
2024
+ "loss": 0.6995,
2025
+ "step": 2730
2026
+ },
2027
+ {
2028
+ "epoch": 2.860574412532637,
2029
+ "grad_norm": 7.096553802490234,
2030
+ "learning_rate": 4.835799350415013e-07,
2031
+ "loss": 0.6625,
2032
+ "step": 2740
2033
+ },
2034
+ {
2035
+ "epoch": 2.871018276762402,
2036
+ "grad_norm": 5.128674507141113,
2037
+ "learning_rate": 4.474918801876579e-07,
2038
+ "loss": 0.6791,
2039
+ "step": 2750
2040
+ },
2041
+ {
2042
+ "epoch": 2.8814621409921672,
2043
+ "grad_norm": 6.457350254058838,
2044
+ "learning_rate": 4.1140382533381457e-07,
2045
+ "loss": 0.7586,
2046
+ "step": 2760
2047
+ },
2048
+ {
2049
+ "epoch": 2.891906005221932,
2050
+ "grad_norm": 6.231655597686768,
2051
+ "learning_rate": 3.753157704799711e-07,
2052
+ "loss": 0.677,
2053
+ "step": 2770
2054
+ },
2055
+ {
2056
+ "epoch": 2.9023498694516974,
2057
+ "grad_norm": 6.412544250488281,
2058
+ "learning_rate": 3.392277156261278e-07,
2059
+ "loss": 0.7449,
2060
+ "step": 2780
2061
+ },
2062
+ {
2063
+ "epoch": 2.912793733681462,
2064
+ "grad_norm": 3.725374698638916,
2065
+ "learning_rate": 3.031396607722844e-07,
2066
+ "loss": 0.5804,
2067
+ "step": 2790
2068
+ },
2069
+ {
2070
+ "epoch": 2.923237597911227,
2071
+ "grad_norm": 5.284286975860596,
2072
+ "learning_rate": 2.67051605918441e-07,
2073
+ "loss": 0.571,
2074
+ "step": 2800
2075
+ },
2076
+ {
2077
+ "epoch": 2.923237597911227,
2078
+ "eval_loss": 0.94057297706604,
2079
+ "eval_runtime": 23.1841,
2080
+ "eval_samples_per_second": 36.706,
2081
+ "eval_steps_per_second": 4.615,
2082
+ "step": 2800
2083
+ }
2084
+ ],
2085
+ "logging_steps": 10,
2086
+ "max_steps": 2871,
2087
+ "num_input_tokens_seen": 0,
2088
+ "num_train_epochs": 3,
2089
+ "save_steps": 200,
2090
+ "stateful_callbacks": {
2091
+ "TrainerControl": {
2092
+ "args": {
2093
+ "should_epoch_stop": false,
2094
+ "should_evaluate": false,
2095
+ "should_log": false,
2096
+ "should_save": true,
2097
+ "should_training_stop": false
2098
+ },
2099
+ "attributes": {}
2100
+ }
2101
+ },
2102
+ "total_flos": 3.029626779598848e+16,
2103
+ "train_batch_size": 2,
2104
+ "trial_name": null,
2105
+ "trial_params": null
2106
+ }
checkpoint-2800/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61994a9000437d03570fb294e230b58ca459988c8e493e7a09ed1c5e37d56ce8
3
+ size 5240
checkpoint-2800/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2871/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-2871/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151643,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "max_position_embeddings": 32768,
15
+ "max_window_layers": 28,
16
+ "model_type": "qwen3",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 28,
19
+ "num_key_value_heads": 8,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": null,
22
+ "rope_theta": 1000000,
23
+ "sliding_window": null,
24
+ "tie_word_embeddings": true,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.51.3",
27
+ "use_cache": true,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151936
30
+ }
checkpoint-2871/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.51.3"
6
+ }
checkpoint-2871/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2871/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb7435fb35b6d2722726ec1f9480c63cfb5795393e9f4203680a827442c63e22
3
+ size 2384234968
checkpoint-2871/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fb73049f6ba02660ed1e3143e07acde06d3292d15cd5ba77c0714108e6829d7
3
+ size 4768662910
checkpoint-2871/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3fcb8b7132fdda989f7bbb14a5bf464435849629fe731ccbc64c4724068a57e
3
+ size 14244
checkpoint-2871/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72aa8ca04f96d23089218f4003e772032c35a4be5a38e5d3f958347fba451169
3
+ size 988
checkpoint-2871/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32b0f5c56c482757ba8adbd0756653706b07893365c50b173034f6c52abe377a
3
+ size 1064
checkpoint-2871/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-2871/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:352a863cd2761388ccc58f1432467ba6a1037bf12df9069889b142fa246471f6
3
+ size 11422752
checkpoint-2871/tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|endoftext|>",
233
+ "errors": "replace",
234
+ "extra_special_tokens": {},
235
+ "model_max_length": 131072,
236
+ "pad_token": "<|endoftext|>",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
checkpoint-2871/trainer_state.json ADDED
@@ -0,0 +1,2155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2800,
3
+ "best_metric": 0.94057297706604,
4
+ "best_model_checkpoint": "models/MNLP_M3_rag_model_test/checkpoint-2800",
5
+ "epoch": 2.9973890339425586,
6
+ "eval_steps": 200,
7
+ "global_step": 2871,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.010443864229765013,
14
+ "grad_norm": 10.195783615112305,
15
+ "learning_rate": 9.000000000000001e-07,
16
+ "loss": 3.4273,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.020887728459530026,
21
+ "grad_norm": 7.810600280761719,
22
+ "learning_rate": 1.9000000000000002e-06,
23
+ "loss": 3.2543,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.031331592689295036,
28
+ "grad_norm": 5.423489570617676,
29
+ "learning_rate": 2.9e-06,
30
+ "loss": 3.0848,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.04177545691906005,
35
+ "grad_norm": 6.003882884979248,
36
+ "learning_rate": 3.900000000000001e-06,
37
+ "loss": 3.0378,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.05221932114882506,
42
+ "grad_norm": 5.395635604858398,
43
+ "learning_rate": 4.9000000000000005e-06,
44
+ "loss": 2.935,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.06266318537859007,
49
+ "grad_norm": 5.4613823890686035,
50
+ "learning_rate": 5.9e-06,
51
+ "loss": 2.8095,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.0731070496083551,
56
+ "grad_norm": 5.638515472412109,
57
+ "learning_rate": 6.9e-06,
58
+ "loss": 2.8053,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.0835509138381201,
63
+ "grad_norm": 5.723353385925293,
64
+ "learning_rate": 7.9e-06,
65
+ "loss": 2.7206,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.09399477806788512,
70
+ "grad_norm": 6.676548480987549,
71
+ "learning_rate": 8.900000000000001e-06,
72
+ "loss": 2.6614,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.10443864229765012,
77
+ "grad_norm": 6.09738302230835,
78
+ "learning_rate": 9.9e-06,
79
+ "loss": 2.4909,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.11488250652741515,
84
+ "grad_norm": 6.762812614440918,
85
+ "learning_rate": 9.967520750631542e-06,
86
+ "loss": 2.4897,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.12532637075718014,
91
+ "grad_norm": 6.7795796394348145,
92
+ "learning_rate": 9.931432695777698e-06,
93
+ "loss": 2.3994,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.13577023498694518,
98
+ "grad_norm": 6.7266669273376465,
99
+ "learning_rate": 9.895344640923855e-06,
100
+ "loss": 2.395,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.1462140992167102,
105
+ "grad_norm": 6.685121536254883,
106
+ "learning_rate": 9.859256586070011e-06,
107
+ "loss": 2.3435,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.1566579634464752,
112
+ "grad_norm": 7.3826985359191895,
113
+ "learning_rate": 9.823168531216168e-06,
114
+ "loss": 2.1847,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.1671018276762402,
119
+ "grad_norm": 6.167830467224121,
120
+ "learning_rate": 9.787080476362326e-06,
121
+ "loss": 2.2339,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.17754569190600522,
126
+ "grad_norm": 9.586319923400879,
127
+ "learning_rate": 9.750992421508482e-06,
128
+ "loss": 2.1565,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.18798955613577023,
133
+ "grad_norm": 9.096244812011719,
134
+ "learning_rate": 9.714904366654639e-06,
135
+ "loss": 2.2398,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.19843342036553524,
140
+ "grad_norm": 9.149866104125977,
141
+ "learning_rate": 9.68242511728618e-06,
142
+ "loss": 2.0402,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.20887728459530025,
147
+ "grad_norm": 7.662464141845703,
148
+ "learning_rate": 9.646337062432336e-06,
149
+ "loss": 1.853,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.20887728459530025,
154
+ "eval_loss": 1.9208216667175293,
155
+ "eval_runtime": 23.2773,
156
+ "eval_samples_per_second": 36.559,
157
+ "eval_steps_per_second": 4.597,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 0.2193211488250653,
162
+ "grad_norm": 7.436556339263916,
163
+ "learning_rate": 9.610249007578492e-06,
164
+ "loss": 1.9207,
165
+ "step": 210
166
+ },
167
+ {
168
+ "epoch": 0.2297650130548303,
169
+ "grad_norm": 7.291353225708008,
170
+ "learning_rate": 9.574160952724649e-06,
171
+ "loss": 1.9513,
172
+ "step": 220
173
+ },
174
+ {
175
+ "epoch": 0.2402088772845953,
176
+ "grad_norm": 7.357730865478516,
177
+ "learning_rate": 9.538072897870805e-06,
178
+ "loss": 1.8785,
179
+ "step": 230
180
+ },
181
+ {
182
+ "epoch": 0.2506527415143603,
183
+ "grad_norm": 7.417892932891846,
184
+ "learning_rate": 9.501984843016962e-06,
185
+ "loss": 1.8304,
186
+ "step": 240
187
+ },
188
+ {
189
+ "epoch": 0.26109660574412535,
190
+ "grad_norm": 7.092027187347412,
191
+ "learning_rate": 9.46589678816312e-06,
192
+ "loss": 1.7244,
193
+ "step": 250
194
+ },
195
+ {
196
+ "epoch": 0.27154046997389036,
197
+ "grad_norm": 8.140602111816406,
198
+ "learning_rate": 9.429808733309276e-06,
199
+ "loss": 1.9412,
200
+ "step": 260
201
+ },
202
+ {
203
+ "epoch": 0.2819843342036554,
204
+ "grad_norm": 6.528462886810303,
205
+ "learning_rate": 9.393720678455433e-06,
206
+ "loss": 1.7654,
207
+ "step": 270
208
+ },
209
+ {
210
+ "epoch": 0.2924281984334204,
211
+ "grad_norm": 6.523935794830322,
212
+ "learning_rate": 9.35763262360159e-06,
213
+ "loss": 1.8401,
214
+ "step": 280
215
+ },
216
+ {
217
+ "epoch": 0.3028720626631854,
218
+ "grad_norm": 6.85488224029541,
219
+ "learning_rate": 9.321544568747746e-06,
220
+ "loss": 1.8402,
221
+ "step": 290
222
+ },
223
+ {
224
+ "epoch": 0.3133159268929504,
225
+ "grad_norm": 6.1414875984191895,
226
+ "learning_rate": 9.285456513893902e-06,
227
+ "loss": 1.682,
228
+ "step": 300
229
+ },
230
+ {
231
+ "epoch": 0.3237597911227154,
232
+ "grad_norm": 7.549343109130859,
233
+ "learning_rate": 9.249368459040059e-06,
234
+ "loss": 1.5124,
235
+ "step": 310
236
+ },
237
+ {
238
+ "epoch": 0.3342036553524804,
239
+ "grad_norm": 7.781056880950928,
240
+ "learning_rate": 9.213280404186215e-06,
241
+ "loss": 1.6474,
242
+ "step": 320
243
+ },
244
+ {
245
+ "epoch": 0.34464751958224543,
246
+ "grad_norm": 6.674058437347412,
247
+ "learning_rate": 9.177192349332372e-06,
248
+ "loss": 1.6971,
249
+ "step": 330
250
+ },
251
+ {
252
+ "epoch": 0.35509138381201044,
253
+ "grad_norm": 8.93964958190918,
254
+ "learning_rate": 9.14110429447853e-06,
255
+ "loss": 1.5545,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 0.36553524804177545,
260
+ "grad_norm": 7.935058116912842,
261
+ "learning_rate": 9.105016239624686e-06,
262
+ "loss": 1.6633,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 0.37597911227154046,
267
+ "grad_norm": 6.446653842926025,
268
+ "learning_rate": 9.068928184770843e-06,
269
+ "loss": 1.6209,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 0.38642297650130547,
274
+ "grad_norm": 9.611429214477539,
275
+ "learning_rate": 9.036448935402382e-06,
276
+ "loss": 1.4615,
277
+ "step": 370
278
+ },
279
+ {
280
+ "epoch": 0.3968668407310705,
281
+ "grad_norm": 8.988059043884277,
282
+ "learning_rate": 9.000360880548538e-06,
283
+ "loss": 1.5815,
284
+ "step": 380
285
+ },
286
+ {
287
+ "epoch": 0.4073107049608355,
288
+ "grad_norm": 7.07716178894043,
289
+ "learning_rate": 8.964272825694695e-06,
290
+ "loss": 1.6719,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 0.4177545691906005,
295
+ "grad_norm": 6.789717674255371,
296
+ "learning_rate": 8.928184770840851e-06,
297
+ "loss": 1.4354,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 0.4177545691906005,
302
+ "eval_loss": 1.4963077306747437,
303
+ "eval_runtime": 23.1593,
304
+ "eval_samples_per_second": 36.746,
305
+ "eval_steps_per_second": 4.62,
306
+ "step": 400
307
+ },
308
+ {
309
+ "epoch": 0.4281984334203655,
310
+ "grad_norm": 6.676051139831543,
311
+ "learning_rate": 8.89209671598701e-06,
312
+ "loss": 1.695,
313
+ "step": 410
314
+ },
315
+ {
316
+ "epoch": 0.4386422976501306,
317
+ "grad_norm": 6.711126327514648,
318
+ "learning_rate": 8.856008661133166e-06,
319
+ "loss": 1.6562,
320
+ "step": 420
321
+ },
322
+ {
323
+ "epoch": 0.4490861618798956,
324
+ "grad_norm": 7.0567731857299805,
325
+ "learning_rate": 8.819920606279322e-06,
326
+ "loss": 1.6646,
327
+ "step": 430
328
+ },
329
+ {
330
+ "epoch": 0.4595300261096606,
331
+ "grad_norm": 6.944768905639648,
332
+ "learning_rate": 8.783832551425479e-06,
333
+ "loss": 1.4864,
334
+ "step": 440
335
+ },
336
+ {
337
+ "epoch": 0.4699738903394256,
338
+ "grad_norm": 6.025828838348389,
339
+ "learning_rate": 8.747744496571635e-06,
340
+ "loss": 1.4261,
341
+ "step": 450
342
+ },
343
+ {
344
+ "epoch": 0.4804177545691906,
345
+ "grad_norm": 7.318626880645752,
346
+ "learning_rate": 8.711656441717792e-06,
347
+ "loss": 1.4183,
348
+ "step": 460
349
+ },
350
+ {
351
+ "epoch": 0.4908616187989556,
352
+ "grad_norm": 9.207475662231445,
353
+ "learning_rate": 8.675568386863948e-06,
354
+ "loss": 1.6308,
355
+ "step": 470
356
+ },
357
+ {
358
+ "epoch": 0.5013054830287206,
359
+ "grad_norm": 7.152507781982422,
360
+ "learning_rate": 8.639480332010105e-06,
361
+ "loss": 1.5118,
362
+ "step": 480
363
+ },
364
+ {
365
+ "epoch": 0.5117493472584856,
366
+ "grad_norm": 6.89570426940918,
367
+ "learning_rate": 8.603392277156261e-06,
368
+ "loss": 1.6088,
369
+ "step": 490
370
+ },
371
+ {
372
+ "epoch": 0.5221932114882507,
373
+ "grad_norm": 8.016292572021484,
374
+ "learning_rate": 8.567304222302419e-06,
375
+ "loss": 1.6461,
376
+ "step": 500
377
+ },
378
+ {
379
+ "epoch": 0.5326370757180157,
380
+ "grad_norm": 5.700518608093262,
381
+ "learning_rate": 8.531216167448576e-06,
382
+ "loss": 1.5497,
383
+ "step": 510
384
+ },
385
+ {
386
+ "epoch": 0.5430809399477807,
387
+ "grad_norm": 7.127388000488281,
388
+ "learning_rate": 8.495128112594732e-06,
389
+ "loss": 1.4443,
390
+ "step": 520
391
+ },
392
+ {
393
+ "epoch": 0.5535248041775457,
394
+ "grad_norm": 9.201033592224121,
395
+ "learning_rate": 8.459040057740888e-06,
396
+ "loss": 1.3384,
397
+ "step": 530
398
+ },
399
+ {
400
+ "epoch": 0.5639686684073107,
401
+ "grad_norm": 9.565522193908691,
402
+ "learning_rate": 8.422952002887045e-06,
403
+ "loss": 1.3732,
404
+ "step": 540
405
+ },
406
+ {
407
+ "epoch": 0.5744125326370757,
408
+ "grad_norm": 6.29095458984375,
409
+ "learning_rate": 8.386863948033201e-06,
410
+ "loss": 1.5768,
411
+ "step": 550
412
+ },
413
+ {
414
+ "epoch": 0.5848563968668408,
415
+ "grad_norm": 7.279628753662109,
416
+ "learning_rate": 8.350775893179358e-06,
417
+ "loss": 1.5237,
418
+ "step": 560
419
+ },
420
+ {
421
+ "epoch": 0.5953002610966057,
422
+ "grad_norm": 5.866252899169922,
423
+ "learning_rate": 8.314687838325514e-06,
424
+ "loss": 1.491,
425
+ "step": 570
426
+ },
427
+ {
428
+ "epoch": 0.6057441253263708,
429
+ "grad_norm": 6.8628621101379395,
430
+ "learning_rate": 8.27859978347167e-06,
431
+ "loss": 1.3991,
432
+ "step": 580
433
+ },
434
+ {
435
+ "epoch": 0.6161879895561357,
436
+ "grad_norm": 5.9102654457092285,
437
+ "learning_rate": 8.242511728617829e-06,
438
+ "loss": 1.3842,
439
+ "step": 590
440
+ },
441
+ {
442
+ "epoch": 0.6266318537859008,
443
+ "grad_norm": 6.6509199142456055,
444
+ "learning_rate": 8.206423673763985e-06,
445
+ "loss": 1.4015,
446
+ "step": 600
447
+ },
448
+ {
449
+ "epoch": 0.6266318537859008,
450
+ "eval_loss": 1.3116086721420288,
451
+ "eval_runtime": 23.131,
452
+ "eval_samples_per_second": 36.79,
453
+ "eval_steps_per_second": 4.626,
454
+ "step": 600
455
+ },
456
+ {
457
+ "epoch": 0.6370757180156658,
458
+ "grad_norm": 10.004154205322266,
459
+ "learning_rate": 8.170335618910142e-06,
460
+ "loss": 1.4046,
461
+ "step": 610
462
+ },
463
+ {
464
+ "epoch": 0.6475195822454308,
465
+ "grad_norm": 6.851074695587158,
466
+ "learning_rate": 8.134247564056298e-06,
467
+ "loss": 1.3358,
468
+ "step": 620
469
+ },
470
+ {
471
+ "epoch": 0.6579634464751958,
472
+ "grad_norm": 6.676290512084961,
473
+ "learning_rate": 8.098159509202455e-06,
474
+ "loss": 1.2649,
475
+ "step": 630
476
+ },
477
+ {
478
+ "epoch": 0.6684073107049608,
479
+ "grad_norm": 6.854121685028076,
480
+ "learning_rate": 8.062071454348611e-06,
481
+ "loss": 1.4563,
482
+ "step": 640
483
+ },
484
+ {
485
+ "epoch": 0.6788511749347258,
486
+ "grad_norm": 7.104318141937256,
487
+ "learning_rate": 8.025983399494768e-06,
488
+ "loss": 1.2667,
489
+ "step": 650
490
+ },
491
+ {
492
+ "epoch": 0.6892950391644909,
493
+ "grad_norm": 6.3163957595825195,
494
+ "learning_rate": 7.989895344640924e-06,
495
+ "loss": 1.4532,
496
+ "step": 660
497
+ },
498
+ {
499
+ "epoch": 0.6997389033942559,
500
+ "grad_norm": 7.5210652351379395,
501
+ "learning_rate": 7.95380728978708e-06,
502
+ "loss": 1.179,
503
+ "step": 670
504
+ },
505
+ {
506
+ "epoch": 0.7101827676240209,
507
+ "grad_norm": 7.099525451660156,
508
+ "learning_rate": 7.917719234933237e-06,
509
+ "loss": 1.2406,
510
+ "step": 680
511
+ },
512
+ {
513
+ "epoch": 0.720626631853786,
514
+ "grad_norm": 6.814334392547607,
515
+ "learning_rate": 7.881631180079395e-06,
516
+ "loss": 1.1178,
517
+ "step": 690
518
+ },
519
+ {
520
+ "epoch": 0.7310704960835509,
521
+ "grad_norm": 6.895037651062012,
522
+ "learning_rate": 7.845543125225551e-06,
523
+ "loss": 1.3291,
524
+ "step": 700
525
+ },
526
+ {
527
+ "epoch": 0.741514360313316,
528
+ "grad_norm": 6.383569717407227,
529
+ "learning_rate": 7.809455070371708e-06,
530
+ "loss": 1.2887,
531
+ "step": 710
532
+ },
533
+ {
534
+ "epoch": 0.7519582245430809,
535
+ "grad_norm": 8.13758373260498,
536
+ "learning_rate": 7.773367015517864e-06,
537
+ "loss": 1.2576,
538
+ "step": 720
539
+ },
540
+ {
541
+ "epoch": 0.762402088772846,
542
+ "grad_norm": 5.2739362716674805,
543
+ "learning_rate": 7.73727896066402e-06,
544
+ "loss": 1.2835,
545
+ "step": 730
546
+ },
547
+ {
548
+ "epoch": 0.7728459530026109,
549
+ "grad_norm": 7.7005934715271,
550
+ "learning_rate": 7.701190905810177e-06,
551
+ "loss": 1.2633,
552
+ "step": 740
553
+ },
554
+ {
555
+ "epoch": 0.783289817232376,
556
+ "grad_norm": 6.738245964050293,
557
+ "learning_rate": 7.665102850956334e-06,
558
+ "loss": 1.2129,
559
+ "step": 750
560
+ },
561
+ {
562
+ "epoch": 0.793733681462141,
563
+ "grad_norm": 5.725705146789551,
564
+ "learning_rate": 7.629014796102491e-06,
565
+ "loss": 1.2501,
566
+ "step": 760
567
+ },
568
+ {
569
+ "epoch": 0.804177545691906,
570
+ "grad_norm": 6.256880760192871,
571
+ "learning_rate": 7.5929267412486475e-06,
572
+ "loss": 1.2567,
573
+ "step": 770
574
+ },
575
+ {
576
+ "epoch": 0.814621409921671,
577
+ "grad_norm": 6.835810661315918,
578
+ "learning_rate": 7.556838686394804e-06,
579
+ "loss": 1.2408,
580
+ "step": 780
581
+ },
582
+ {
583
+ "epoch": 0.825065274151436,
584
+ "grad_norm": 5.811525821685791,
585
+ "learning_rate": 7.52075063154096e-06,
586
+ "loss": 1.2409,
587
+ "step": 790
588
+ },
589
+ {
590
+ "epoch": 0.835509138381201,
591
+ "grad_norm": 8.554996490478516,
592
+ "learning_rate": 7.484662576687118e-06,
593
+ "loss": 1.2575,
594
+ "step": 800
595
+ },
596
+ {
597
+ "epoch": 0.835509138381201,
598
+ "eval_loss": 1.2006852626800537,
599
+ "eval_runtime": 23.1272,
600
+ "eval_samples_per_second": 36.796,
601
+ "eval_steps_per_second": 4.627,
602
+ "step": 800
603
+ },
604
+ {
605
+ "epoch": 0.8459530026109661,
606
+ "grad_norm": 7.091418266296387,
607
+ "learning_rate": 7.448574521833274e-06,
608
+ "loss": 1.0256,
609
+ "step": 810
610
+ },
611
+ {
612
+ "epoch": 0.856396866840731,
613
+ "grad_norm": 7.5542683601379395,
614
+ "learning_rate": 7.4124864669794306e-06,
615
+ "loss": 1.0605,
616
+ "step": 820
617
+ },
618
+ {
619
+ "epoch": 0.8668407310704961,
620
+ "grad_norm": 7.177179336547852,
621
+ "learning_rate": 7.376398412125587e-06,
622
+ "loss": 1.1747,
623
+ "step": 830
624
+ },
625
+ {
626
+ "epoch": 0.8772845953002611,
627
+ "grad_norm": 6.226783752441406,
628
+ "learning_rate": 7.3403103572717434e-06,
629
+ "loss": 1.1518,
630
+ "step": 840
631
+ },
632
+ {
633
+ "epoch": 0.8877284595300261,
634
+ "grad_norm": 7.062104225158691,
635
+ "learning_rate": 7.304222302417901e-06,
636
+ "loss": 1.2322,
637
+ "step": 850
638
+ },
639
+ {
640
+ "epoch": 0.8981723237597912,
641
+ "grad_norm": 8.756392478942871,
642
+ "learning_rate": 7.268134247564057e-06,
643
+ "loss": 1.271,
644
+ "step": 860
645
+ },
646
+ {
647
+ "epoch": 0.9086161879895561,
648
+ "grad_norm": 6.364953994750977,
649
+ "learning_rate": 7.232046192710214e-06,
650
+ "loss": 1.1853,
651
+ "step": 870
652
+ },
653
+ {
654
+ "epoch": 0.9190600522193212,
655
+ "grad_norm": 6.629589080810547,
656
+ "learning_rate": 7.19595813785637e-06,
657
+ "loss": 1.2279,
658
+ "step": 880
659
+ },
660
+ {
661
+ "epoch": 0.9295039164490861,
662
+ "grad_norm": 5.4719977378845215,
663
+ "learning_rate": 7.1598700830025265e-06,
664
+ "loss": 1.0789,
665
+ "step": 890
666
+ },
667
+ {
668
+ "epoch": 0.9399477806788512,
669
+ "grad_norm": 5.824141025543213,
670
+ "learning_rate": 7.123782028148684e-06,
671
+ "loss": 1.0435,
672
+ "step": 900
673
+ },
674
+ {
675
+ "epoch": 0.9503916449086162,
676
+ "grad_norm": 9.214339256286621,
677
+ "learning_rate": 7.08769397329484e-06,
678
+ "loss": 1.0518,
679
+ "step": 910
680
+ },
681
+ {
682
+ "epoch": 0.9608355091383812,
683
+ "grad_norm": 6.3007941246032715,
684
+ "learning_rate": 7.051605918440997e-06,
685
+ "loss": 1.0297,
686
+ "step": 920
687
+ },
688
+ {
689
+ "epoch": 0.9712793733681462,
690
+ "grad_norm": 5.431386947631836,
691
+ "learning_rate": 7.015517863587153e-06,
692
+ "loss": 1.1292,
693
+ "step": 930
694
+ },
695
+ {
696
+ "epoch": 0.9817232375979112,
697
+ "grad_norm": 8.941716194152832,
698
+ "learning_rate": 6.9794298087333105e-06,
699
+ "loss": 1.1617,
700
+ "step": 940
701
+ },
702
+ {
703
+ "epoch": 0.9921671018276762,
704
+ "grad_norm": 8.926597595214844,
705
+ "learning_rate": 6.943341753879467e-06,
706
+ "loss": 1.3497,
707
+ "step": 950
708
+ },
709
+ {
710
+ "epoch": 1.002088772845953,
711
+ "grad_norm": 6.556091785430908,
712
+ "learning_rate": 6.907253699025623e-06,
713
+ "loss": 1.176,
714
+ "step": 960
715
+ },
716
+ {
717
+ "epoch": 1.012532637075718,
718
+ "grad_norm": 12.022808074951172,
719
+ "learning_rate": 6.87116564417178e-06,
720
+ "loss": 0.9251,
721
+ "step": 970
722
+ },
723
+ {
724
+ "epoch": 1.022976501305483,
725
+ "grad_norm": 7.0122389793396,
726
+ "learning_rate": 6.835077589317936e-06,
727
+ "loss": 0.9231,
728
+ "step": 980
729
+ },
730
+ {
731
+ "epoch": 1.033420365535248,
732
+ "grad_norm": 6.355090141296387,
733
+ "learning_rate": 6.7989895344640936e-06,
734
+ "loss": 1.0371,
735
+ "step": 990
736
+ },
737
+ {
738
+ "epoch": 1.0438642297650131,
739
+ "grad_norm": 7.36827278137207,
740
+ "learning_rate": 6.76290147961025e-06,
741
+ "loss": 1.0666,
742
+ "step": 1000
743
+ },
744
+ {
745
+ "epoch": 1.0438642297650131,
746
+ "eval_loss": 1.1358416080474854,
747
+ "eval_runtime": 23.1354,
748
+ "eval_samples_per_second": 36.783,
749
+ "eval_steps_per_second": 4.625,
750
+ "step": 1000
751
+ },
752
+ {
753
+ "epoch": 1.054308093994778,
754
+ "grad_norm": 5.1998772621154785,
755
+ "learning_rate": 6.7268134247564065e-06,
756
+ "loss": 0.847,
757
+ "step": 1010
758
+ },
759
+ {
760
+ "epoch": 1.064751958224543,
761
+ "grad_norm": 6.828693866729736,
762
+ "learning_rate": 6.690725369902563e-06,
763
+ "loss": 1.0755,
764
+ "step": 1020
765
+ },
766
+ {
767
+ "epoch": 1.0751958224543081,
768
+ "grad_norm": 6.408750534057617,
769
+ "learning_rate": 6.65463731504872e-06,
770
+ "loss": 0.9039,
771
+ "step": 1030
772
+ },
773
+ {
774
+ "epoch": 1.0856396866840732,
775
+ "grad_norm": 7.197463035583496,
776
+ "learning_rate": 6.618549260194877e-06,
777
+ "loss": 0.959,
778
+ "step": 1040
779
+ },
780
+ {
781
+ "epoch": 1.096083550913838,
782
+ "grad_norm": 5.504906177520752,
783
+ "learning_rate": 6.582461205341033e-06,
784
+ "loss": 0.9467,
785
+ "step": 1050
786
+ },
787
+ {
788
+ "epoch": 1.106527415143603,
789
+ "grad_norm": 7.88711404800415,
790
+ "learning_rate": 6.5463731504871896e-06,
791
+ "loss": 1.0635,
792
+ "step": 1060
793
+ },
794
+ {
795
+ "epoch": 1.1169712793733682,
796
+ "grad_norm": 5.1561055183410645,
797
+ "learning_rate": 6.510285095633346e-06,
798
+ "loss": 0.9635,
799
+ "step": 1070
800
+ },
801
+ {
802
+ "epoch": 1.1274151436031332,
803
+ "grad_norm": 5.088303565979004,
804
+ "learning_rate": 6.474197040779503e-06,
805
+ "loss": 0.9605,
806
+ "step": 1080
807
+ },
808
+ {
809
+ "epoch": 1.137859007832898,
810
+ "grad_norm": 4.595157146453857,
811
+ "learning_rate": 6.43810898592566e-06,
812
+ "loss": 1.0494,
813
+ "step": 1090
814
+ },
815
+ {
816
+ "epoch": 1.1483028720626631,
817
+ "grad_norm": 6.682077884674072,
818
+ "learning_rate": 6.402020931071816e-06,
819
+ "loss": 0.888,
820
+ "step": 1100
821
+ },
822
+ {
823
+ "epoch": 1.1587467362924282,
824
+ "grad_norm": 9.08073902130127,
825
+ "learning_rate": 6.365932876217973e-06,
826
+ "loss": 1.1239,
827
+ "step": 1110
828
+ },
829
+ {
830
+ "epoch": 1.1691906005221933,
831
+ "grad_norm": 7.480021953582764,
832
+ "learning_rate": 6.329844821364129e-06,
833
+ "loss": 1.0337,
834
+ "step": 1120
835
+ },
836
+ {
837
+ "epoch": 1.1796344647519583,
838
+ "grad_norm": 5.750502109527588,
839
+ "learning_rate": 6.293756766510286e-06,
840
+ "loss": 1.132,
841
+ "step": 1130
842
+ },
843
+ {
844
+ "epoch": 1.1900783289817232,
845
+ "grad_norm": 6.151831150054932,
846
+ "learning_rate": 6.257668711656443e-06,
847
+ "loss": 0.9985,
848
+ "step": 1140
849
+ },
850
+ {
851
+ "epoch": 1.2005221932114882,
852
+ "grad_norm": 6.567698001861572,
853
+ "learning_rate": 6.221580656802599e-06,
854
+ "loss": 1.1177,
855
+ "step": 1150
856
+ },
857
+ {
858
+ "epoch": 1.2109660574412533,
859
+ "grad_norm": 6.1963887214660645,
860
+ "learning_rate": 6.185492601948756e-06,
861
+ "loss": 0.9328,
862
+ "step": 1160
863
+ },
864
+ {
865
+ "epoch": 1.2214099216710184,
866
+ "grad_norm": 5.044003009796143,
867
+ "learning_rate": 6.149404547094913e-06,
868
+ "loss": 0.8986,
869
+ "step": 1170
870
+ },
871
+ {
872
+ "epoch": 1.2318537859007832,
873
+ "grad_norm": 6.648619651794434,
874
+ "learning_rate": 6.113316492241068e-06,
875
+ "loss": 0.9905,
876
+ "step": 1180
877
+ },
878
+ {
879
+ "epoch": 1.2422976501305483,
880
+ "grad_norm": 9.110396385192871,
881
+ "learning_rate": 6.077228437387225e-06,
882
+ "loss": 1.0126,
883
+ "step": 1190
884
+ },
885
+ {
886
+ "epoch": 1.2527415143603133,
887
+ "grad_norm": 8.707374572753906,
888
+ "learning_rate": 6.0411403825333815e-06,
889
+ "loss": 0.8611,
890
+ "step": 1200
891
+ },
892
+ {
893
+ "epoch": 1.2527415143603133,
894
+ "eval_loss": 1.0788438320159912,
895
+ "eval_runtime": 23.1258,
896
+ "eval_samples_per_second": 36.799,
897
+ "eval_steps_per_second": 4.627,
898
+ "step": 1200
899
+ },
900
+ {
901
+ "epoch": 1.2631853785900784,
902
+ "grad_norm": 5.457468509674072,
903
+ "learning_rate": 6.005052327679538e-06,
904
+ "loss": 0.8327,
905
+ "step": 1210
906
+ },
907
+ {
908
+ "epoch": 1.2736292428198435,
909
+ "grad_norm": 6.050765037536621,
910
+ "learning_rate": 5.9689642728256944e-06,
911
+ "loss": 1.0782,
912
+ "step": 1220
913
+ },
914
+ {
915
+ "epoch": 1.2840731070496083,
916
+ "grad_norm": 6.254447937011719,
917
+ "learning_rate": 5.932876217971852e-06,
918
+ "loss": 0.9388,
919
+ "step": 1230
920
+ },
921
+ {
922
+ "epoch": 1.2945169712793734,
923
+ "grad_norm": 5.181304931640625,
924
+ "learning_rate": 5.896788163118008e-06,
925
+ "loss": 0.9711,
926
+ "step": 1240
927
+ },
928
+ {
929
+ "epoch": 1.3049608355091384,
930
+ "grad_norm": 6.832638263702393,
931
+ "learning_rate": 5.860700108264165e-06,
932
+ "loss": 1.0333,
933
+ "step": 1250
934
+ },
935
+ {
936
+ "epoch": 1.3154046997389033,
937
+ "grad_norm": 8.406023025512695,
938
+ "learning_rate": 5.824612053410321e-06,
939
+ "loss": 1.0902,
940
+ "step": 1260
941
+ },
942
+ {
943
+ "epoch": 1.3258485639686683,
944
+ "grad_norm": 6.346268653869629,
945
+ "learning_rate": 5.7885239985564775e-06,
946
+ "loss": 0.8651,
947
+ "step": 1270
948
+ },
949
+ {
950
+ "epoch": 1.3362924281984334,
951
+ "grad_norm": 8.447615623474121,
952
+ "learning_rate": 5.752435943702635e-06,
953
+ "loss": 1.046,
954
+ "step": 1280
955
+ },
956
+ {
957
+ "epoch": 1.3467362924281985,
958
+ "grad_norm": 8.351264953613281,
959
+ "learning_rate": 5.716347888848791e-06,
960
+ "loss": 1.029,
961
+ "step": 1290
962
+ },
963
+ {
964
+ "epoch": 1.3571801566579635,
965
+ "grad_norm": 6.036417007446289,
966
+ "learning_rate": 5.680259833994948e-06,
967
+ "loss": 1.0089,
968
+ "step": 1300
969
+ },
970
+ {
971
+ "epoch": 1.3676240208877284,
972
+ "grad_norm": 5.646811485290527,
973
+ "learning_rate": 5.644171779141104e-06,
974
+ "loss": 0.9346,
975
+ "step": 1310
976
+ },
977
+ {
978
+ "epoch": 1.3780678851174935,
979
+ "grad_norm": 6.4004950523376465,
980
+ "learning_rate": 5.608083724287261e-06,
981
+ "loss": 0.9489,
982
+ "step": 1320
983
+ },
984
+ {
985
+ "epoch": 1.3885117493472585,
986
+ "grad_norm": 5.746732234954834,
987
+ "learning_rate": 5.571995669433418e-06,
988
+ "loss": 0.9356,
989
+ "step": 1330
990
+ },
991
+ {
992
+ "epoch": 1.3989556135770234,
993
+ "grad_norm": 5.3405632972717285,
994
+ "learning_rate": 5.535907614579574e-06,
995
+ "loss": 0.7099,
996
+ "step": 1340
997
+ },
998
+ {
999
+ "epoch": 1.4093994778067884,
1000
+ "grad_norm": 6.424510955810547,
1001
+ "learning_rate": 5.499819559725731e-06,
1002
+ "loss": 0.6955,
1003
+ "step": 1350
1004
+ },
1005
+ {
1006
+ "epoch": 1.4198433420365535,
1007
+ "grad_norm": 5.751818656921387,
1008
+ "learning_rate": 5.463731504871887e-06,
1009
+ "loss": 0.9094,
1010
+ "step": 1360
1011
+ },
1012
+ {
1013
+ "epoch": 1.4302872062663186,
1014
+ "grad_norm": 7.787084579467773,
1015
+ "learning_rate": 5.4276434500180445e-06,
1016
+ "loss": 0.9618,
1017
+ "step": 1370
1018
+ },
1019
+ {
1020
+ "epoch": 1.4407310704960836,
1021
+ "grad_norm": 6.467785835266113,
1022
+ "learning_rate": 5.391555395164201e-06,
1023
+ "loss": 0.8549,
1024
+ "step": 1380
1025
+ },
1026
+ {
1027
+ "epoch": 1.4511749347258487,
1028
+ "grad_norm": 6.860886573791504,
1029
+ "learning_rate": 5.3554673403103574e-06,
1030
+ "loss": 0.8342,
1031
+ "step": 1390
1032
+ },
1033
+ {
1034
+ "epoch": 1.4616187989556135,
1035
+ "grad_norm": 5.627669811248779,
1036
+ "learning_rate": 5.319379285456514e-06,
1037
+ "loss": 0.9012,
1038
+ "step": 1400
1039
+ },
1040
+ {
1041
+ "epoch": 1.4616187989556135,
1042
+ "eval_loss": 1.0375572443008423,
1043
+ "eval_runtime": 23.1318,
1044
+ "eval_samples_per_second": 36.789,
1045
+ "eval_steps_per_second": 4.626,
1046
+ "step": 1400
1047
+ },
1048
+ {
1049
+ "epoch": 1.4720626631853786,
1050
+ "grad_norm": 7.144196033477783,
1051
+ "learning_rate": 5.28329123060267e-06,
1052
+ "loss": 0.9949,
1053
+ "step": 1410
1054
+ },
1055
+ {
1056
+ "epoch": 1.4825065274151437,
1057
+ "grad_norm": 6.208961486816406,
1058
+ "learning_rate": 5.247203175748828e-06,
1059
+ "loss": 1.0719,
1060
+ "step": 1420
1061
+ },
1062
+ {
1063
+ "epoch": 1.4929503916449085,
1064
+ "grad_norm": 7.110988140106201,
1065
+ "learning_rate": 5.211115120894984e-06,
1066
+ "loss": 0.9162,
1067
+ "step": 1430
1068
+ },
1069
+ {
1070
+ "epoch": 1.5033942558746736,
1071
+ "grad_norm": 6.903599739074707,
1072
+ "learning_rate": 5.1750270660411405e-06,
1073
+ "loss": 0.9974,
1074
+ "step": 1440
1075
+ },
1076
+ {
1077
+ "epoch": 1.5138381201044386,
1078
+ "grad_norm": 5.059232234954834,
1079
+ "learning_rate": 5.138939011187297e-06,
1080
+ "loss": 0.9602,
1081
+ "step": 1450
1082
+ },
1083
+ {
1084
+ "epoch": 1.5242819843342037,
1085
+ "grad_norm": 3.6045143604278564,
1086
+ "learning_rate": 5.1028509563334534e-06,
1087
+ "loss": 0.8448,
1088
+ "step": 1460
1089
+ },
1090
+ {
1091
+ "epoch": 1.5347258485639688,
1092
+ "grad_norm": 8.628995895385742,
1093
+ "learning_rate": 5.066762901479611e-06,
1094
+ "loss": 0.914,
1095
+ "step": 1470
1096
+ },
1097
+ {
1098
+ "epoch": 1.5451697127937338,
1099
+ "grad_norm": 7.227733612060547,
1100
+ "learning_rate": 5.030674846625767e-06,
1101
+ "loss": 0.9206,
1102
+ "step": 1480
1103
+ },
1104
+ {
1105
+ "epoch": 1.5556135770234987,
1106
+ "grad_norm": 7.930648326873779,
1107
+ "learning_rate": 4.994586791771924e-06,
1108
+ "loss": 1.1221,
1109
+ "step": 1490
1110
+ },
1111
+ {
1112
+ "epoch": 1.5660574412532637,
1113
+ "grad_norm": 6.340338706970215,
1114
+ "learning_rate": 4.95849873691808e-06,
1115
+ "loss": 0.8298,
1116
+ "step": 1500
1117
+ },
1118
+ {
1119
+ "epoch": 1.5765013054830286,
1120
+ "grad_norm": 5.558096408843994,
1121
+ "learning_rate": 4.922410682064237e-06,
1122
+ "loss": 0.9457,
1123
+ "step": 1510
1124
+ },
1125
+ {
1126
+ "epoch": 1.5869451697127936,
1127
+ "grad_norm": 7.608903408050537,
1128
+ "learning_rate": 4.886322627210394e-06,
1129
+ "loss": 0.9164,
1130
+ "step": 1520
1131
+ },
1132
+ {
1133
+ "epoch": 1.5973890339425587,
1134
+ "grad_norm": 9.46885871887207,
1135
+ "learning_rate": 4.85023457235655e-06,
1136
+ "loss": 0.9498,
1137
+ "step": 1530
1138
+ },
1139
+ {
1140
+ "epoch": 1.6078328981723238,
1141
+ "grad_norm": 7.6691107749938965,
1142
+ "learning_rate": 4.814146517502707e-06,
1143
+ "loss": 0.9764,
1144
+ "step": 1540
1145
+ },
1146
+ {
1147
+ "epoch": 1.6182767624020888,
1148
+ "grad_norm": 8.231538772583008,
1149
+ "learning_rate": 4.778058462648863e-06,
1150
+ "loss": 0.8314,
1151
+ "step": 1550
1152
+ },
1153
+ {
1154
+ "epoch": 1.628720626631854,
1155
+ "grad_norm": 5.868668556213379,
1156
+ "learning_rate": 4.7419704077950205e-06,
1157
+ "loss": 0.9127,
1158
+ "step": 1560
1159
+ },
1160
+ {
1161
+ "epoch": 1.6391644908616188,
1162
+ "grad_norm": 7.236291885375977,
1163
+ "learning_rate": 4.705882352941177e-06,
1164
+ "loss": 0.9485,
1165
+ "step": 1570
1166
+ },
1167
+ {
1168
+ "epoch": 1.6496083550913838,
1169
+ "grad_norm": 6.871162414550781,
1170
+ "learning_rate": 4.669794298087333e-06,
1171
+ "loss": 1.0603,
1172
+ "step": 1580
1173
+ },
1174
+ {
1175
+ "epoch": 1.6600522193211487,
1176
+ "grad_norm": 6.303982734680176,
1177
+ "learning_rate": 4.63370624323349e-06,
1178
+ "loss": 0.945,
1179
+ "step": 1590
1180
+ },
1181
+ {
1182
+ "epoch": 1.6704960835509137,
1183
+ "grad_norm": 7.167915344238281,
1184
+ "learning_rate": 4.597618188379647e-06,
1185
+ "loss": 0.7351,
1186
+ "step": 1600
1187
+ },
1188
+ {
1189
+ "epoch": 1.6704960835509137,
1190
+ "eval_loss": 1.0095878839492798,
1191
+ "eval_runtime": 23.1917,
1192
+ "eval_samples_per_second": 36.694,
1193
+ "eval_steps_per_second": 4.614,
1194
+ "step": 1600
1195
+ },
1196
+ {
1197
+ "epoch": 1.6809399477806788,
1198
+ "grad_norm": 6.626043796539307,
1199
+ "learning_rate": 4.5615301335258035e-06,
1200
+ "loss": 0.9369,
1201
+ "step": 1610
1202
+ },
1203
+ {
1204
+ "epoch": 1.6913838120104439,
1205
+ "grad_norm": 6.645303726196289,
1206
+ "learning_rate": 4.52544207867196e-06,
1207
+ "loss": 0.8509,
1208
+ "step": 1620
1209
+ },
1210
+ {
1211
+ "epoch": 1.701827676240209,
1212
+ "grad_norm": 7.132906913757324,
1213
+ "learning_rate": 4.4893540238181164e-06,
1214
+ "loss": 0.9522,
1215
+ "step": 1630
1216
+ },
1217
+ {
1218
+ "epoch": 1.712271540469974,
1219
+ "grad_norm": 6.155941009521484,
1220
+ "learning_rate": 4.453265968964273e-06,
1221
+ "loss": 0.9662,
1222
+ "step": 1640
1223
+ },
1224
+ {
1225
+ "epoch": 1.722715404699739,
1226
+ "grad_norm": 5.0147705078125,
1227
+ "learning_rate": 4.41717791411043e-06,
1228
+ "loss": 0.8755,
1229
+ "step": 1650
1230
+ },
1231
+ {
1232
+ "epoch": 1.733159268929504,
1233
+ "grad_norm": 7.039682388305664,
1234
+ "learning_rate": 4.381089859256587e-06,
1235
+ "loss": 0.8761,
1236
+ "step": 1660
1237
+ },
1238
+ {
1239
+ "epoch": 1.743603133159269,
1240
+ "grad_norm": 8.536678314208984,
1241
+ "learning_rate": 4.345001804402743e-06,
1242
+ "loss": 0.8054,
1243
+ "step": 1670
1244
+ },
1245
+ {
1246
+ "epoch": 1.7540469973890338,
1247
+ "grad_norm": 6.860482215881348,
1248
+ "learning_rate": 4.3089137495488995e-06,
1249
+ "loss": 0.9095,
1250
+ "step": 1680
1251
+ },
1252
+ {
1253
+ "epoch": 1.7644908616187989,
1254
+ "grad_norm": 6.3796563148498535,
1255
+ "learning_rate": 4.272825694695056e-06,
1256
+ "loss": 1.08,
1257
+ "step": 1690
1258
+ },
1259
+ {
1260
+ "epoch": 1.774934725848564,
1261
+ "grad_norm": 11.013219833374023,
1262
+ "learning_rate": 4.236737639841213e-06,
1263
+ "loss": 0.834,
1264
+ "step": 1700
1265
+ },
1266
+ {
1267
+ "epoch": 1.785378590078329,
1268
+ "grad_norm": 6.231220722198486,
1269
+ "learning_rate": 4.20064958498737e-06,
1270
+ "loss": 0.8488,
1271
+ "step": 1710
1272
+ },
1273
+ {
1274
+ "epoch": 1.795822454308094,
1275
+ "grad_norm": 7.019144535064697,
1276
+ "learning_rate": 4.164561530133526e-06,
1277
+ "loss": 0.7922,
1278
+ "step": 1720
1279
+ },
1280
+ {
1281
+ "epoch": 1.8062663185378591,
1282
+ "grad_norm": 6.586852550506592,
1283
+ "learning_rate": 4.128473475279683e-06,
1284
+ "loss": 0.8326,
1285
+ "step": 1730
1286
+ },
1287
+ {
1288
+ "epoch": 1.816710182767624,
1289
+ "grad_norm": 6.34022855758667,
1290
+ "learning_rate": 4.09238542042584e-06,
1291
+ "loss": 0.906,
1292
+ "step": 1740
1293
+ },
1294
+ {
1295
+ "epoch": 1.827154046997389,
1296
+ "grad_norm": 7.541686058044434,
1297
+ "learning_rate": 4.056297365571996e-06,
1298
+ "loss": 0.9029,
1299
+ "step": 1750
1300
+ },
1301
+ {
1302
+ "epoch": 1.837597911227154,
1303
+ "grad_norm": 5.867885589599609,
1304
+ "learning_rate": 4.020209310718153e-06,
1305
+ "loss": 0.8072,
1306
+ "step": 1760
1307
+ },
1308
+ {
1309
+ "epoch": 1.848041775456919,
1310
+ "grad_norm": 4.805484771728516,
1311
+ "learning_rate": 3.984121255864309e-06,
1312
+ "loss": 0.9158,
1313
+ "step": 1770
1314
+ },
1315
+ {
1316
+ "epoch": 1.858485639686684,
1317
+ "grad_norm": 5.949447154998779,
1318
+ "learning_rate": 3.948033201010466e-06,
1319
+ "loss": 0.8116,
1320
+ "step": 1780
1321
+ },
1322
+ {
1323
+ "epoch": 1.868929503916449,
1324
+ "grad_norm": 7.641289234161377,
1325
+ "learning_rate": 3.911945146156623e-06,
1326
+ "loss": 0.8824,
1327
+ "step": 1790
1328
+ },
1329
+ {
1330
+ "epoch": 1.8793733681462141,
1331
+ "grad_norm": 5.530484199523926,
1332
+ "learning_rate": 3.8758570913027795e-06,
1333
+ "loss": 0.9264,
1334
+ "step": 1800
1335
+ },
1336
+ {
1337
+ "epoch": 1.8793733681462141,
1338
+ "eval_loss": 0.9786838293075562,
1339
+ "eval_runtime": 23.1205,
1340
+ "eval_samples_per_second": 36.807,
1341
+ "eval_steps_per_second": 4.628,
1342
+ "step": 1800
1343
+ },
1344
+ {
1345
+ "epoch": 1.8898172323759792,
1346
+ "grad_norm": 5.981470584869385,
1347
+ "learning_rate": 3.839769036448936e-06,
1348
+ "loss": 0.7929,
1349
+ "step": 1810
1350
+ },
1351
+ {
1352
+ "epoch": 1.9002610966057443,
1353
+ "grad_norm": 8.019466400146484,
1354
+ "learning_rate": 3.8036809815950928e-06,
1355
+ "loss": 0.7616,
1356
+ "step": 1820
1357
+ },
1358
+ {
1359
+ "epoch": 1.9107049608355091,
1360
+ "grad_norm": 7.649405479431152,
1361
+ "learning_rate": 3.7675929267412492e-06,
1362
+ "loss": 0.8661,
1363
+ "step": 1830
1364
+ },
1365
+ {
1366
+ "epoch": 1.9211488250652742,
1367
+ "grad_norm": 8.259872436523438,
1368
+ "learning_rate": 3.7315048718874057e-06,
1369
+ "loss": 1.0303,
1370
+ "step": 1840
1371
+ },
1372
+ {
1373
+ "epoch": 1.931592689295039,
1374
+ "grad_norm": 5.947850227355957,
1375
+ "learning_rate": 3.695416817033562e-06,
1376
+ "loss": 0.9326,
1377
+ "step": 1850
1378
+ },
1379
+ {
1380
+ "epoch": 1.942036553524804,
1381
+ "grad_norm": 5.193607330322266,
1382
+ "learning_rate": 3.6593287621797186e-06,
1383
+ "loss": 0.8527,
1384
+ "step": 1860
1385
+ },
1386
+ {
1387
+ "epoch": 1.9524804177545692,
1388
+ "grad_norm": 6.412321090698242,
1389
+ "learning_rate": 3.623240707325875e-06,
1390
+ "loss": 0.8768,
1391
+ "step": 1870
1392
+ },
1393
+ {
1394
+ "epoch": 1.9629242819843342,
1395
+ "grad_norm": 6.859325408935547,
1396
+ "learning_rate": 3.587152652472032e-06,
1397
+ "loss": 0.8131,
1398
+ "step": 1880
1399
+ },
1400
+ {
1401
+ "epoch": 1.9733681462140993,
1402
+ "grad_norm": 4.910820484161377,
1403
+ "learning_rate": 3.5510645976181883e-06,
1404
+ "loss": 0.7232,
1405
+ "step": 1890
1406
+ },
1407
+ {
1408
+ "epoch": 1.9838120104438643,
1409
+ "grad_norm": 6.052480220794678,
1410
+ "learning_rate": 3.5149765427643452e-06,
1411
+ "loss": 0.7567,
1412
+ "step": 1900
1413
+ },
1414
+ {
1415
+ "epoch": 1.9942558746736292,
1416
+ "grad_norm": 6.609030246734619,
1417
+ "learning_rate": 3.4788884879105017e-06,
1418
+ "loss": 0.8219,
1419
+ "step": 1910
1420
+ },
1421
+ {
1422
+ "epoch": 2.004177545691906,
1423
+ "grad_norm": 4.539740562438965,
1424
+ "learning_rate": 3.4428004330566585e-06,
1425
+ "loss": 0.8498,
1426
+ "step": 1920
1427
+ },
1428
+ {
1429
+ "epoch": 2.014621409921671,
1430
+ "grad_norm": 5.981167316436768,
1431
+ "learning_rate": 3.406712378202815e-06,
1432
+ "loss": 0.7024,
1433
+ "step": 1930
1434
+ },
1435
+ {
1436
+ "epoch": 2.025065274151436,
1437
+ "grad_norm": 6.600665092468262,
1438
+ "learning_rate": 3.3706243233489714e-06,
1439
+ "loss": 0.7467,
1440
+ "step": 1940
1441
+ },
1442
+ {
1443
+ "epoch": 2.035509138381201,
1444
+ "grad_norm": 5.560609340667725,
1445
+ "learning_rate": 3.3345362684951283e-06,
1446
+ "loss": 0.7434,
1447
+ "step": 1950
1448
+ },
1449
+ {
1450
+ "epoch": 2.045953002610966,
1451
+ "grad_norm": 4.679533004760742,
1452
+ "learning_rate": 3.2984482136412848e-06,
1453
+ "loss": 0.7125,
1454
+ "step": 1960
1455
+ },
1456
+ {
1457
+ "epoch": 2.056396866840731,
1458
+ "grad_norm": 7.177086353302002,
1459
+ "learning_rate": 3.2623601587874416e-06,
1460
+ "loss": 0.6745,
1461
+ "step": 1970
1462
+ },
1463
+ {
1464
+ "epoch": 2.066840731070496,
1465
+ "grad_norm": 6.860986709594727,
1466
+ "learning_rate": 3.226272103933598e-06,
1467
+ "loss": 0.6814,
1468
+ "step": 1980
1469
+ },
1470
+ {
1471
+ "epoch": 2.077284595300261,
1472
+ "grad_norm": 8.40719223022461,
1473
+ "learning_rate": 3.190184049079755e-06,
1474
+ "loss": 0.7077,
1475
+ "step": 1990
1476
+ },
1477
+ {
1478
+ "epoch": 2.0877284595300263,
1479
+ "grad_norm": 5.830367088317871,
1480
+ "learning_rate": 3.1540959942259114e-06,
1481
+ "loss": 0.8435,
1482
+ "step": 2000
1483
+ },
1484
+ {
1485
+ "epoch": 2.0877284595300263,
1486
+ "eval_loss": 0.9753687381744385,
1487
+ "eval_runtime": 23.1964,
1488
+ "eval_samples_per_second": 36.687,
1489
+ "eval_steps_per_second": 4.613,
1490
+ "step": 2000
1491
+ },
1492
+ {
1493
+ "epoch": 2.0981723237597913,
1494
+ "grad_norm": 6.964289665222168,
1495
+ "learning_rate": 3.118007939372068e-06,
1496
+ "loss": 0.6964,
1497
+ "step": 2010
1498
+ },
1499
+ {
1500
+ "epoch": 2.108616187989556,
1501
+ "grad_norm": 6.764989852905273,
1502
+ "learning_rate": 3.0819198845182247e-06,
1503
+ "loss": 0.7107,
1504
+ "step": 2020
1505
+ },
1506
+ {
1507
+ "epoch": 2.119060052219321,
1508
+ "grad_norm": 7.992194652557373,
1509
+ "learning_rate": 3.045831829664381e-06,
1510
+ "loss": 0.7146,
1511
+ "step": 2030
1512
+ },
1513
+ {
1514
+ "epoch": 2.129503916449086,
1515
+ "grad_norm": 7.102138996124268,
1516
+ "learning_rate": 3.009743774810538e-06,
1517
+ "loss": 0.686,
1518
+ "step": 2040
1519
+ },
1520
+ {
1521
+ "epoch": 2.139947780678851,
1522
+ "grad_norm": 7.446751117706299,
1523
+ "learning_rate": 2.9736557199566945e-06,
1524
+ "loss": 0.7681,
1525
+ "step": 2050
1526
+ },
1527
+ {
1528
+ "epoch": 2.1503916449086162,
1529
+ "grad_norm": 7.091776371002197,
1530
+ "learning_rate": 2.9375676651028514e-06,
1531
+ "loss": 0.7674,
1532
+ "step": 2060
1533
+ },
1534
+ {
1535
+ "epoch": 2.1608355091383813,
1536
+ "grad_norm": 7.994192123413086,
1537
+ "learning_rate": 2.901479610249008e-06,
1538
+ "loss": 0.7187,
1539
+ "step": 2070
1540
+ },
1541
+ {
1542
+ "epoch": 2.1712793733681464,
1543
+ "grad_norm": 4.8329386711120605,
1544
+ "learning_rate": 2.8653915553951643e-06,
1545
+ "loss": 0.7501,
1546
+ "step": 2080
1547
+ },
1548
+ {
1549
+ "epoch": 2.1817232375979114,
1550
+ "grad_norm": 6.802753925323486,
1551
+ "learning_rate": 2.829303500541321e-06,
1552
+ "loss": 0.5658,
1553
+ "step": 2090
1554
+ },
1555
+ {
1556
+ "epoch": 2.192167101827676,
1557
+ "grad_norm": 7.07351541519165,
1558
+ "learning_rate": 2.7932154456874776e-06,
1559
+ "loss": 0.8106,
1560
+ "step": 2100
1561
+ },
1562
+ {
1563
+ "epoch": 2.202610966057441,
1564
+ "grad_norm": 6.761138916015625,
1565
+ "learning_rate": 2.7571273908336344e-06,
1566
+ "loss": 0.7434,
1567
+ "step": 2110
1568
+ },
1569
+ {
1570
+ "epoch": 2.213054830287206,
1571
+ "grad_norm": 4.220724582672119,
1572
+ "learning_rate": 2.721039335979791e-06,
1573
+ "loss": 0.7526,
1574
+ "step": 2120
1575
+ },
1576
+ {
1577
+ "epoch": 2.2234986945169712,
1578
+ "grad_norm": 6.14243745803833,
1579
+ "learning_rate": 2.6849512811259478e-06,
1580
+ "loss": 0.6486,
1581
+ "step": 2130
1582
+ },
1583
+ {
1584
+ "epoch": 2.2339425587467363,
1585
+ "grad_norm": 8.640827178955078,
1586
+ "learning_rate": 2.6488632262721042e-06,
1587
+ "loss": 0.5949,
1588
+ "step": 2140
1589
+ },
1590
+ {
1591
+ "epoch": 2.2443864229765014,
1592
+ "grad_norm": 6.576625823974609,
1593
+ "learning_rate": 2.6127751714182607e-06,
1594
+ "loss": 0.6923,
1595
+ "step": 2150
1596
+ },
1597
+ {
1598
+ "epoch": 2.2548302872062664,
1599
+ "grad_norm": 6.136504173278809,
1600
+ "learning_rate": 2.5766871165644175e-06,
1601
+ "loss": 0.6255,
1602
+ "step": 2160
1603
+ },
1604
+ {
1605
+ "epoch": 2.2652741514360315,
1606
+ "grad_norm": 7.2910847663879395,
1607
+ "learning_rate": 2.540599061710574e-06,
1608
+ "loss": 0.6843,
1609
+ "step": 2170
1610
+ },
1611
+ {
1612
+ "epoch": 2.275718015665796,
1613
+ "grad_norm": 6.936916351318359,
1614
+ "learning_rate": 2.504511006856731e-06,
1615
+ "loss": 0.6751,
1616
+ "step": 2180
1617
+ },
1618
+ {
1619
+ "epoch": 2.286161879895561,
1620
+ "grad_norm": 5.899853229522705,
1621
+ "learning_rate": 2.4684229520028873e-06,
1622
+ "loss": 0.6584,
1623
+ "step": 2190
1624
+ },
1625
+ {
1626
+ "epoch": 2.2966057441253263,
1627
+ "grad_norm": 4.891731262207031,
1628
+ "learning_rate": 2.4323348971490438e-06,
1629
+ "loss": 0.6373,
1630
+ "step": 2200
1631
+ },
1632
+ {
1633
+ "epoch": 2.2966057441253263,
1634
+ "eval_loss": 0.958043098449707,
1635
+ "eval_runtime": 23.1881,
1636
+ "eval_samples_per_second": 36.7,
1637
+ "eval_steps_per_second": 4.614,
1638
+ "step": 2200
1639
+ },
1640
+ {
1641
+ "epoch": 2.3070496083550913,
1642
+ "grad_norm": 6.206886291503906,
1643
+ "learning_rate": 2.3962468422952e-06,
1644
+ "loss": 0.6725,
1645
+ "step": 2210
1646
+ },
1647
+ {
1648
+ "epoch": 2.3174934725848564,
1649
+ "grad_norm": 4.663551330566406,
1650
+ "learning_rate": 2.360158787441357e-06,
1651
+ "loss": 0.5578,
1652
+ "step": 2220
1653
+ },
1654
+ {
1655
+ "epoch": 2.3279373368146214,
1656
+ "grad_norm": 6.175649166107178,
1657
+ "learning_rate": 2.3240707325875135e-06,
1658
+ "loss": 0.6835,
1659
+ "step": 2230
1660
+ },
1661
+ {
1662
+ "epoch": 2.3383812010443865,
1663
+ "grad_norm": 6.676774501800537,
1664
+ "learning_rate": 2.2879826777336704e-06,
1665
+ "loss": 0.7966,
1666
+ "step": 2240
1667
+ },
1668
+ {
1669
+ "epoch": 2.3488250652741516,
1670
+ "grad_norm": 8.847614288330078,
1671
+ "learning_rate": 2.251894622879827e-06,
1672
+ "loss": 0.7679,
1673
+ "step": 2250
1674
+ },
1675
+ {
1676
+ "epoch": 2.3592689295039166,
1677
+ "grad_norm": 6.491757869720459,
1678
+ "learning_rate": 2.2158065680259837e-06,
1679
+ "loss": 0.6274,
1680
+ "step": 2260
1681
+ },
1682
+ {
1683
+ "epoch": 2.3697127937336813,
1684
+ "grad_norm": 6.540876388549805,
1685
+ "learning_rate": 2.17971851317214e-06,
1686
+ "loss": 0.674,
1687
+ "step": 2270
1688
+ },
1689
+ {
1690
+ "epoch": 2.3801566579634463,
1691
+ "grad_norm": 7.067712306976318,
1692
+ "learning_rate": 2.1436304583182966e-06,
1693
+ "loss": 0.6716,
1694
+ "step": 2280
1695
+ },
1696
+ {
1697
+ "epoch": 2.3906005221932114,
1698
+ "grad_norm": 4.959332466125488,
1699
+ "learning_rate": 2.1075424034644535e-06,
1700
+ "loss": 0.6729,
1701
+ "step": 2290
1702
+ },
1703
+ {
1704
+ "epoch": 2.4010443864229765,
1705
+ "grad_norm": 4.016025066375732,
1706
+ "learning_rate": 2.07145434861061e-06,
1707
+ "loss": 0.6358,
1708
+ "step": 2300
1709
+ },
1710
+ {
1711
+ "epoch": 2.4114882506527415,
1712
+ "grad_norm": 4.044537544250488,
1713
+ "learning_rate": 2.035366293756767e-06,
1714
+ "loss": 0.6867,
1715
+ "step": 2310
1716
+ },
1717
+ {
1718
+ "epoch": 2.4219321148825066,
1719
+ "grad_norm": 4.88841438293457,
1720
+ "learning_rate": 1.9992782389029233e-06,
1721
+ "loss": 0.7561,
1722
+ "step": 2320
1723
+ },
1724
+ {
1725
+ "epoch": 2.4323759791122717,
1726
+ "grad_norm": 7.33749532699585,
1727
+ "learning_rate": 1.96319018404908e-06,
1728
+ "loss": 0.6579,
1729
+ "step": 2330
1730
+ },
1731
+ {
1732
+ "epoch": 2.4428198433420367,
1733
+ "grad_norm": 6.818521976470947,
1734
+ "learning_rate": 1.9271021291952366e-06,
1735
+ "loss": 0.7322,
1736
+ "step": 2340
1737
+ },
1738
+ {
1739
+ "epoch": 2.453263707571802,
1740
+ "grad_norm": 5.549405097961426,
1741
+ "learning_rate": 1.8910140743413932e-06,
1742
+ "loss": 0.634,
1743
+ "step": 2350
1744
+ },
1745
+ {
1746
+ "epoch": 2.4637075718015664,
1747
+ "grad_norm": 6.154874801635742,
1748
+ "learning_rate": 1.85492601948755e-06,
1749
+ "loss": 0.6044,
1750
+ "step": 2360
1751
+ },
1752
+ {
1753
+ "epoch": 2.4741514360313315,
1754
+ "grad_norm": 5.5303521156311035,
1755
+ "learning_rate": 1.8188379646337066e-06,
1756
+ "loss": 0.6833,
1757
+ "step": 2370
1758
+ },
1759
+ {
1760
+ "epoch": 2.4845953002610965,
1761
+ "grad_norm": 6.135169982910156,
1762
+ "learning_rate": 1.7827499097798632e-06,
1763
+ "loss": 0.7765,
1764
+ "step": 2380
1765
+ },
1766
+ {
1767
+ "epoch": 2.4950391644908616,
1768
+ "grad_norm": 7.397289752960205,
1769
+ "learning_rate": 1.7466618549260197e-06,
1770
+ "loss": 0.6748,
1771
+ "step": 2390
1772
+ },
1773
+ {
1774
+ "epoch": 2.5054830287206267,
1775
+ "grad_norm": 5.909689426422119,
1776
+ "learning_rate": 1.7105738000721763e-06,
1777
+ "loss": 0.6791,
1778
+ "step": 2400
1779
+ },
1780
+ {
1781
+ "epoch": 2.5054830287206267,
1782
+ "eval_loss": 0.955007791519165,
1783
+ "eval_runtime": 23.2476,
1784
+ "eval_samples_per_second": 36.606,
1785
+ "eval_steps_per_second": 4.603,
1786
+ "step": 2400
1787
+ },
1788
+ {
1789
+ "epoch": 2.5159268929503917,
1790
+ "grad_norm": 6.320558547973633,
1791
+ "learning_rate": 1.6744857452183328e-06,
1792
+ "loss": 0.6219,
1793
+ "step": 2410
1794
+ },
1795
+ {
1796
+ "epoch": 2.526370757180157,
1797
+ "grad_norm": 7.978168487548828,
1798
+ "learning_rate": 1.6383976903644894e-06,
1799
+ "loss": 0.8254,
1800
+ "step": 2420
1801
+ },
1802
+ {
1803
+ "epoch": 2.5368146214099214,
1804
+ "grad_norm": 6.5808210372924805,
1805
+ "learning_rate": 1.602309635510646e-06,
1806
+ "loss": 0.7243,
1807
+ "step": 2430
1808
+ },
1809
+ {
1810
+ "epoch": 2.547258485639687,
1811
+ "grad_norm": 4.769480228424072,
1812
+ "learning_rate": 1.5662215806568025e-06,
1813
+ "loss": 0.7497,
1814
+ "step": 2440
1815
+ },
1816
+ {
1817
+ "epoch": 2.5577023498694516,
1818
+ "grad_norm": 5.738780975341797,
1819
+ "learning_rate": 1.5301335258029592e-06,
1820
+ "loss": 0.7048,
1821
+ "step": 2450
1822
+ },
1823
+ {
1824
+ "epoch": 2.5681462140992166,
1825
+ "grad_norm": 5.658013343811035,
1826
+ "learning_rate": 1.4940454709491159e-06,
1827
+ "loss": 0.739,
1828
+ "step": 2460
1829
+ },
1830
+ {
1831
+ "epoch": 2.5785900783289817,
1832
+ "grad_norm": 6.587325096130371,
1833
+ "learning_rate": 1.4579574160952725e-06,
1834
+ "loss": 0.7076,
1835
+ "step": 2470
1836
+ },
1837
+ {
1838
+ "epoch": 2.5890339425587467,
1839
+ "grad_norm": 5.956645965576172,
1840
+ "learning_rate": 1.4218693612414292e-06,
1841
+ "loss": 0.6372,
1842
+ "step": 2480
1843
+ },
1844
+ {
1845
+ "epoch": 2.599477806788512,
1846
+ "grad_norm": 5.966655731201172,
1847
+ "learning_rate": 1.3857813063875859e-06,
1848
+ "loss": 0.6934,
1849
+ "step": 2490
1850
+ },
1851
+ {
1852
+ "epoch": 2.609921671018277,
1853
+ "grad_norm": 5.313653945922852,
1854
+ "learning_rate": 1.3496932515337425e-06,
1855
+ "loss": 0.7163,
1856
+ "step": 2500
1857
+ },
1858
+ {
1859
+ "epoch": 2.620365535248042,
1860
+ "grad_norm": 6.935596466064453,
1861
+ "learning_rate": 1.313605196679899e-06,
1862
+ "loss": 0.7104,
1863
+ "step": 2510
1864
+ },
1865
+ {
1866
+ "epoch": 2.6308093994778066,
1867
+ "grad_norm": 4.822442054748535,
1868
+ "learning_rate": 1.2775171418260556e-06,
1869
+ "loss": 0.6378,
1870
+ "step": 2520
1871
+ },
1872
+ {
1873
+ "epoch": 2.641253263707572,
1874
+ "grad_norm": 5.288422107696533,
1875
+ "learning_rate": 1.2414290869722123e-06,
1876
+ "loss": 0.6463,
1877
+ "step": 2530
1878
+ },
1879
+ {
1880
+ "epoch": 2.6516971279373367,
1881
+ "grad_norm": 6.668851852416992,
1882
+ "learning_rate": 1.205341032118369e-06,
1883
+ "loss": 0.7505,
1884
+ "step": 2540
1885
+ },
1886
+ {
1887
+ "epoch": 2.6621409921671018,
1888
+ "grad_norm": 5.71054220199585,
1889
+ "learning_rate": 1.1692529772645256e-06,
1890
+ "loss": 0.5856,
1891
+ "step": 2550
1892
+ },
1893
+ {
1894
+ "epoch": 2.672584856396867,
1895
+ "grad_norm": 6.284550666809082,
1896
+ "learning_rate": 1.1331649224106823e-06,
1897
+ "loss": 0.8122,
1898
+ "step": 2560
1899
+ },
1900
+ {
1901
+ "epoch": 2.683028720626632,
1902
+ "grad_norm": 8.781463623046875,
1903
+ "learning_rate": 1.097076867556839e-06,
1904
+ "loss": 0.7063,
1905
+ "step": 2570
1906
+ },
1907
+ {
1908
+ "epoch": 2.693472584856397,
1909
+ "grad_norm": 7.29454231262207,
1910
+ "learning_rate": 1.0609888127029954e-06,
1911
+ "loss": 0.7429,
1912
+ "step": 2580
1913
+ },
1914
+ {
1915
+ "epoch": 2.703916449086162,
1916
+ "grad_norm": 5.689371109008789,
1917
+ "learning_rate": 1.024900757849152e-06,
1918
+ "loss": 0.6658,
1919
+ "step": 2590
1920
+ },
1921
+ {
1922
+ "epoch": 2.714360313315927,
1923
+ "grad_norm": 7.286506175994873,
1924
+ "learning_rate": 9.888127029953087e-07,
1925
+ "loss": 0.748,
1926
+ "step": 2600
1927
+ },
1928
+ {
1929
+ "epoch": 2.714360313315927,
1930
+ "eval_loss": 0.9431054592132568,
1931
+ "eval_runtime": 23.1747,
1932
+ "eval_samples_per_second": 36.721,
1933
+ "eval_steps_per_second": 4.617,
1934
+ "step": 2600
1935
+ },
1936
+ {
1937
+ "epoch": 2.7248041775456917,
1938
+ "grad_norm": 5.635782241821289,
1939
+ "learning_rate": 9.527246481414652e-07,
1940
+ "loss": 0.673,
1941
+ "step": 2610
1942
+ },
1943
+ {
1944
+ "epoch": 2.7352480417754568,
1945
+ "grad_norm": 5.282413959503174,
1946
+ "learning_rate": 9.166365932876219e-07,
1947
+ "loss": 0.9037,
1948
+ "step": 2620
1949
+ },
1950
+ {
1951
+ "epoch": 2.745691906005222,
1952
+ "grad_norm": 7.922749042510986,
1953
+ "learning_rate": 8.805485384337785e-07,
1954
+ "loss": 0.6862,
1955
+ "step": 2630
1956
+ },
1957
+ {
1958
+ "epoch": 2.756135770234987,
1959
+ "grad_norm": 5.463962078094482,
1960
+ "learning_rate": 8.444604835799351e-07,
1961
+ "loss": 0.6,
1962
+ "step": 2640
1963
+ },
1964
+ {
1965
+ "epoch": 2.766579634464752,
1966
+ "grad_norm": 8.0007963180542,
1967
+ "learning_rate": 8.083724287260918e-07,
1968
+ "loss": 0.6688,
1969
+ "step": 2650
1970
+ },
1971
+ {
1972
+ "epoch": 2.777023498694517,
1973
+ "grad_norm": 7.617900371551514,
1974
+ "learning_rate": 7.722843738722483e-07,
1975
+ "loss": 0.7277,
1976
+ "step": 2660
1977
+ },
1978
+ {
1979
+ "epoch": 2.787467362924282,
1980
+ "grad_norm": 5.969784259796143,
1981
+ "learning_rate": 7.36196319018405e-07,
1982
+ "loss": 0.7085,
1983
+ "step": 2670
1984
+ },
1985
+ {
1986
+ "epoch": 2.7979112271540467,
1987
+ "grad_norm": 5.169407367706299,
1988
+ "learning_rate": 7.001082641645617e-07,
1989
+ "loss": 0.7344,
1990
+ "step": 2680
1991
+ },
1992
+ {
1993
+ "epoch": 2.8083550913838122,
1994
+ "grad_norm": 8.009687423706055,
1995
+ "learning_rate": 6.640202093107181e-07,
1996
+ "loss": 0.6457,
1997
+ "step": 2690
1998
+ },
1999
+ {
2000
+ "epoch": 2.818798955613577,
2001
+ "grad_norm": 7.4137187004089355,
2002
+ "learning_rate": 6.279321544568748e-07,
2003
+ "loss": 0.6416,
2004
+ "step": 2700
2005
+ },
2006
+ {
2007
+ "epoch": 2.829242819843342,
2008
+ "grad_norm": 5.35453462600708,
2009
+ "learning_rate": 5.918440996030314e-07,
2010
+ "loss": 0.7867,
2011
+ "step": 2710
2012
+ },
2013
+ {
2014
+ "epoch": 2.839686684073107,
2015
+ "grad_norm": 7.469908237457275,
2016
+ "learning_rate": 5.557560447491881e-07,
2017
+ "loss": 0.7955,
2018
+ "step": 2720
2019
+ },
2020
+ {
2021
+ "epoch": 2.850130548302872,
2022
+ "grad_norm": 6.326605319976807,
2023
+ "learning_rate": 5.196679898953446e-07,
2024
+ "loss": 0.6995,
2025
+ "step": 2730
2026
+ },
2027
+ {
2028
+ "epoch": 2.860574412532637,
2029
+ "grad_norm": 7.096553802490234,
2030
+ "learning_rate": 4.835799350415013e-07,
2031
+ "loss": 0.6625,
2032
+ "step": 2740
2033
+ },
2034
+ {
2035
+ "epoch": 2.871018276762402,
2036
+ "grad_norm": 5.128674507141113,
2037
+ "learning_rate": 4.474918801876579e-07,
2038
+ "loss": 0.6791,
2039
+ "step": 2750
2040
+ },
2041
+ {
2042
+ "epoch": 2.8814621409921672,
2043
+ "grad_norm": 6.457350254058838,
2044
+ "learning_rate": 4.1140382533381457e-07,
2045
+ "loss": 0.7586,
2046
+ "step": 2760
2047
+ },
2048
+ {
2049
+ "epoch": 2.891906005221932,
2050
+ "grad_norm": 6.231655597686768,
2051
+ "learning_rate": 3.753157704799711e-07,
2052
+ "loss": 0.677,
2053
+ "step": 2770
2054
+ },
2055
+ {
2056
+ "epoch": 2.9023498694516974,
2057
+ "grad_norm": 6.412544250488281,
2058
+ "learning_rate": 3.392277156261278e-07,
2059
+ "loss": 0.7449,
2060
+ "step": 2780
2061
+ },
2062
+ {
2063
+ "epoch": 2.912793733681462,
2064
+ "grad_norm": 3.725374698638916,
2065
+ "learning_rate": 3.031396607722844e-07,
2066
+ "loss": 0.5804,
2067
+ "step": 2790
2068
+ },
2069
+ {
2070
+ "epoch": 2.923237597911227,
2071
+ "grad_norm": 5.284286975860596,
2072
+ "learning_rate": 2.67051605918441e-07,
2073
+ "loss": 0.571,
2074
+ "step": 2800
2075
+ },
2076
+ {
2077
+ "epoch": 2.923237597911227,
2078
+ "eval_loss": 0.94057297706604,
2079
+ "eval_runtime": 23.1841,
2080
+ "eval_samples_per_second": 36.706,
2081
+ "eval_steps_per_second": 4.615,
2082
+ "step": 2800
2083
+ },
2084
+ {
2085
+ "epoch": 2.933681462140992,
2086
+ "grad_norm": 7.735732555389404,
2087
+ "learning_rate": 2.3096355106459763e-07,
2088
+ "loss": 0.6486,
2089
+ "step": 2810
2090
+ },
2091
+ {
2092
+ "epoch": 2.944125326370757,
2093
+ "grad_norm": 5.862439155578613,
2094
+ "learning_rate": 1.9487549621075424e-07,
2095
+ "loss": 0.6731,
2096
+ "step": 2820
2097
+ },
2098
+ {
2099
+ "epoch": 2.9545691906005223,
2100
+ "grad_norm": 7.650035858154297,
2101
+ "learning_rate": 1.5878744135691087e-07,
2102
+ "loss": 0.7415,
2103
+ "step": 2830
2104
+ },
2105
+ {
2106
+ "epoch": 2.9650130548302873,
2107
+ "grad_norm": 5.932896614074707,
2108
+ "learning_rate": 1.226993865030675e-07,
2109
+ "loss": 0.701,
2110
+ "step": 2840
2111
+ },
2112
+ {
2113
+ "epoch": 2.9754569190600524,
2114
+ "grad_norm": 5.462695598602295,
2115
+ "learning_rate": 8.661133164922412e-08,
2116
+ "loss": 0.5798,
2117
+ "step": 2850
2118
+ },
2119
+ {
2120
+ "epoch": 2.985900783289817,
2121
+ "grad_norm": 7.089748382568359,
2122
+ "learning_rate": 5.0523276795380736e-08,
2123
+ "loss": 0.7796,
2124
+ "step": 2860
2125
+ },
2126
+ {
2127
+ "epoch": 2.9963446475195825,
2128
+ "grad_norm": 7.061713695526123,
2129
+ "learning_rate": 1.4435221941537352e-08,
2130
+ "loss": 0.6495,
2131
+ "step": 2870
2132
+ }
2133
+ ],
2134
+ "logging_steps": 10,
2135
+ "max_steps": 2871,
2136
+ "num_input_tokens_seen": 0,
2137
+ "num_train_epochs": 3,
2138
+ "save_steps": 200,
2139
+ "stateful_callbacks": {
2140
+ "TrainerControl": {
2141
+ "args": {
2142
+ "should_epoch_stop": false,
2143
+ "should_evaluate": false,
2144
+ "should_log": false,
2145
+ "should_save": true,
2146
+ "should_training_stop": true
2147
+ },
2148
+ "attributes": {}
2149
+ }
2150
+ },
2151
+ "total_flos": 3.1064837698093056e+16,
2152
+ "train_batch_size": 2,
2153
+ "trial_name": null,
2154
+ "trial_params": null
2155
+ }
checkpoint-2871/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61994a9000437d03570fb294e230b58ca459988c8e493e7a09ed1c5e37d56ce8
3
+ size 5240
checkpoint-2871/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151643,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "max_position_embeddings": 32768,
15
+ "max_window_layers": 28,
16
+ "model_type": "qwen3",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 28,
19
+ "num_key_value_heads": 8,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": null,
22
+ "rope_theta": 1000000,
23
+ "sliding_window": null,
24
+ "tie_word_embeddings": true,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.51.3",
27
+ "use_cache": true,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151936
30
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.51.3"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7457ab2fe2401fcf9d0e503c89ae96efe6561e34a70ef69b228912ecb5aba18f
3
+ size 2384234968
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:352a863cd2761388ccc58f1432467ba6a1037bf12df9069889b142fa246471f6
3
+ size 11422752
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|endoftext|>",
233
+ "errors": "replace",
234
+ "extra_special_tokens": {},
235
+ "model_max_length": 131072,
236
+ "pad_token": "<|endoftext|>",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff